In [27]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, log_loss
import itertools

In [28]:
# Import data
df = pd.read_stata('/Users/nbs/Documents/Georgetown/Semester 5/1 Courses/GBUS 401/1 Project/gbus_401_project/Data_Final/gbus_401_project_master.dta')

# Extract variables of interest
df = df[['year', 'school_id', 'admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']]

# Drop observations with missing variables
df = df.dropna(axis='index') # Drop missing

# Convert year, school_id to dummies for TWFE
df_year = df[['year']]
df = pd.get_dummies(df, prefix=['y', 'sid'], columns=['year', 'school_id'], drop_first=True) # First column is dropped to prevent collinearity
df = df.join(df_year)

# Clean up
df = df.replace(['False', 'True'], [0, 1])

# Logit

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, accuracy_score


predicted = cross_validation.cross_val_predict(LogisticRegression(), X, y, cv=10)
print metrics.accuracy_score(y, predicted)

# Decision Tree

In [29]:
# Define features and outcome
y = df[['admit']]
X = df.drop(['admit'], axis=1)

In [30]:
# Split data into testing (25%) and training (75%)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Run decision tree
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Train decision tree using effective alphas
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

# Remove trivial tree with one node
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Plot maximum depth vs. alpha
max_depths = [clf.tree_.max_depth for clf in clfs]

fig1 = plt.figure(dpi=150)
plt.scatter(ccp_alphas, max_depths)
plt.plot(ccp_alphas,max_depths, drawstyle="steps-post")
plt.xlabel("Alpha")
plt.ylabel("Maximum Depth")
plt.title("Tree Depth Decreases as Alpha Increases")
plt.show()

# Plot accuracy vs. alpha
train_scores = [clf.score(X_train, y_train) for clf in clfs] # What is the score?
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots(dpi=150)
ax.set_xlabel("Alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy and Alpha for Training and Testing Data")
ax.plot(ccp_alphas,train_scores,marker="o",label="Train",drawstyle="steps-post")
ax.plot(ccp_alphas,test_scores,marker="o",label="Test",drawstyle="steps-post")
ax.legend()
plt.show()

KeyboardInterrupt: 

# Model 1: Linear Regression

In [3]:
# Define features and outcome
y = df[['admit']]
X = df.drop(['admit'], axis=1)

In [4]:
# Define model
model = LinearRegression(n_jobs=-1)
model.fit(X, y)

# Predict admit
y_hat = np.array([i for i in model.predict(X)])

# Print outputs
print('Coefficients')
[print(a, ':', round(b, 3)) for a, b in zip(model.feature_names_in_[0:6], model.coef_.flatten()[0:6])]
print('')

print('Intercept:', round(model.intercept_.item(), 3), '\n')

print('Goodness of Fit')
print('Cross Entropy:', round(log_loss(y, y_hat), 3)) # issue caused by FE for some reason???
print('R^2', round(model.score(X, y), 3))
print('MSE:', round(mean_squared_error(y, y_hat), 3))

Coefficients
gpa : 0.372
lsat : 0.038
urm : 0.151
fee_waived : 0.047
non_trad : -0.012
intl : -0.055

Intercept: -531490359818.616 

Goodness of Fit
Cross Entropy: 0.436
R^2 0.445
MSE: 0.12


In [23]:
# Convert to time series
pd.to_datetime(df.year).dt.year
df.set_index('year', inplace=True)
df.sort_index(inplace=True)

# Define X and y
X = df.drop(labels=['admit'], axis=1)
y = df[['admit']]

tss = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)

In [7]:
model = LinearRegression(n_jobs=-1)
model.fit(X, y)

i = 1
for train_index, test_index in tss.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    print(f"Accuracy for the fold no. {i} on the test set: {model.score(X, y)}")

TRAIN: [    0     1     2 ... 62195 62196 62197] TEST: [ 62198  62199  62200 ... 124388 124389 124390]
Accuracy for the fold no. 1 on the test set: -1.1213709733116754e+21
TRAIN: [     0      1      2 ... 124388 124389 124390] TEST: [124391 124392 124393 ... 186581 186582 186583]
Accuracy for the fold no. 1 on the test set: -2.9152024465258257e+20
TRAIN: [     0      1      2 ... 186581 186582 186583] TEST: [186584 186585 186586 ... 248774 248775 248776]
Accuracy for the fold no. 1 on the test set: -1.7296321942309955e+20
TRAIN: [     0      1      2 ... 248774 248775 248776] TEST: [248777 248778 248779 ... 310967 310968 310969]
Accuracy for the fold no. 1 on the test set: -3.964797203474756e+19
TRAIN: [     0      1      2 ... 310967 310968 310969] TEST: [310970 310971 310972 ... 373160 373161 373162]
Accuracy for the fold no. 1 on the test set: 0.4358899400976498


In [25]:
from sklearn.model_selection import cross_validate

X = df[['gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']]
y = df[['admit']]

model = LinearRegression(n_jobs=-1)
# tss = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)

scoring=('r2', 'neg_mean_squared_error')

cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
sorted(cv_results.keys())

##

['fit_time', 'score_time', 'test_neg_mean_squared_error', 'test_r2']

In [26]:
cv_results['test_r2']

array([ 0.03932869,  0.06656639, -0.04264215,  0.02885161, -0.05557259])

In [71]:
from sklearn.model_selection import cross_val_score
lr = LinearRegression()
np.mean(cross_val_score(lr, X, y, cv=3))

-3.73767919253277e+20