In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, log_loss

In [None]:
# Load the master dataset (Stata .dta file)
df0 = pd.read_stata('/Users/justinpotisit/Documents/GitHub/gbus_401_project/Data_Intermediate/gbus_401_project_master.dta')

# Recode the string flags 'False'/'True' as 0/1
df = df0.replace({'False': 0, 'True': 1})

# Keep a copy of the raw identifiers: get_dummies replaces the encoded
# columns, but 'year' is still needed later to split the data by year.
fe_vars = df.loc[:, ['school_id', 'year']]

# Two-way fixed effects (TWFE): one dummy per year ('y_*') and per school
# ('sid_*'). drop_first omits one level of each to prevent collinearity.
df = pd.get_dummies(
    df,
    columns=['year', 'school_id'],
    prefix=['y', 'sid'],
    drop_first=True,
).join(fe_vars)

# Model 1: Linear Regression

In [None]:
# Outcome ('admit') followed by the applicant-level covariates
varlist = ['admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']

# Fixed-effect dummies: select columns by their get_dummies prefix.
# A substring test ('y_' in name) would also pick up any unrelated column
# whose name merely contains 'y_' or 'sid_', silently adding it as a regressor.
varlist += [col for col in df.columns if col.startswith(('sid_', 'y_'))]

# Dataset for Model 1: keep only complete cases on the selected variables
df1 = df[varlist].dropna(axis='index')

# Features are every selected column except the outcome.
# y is kept as a one-column DataFrame (not a Series).
X = df1.drop(columns='admit')
y = df1[['admit']]

In [None]:
# Fit ordinary least squares of admit on the covariates and fixed-effect dummies
model = LinearRegression(n_jobs=-1).fit(X, y)

# In-sample fitted values. predict() already returns an ndarray, so no
# element-by-element copy is needed.
y_hat = model.predict(X)

# Print outputs
print('Coefficients')
# Only the first six coefficients are shown, i.e. the covariates that precede
# the fixed-effect dummies in X.
for name, coef in zip(model.feature_names_in_[0:6], model.coef_.flatten()[0:6]):
    print(name, ':', round(coef, 3))
print('')

print('Intercept:', round(model.intercept_.item(), 3), '\n')

# Linear-regression predictions are not bounded to [0, 1], but cross entropy is
# only defined for probabilities. Clip explicitly before scoring instead of
# relying on log_loss to do it.
# NOTE(review): recent scikit-learn releases appear to reject out-of-range
# values rather than clip them - confirm against the installed version.
# 1e-15 matches the historical default clipping used by log_loss.
eps = 1e-15
y_prob = np.clip(y_hat, eps, 1 - eps)

# NOTE: every metric below is computed on the same data the model was fit on.
print('Goodness of Fit')
print('Cross Entropy:', round(log_loss(y, y_prob), 3))
print('R^2', round(model.score(X, y), 3))
print('MSE:', round(mean_squared_error(y, y_hat), 3))

Coefficients
gpa : 0.372
lsat : 0.038
urm : 0.151
fee_waived : 0.047
non_trad : -0.012
intl : -0.055

Intercept: -6.026 

Goodness of Fit
Cross Entropy: 0.436
R^2 0.445
MSE: 0.12


In [None]:
# Cross validation: expanding-window splits by year.
# Each fold trains on every year up to a cutoff and tests on the year that
# immediately follows it.
# Approach adapted from:
# https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

year_list = sorted(df['year'].unique())
splits = {'train': [], 'test': []}

for i in range(len(year_list) - 1):

    # Years 0..i form the training window; year i+1 is held out
    train_year = year_list[:i + 1]
    test_year = [year_list[i + 1]]

    print('Train:', train_year, 'Test:',test_year)

    in_train = df['year'].isin(train_year)
    in_test = df['year'].isin(test_year)

    splits['train'].append(df[in_train])
    splits['test'].append(df[in_test])

Train: [2004] Test: [2005]
Train: [2004, 2005] Test: [2006]
Train: [2004, 2005, 2006] Test: [2007]
Train: [2004, 2005, 2006, 2007] Test: [2008]
Train: [2004, 2005, 2006, 2007, 2008] Test: [2009]
Train: [2004, 2005, 2006, 2007, 2008, 2009] Test: [2010]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010] Test: [2011]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] Test: [2012]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012] Test: [2013]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013] Test: [2014]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014] Test: [2015]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015] Test: [2016]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016] Test: [2017]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017] Test: [2018]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 20

# Model 3: Decision Tree

The code below is adapted from the scikit-learn cost-complexity pruning example: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py

In [None]:
# Hold out a test set: 25% testing, 75% training (train_test_split default)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Compute the cost-complexity pruning path on the training data
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Fit one tree per effective alpha along the pruning path
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(X_train, y_train)
    clfs.append(clf)

# Drop the last entry: the trivial tree with one node
clfs, ccp_alphas = clfs[:-1], ccp_alphas[:-1]

# Plot maximum depth vs. alpha
max_depths = [fitted.tree_.max_depth for fitted in clfs]

fig1, ax1 = plt.subplots(dpi=150)
ax1.scatter(ccp_alphas, max_depths)
ax1.plot(ccp_alphas, max_depths, drawstyle="steps-post")
ax1.set_xlabel("Alpha")
ax1.set_ylabel("Maximum Depth")
ax1.set_title("Tree Depth Decreases as Alpha Increases")
plt.show()

# Plot accuracy vs. alpha.
# For a classifier, score() is the mean accuracy on the given data and labels.
train_scores = [fitted.score(X_train, y_train) for fitted in clfs]
test_scores = [fitted.score(X_test, y_test) for fitted in clfs]

fig, ax = plt.subplots(dpi=150)
ax.set(
    xlabel="Alpha",
    ylabel="Accuracy",
    title="Accuracy and Alpha for Training and Testing Data",
)
for label, scores in (("Train", train_scores), ("Test", test_scores)):
    ax.plot(ccp_alphas, scores, marker="o", label=label, drawstyle="steps-post")
ax.legend()
plt.show()