In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, log_loss

In [2]:
# Import data
df0 = pd.read_stata('/Users/nbs/Documents/Georgetown/Semester 5/1 Courses/GBUS 401/1 Project/gbus_401_project_master.dta')

# Clean up
df = df0.replace(['False', 'True'], [0, 1])

# Convert year, school_id to dummies for TWFE
fe_vars = df[['school_id', 'year']]
df = pd.get_dummies(df, prefix=['y', 'sid'], columns=['year', 'school_id'], drop_first=True) # First column is dropped to prevent collinearity
df = df.join(fe_vars)

# Model 1: Linear Regression

In [3]:
# List of variables to use
varlist = ['admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl', 'year']

# Fixed-effect dummies
for i in list(df.columns):
    if ('sid_' in i) or ('y_' in i):
        varlist.append(i)

# Define dataset for Model 1
df1 = df[varlist]
df1 = df1.dropna(axis='index') # Drop missing

# Define features and outcome
y = df1[['admit']]
X = df1.drop(['admit', 'year'], axis=1)

In [4]:
# Define model
model = LinearRegression(n_jobs=-1)
model.fit(X, y)

# Predict admit
y_hat = np.array([i for i in model.predict(X)])

# Print outputs
print('Coefficients')
[print(a, ':', round(b, 3)) for a, b in zip(model.feature_names_in_[0:6], model.coef_.flatten()[0:6])]
print('')

print('Intercept:', round(model.intercept_.item(), 3), '\n')

print('Goodness of Fit')
print('Cross Entropy:', round(log_loss(y, y_hat), 3))
print('R^2', round(model.score(X, y), 3))
print('MSE:', round(mean_squared_error(y, y_hat), 3))

Coefficients
gpa : 0.372
lsat : 0.038
urm : 0.151
fee_waived : 0.047
non_trad : -0.012
intl : -0.055

Intercept: -6.04 

Goodness of Fit
Cross Entropy: 0.436
R^2 0.445
MSE: 0.12


In [5]:
# Cross validation; cite: # https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

# Split data set
year_list = sorted(df1['year'].unique())
splits = {'train': [], 'test': []}

for i, year in enumerate(year_list[:-1]):

    train_year = year_list[:i + 1]
    test_year = [year_list[i + 1]]

    # print('Train:', train_year, 'Test:',test_year)
    
    splits['train'].append(df1.loc[df1.year.isin(train_year), :])
    splits['test'].append(df1.loc[df1.year.isin(test_year), :])

In [14]:
# Run on test and train data sets

model = LinearRegression(n_jobs=-1)

r2s = []
entropies = []

for i in range(len(year_list) - 1):

    X_train, X_test = splits['train'][i][varlist], splits['test'][i][varlist]
    y_train, y_test = splits['train'][i][['admit']], splits['test'][i][['admit']]

    model.fit(X_train, y_train)

    r2 = model.score(X_test, y_test)
    r2s.append(r2)

    print('X_train:',len(X_train))
    print('X_test:',len(X_test))
    y_test_hat = model.predict(X_test)
    print(y_test_hat)
    entropy = log_loss(y, y_test_hat)
    entropies.append(entropy)

    print(year_list[i])

X_train: 10556
X_test: 18298
[[1.00002058e+00]
 [1.00002058e+00]
 [1.00002058e+00]
 ...
 [2.05786571e-05]
 [2.05786571e-05]
 [2.05786571e-05]]


ValueError: Found input variables with inconsistent numbers of samples: [18298, 373163]

In [8]:
print('R^2:', round(np.mean(r2s), 3))
print('Cross Entropy:', round(np.mean(entropies), 3))

R^2: 0.559
Cross Entropy: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Model 3: Decision Tree

See here for source: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py

In [None]:
# Split data into testing (25%) and training (75%)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Run decision tree
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Train decision tree using effective alphas
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

# Remove trivial tree with one node
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Plot maximum depth vs. alpha
max_depths = [clf.tree_.max_depth for clf in clfs]

fig1 = plt.figure(dpi=150)
plt.scatter(ccp_alphas, max_depths)
plt.plot(ccp_alphas,max_depths, drawstyle="steps-post")
plt.xlabel("Alpha")
plt.ylabel("Maximum Depth")
plt.title("Tree Depth Decreases as Alpha Increases")
plt.show()

# Plot accuracy vs. alpha
train_scores = [clf.score(X_train, y_train) for clf in clfs] # What is the score?
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots(dpi=150)
ax.set_xlabel("Alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy and Alpha for Training and Testing Data")
ax.plot(ccp_alphas,train_scores,marker="o",label="Train",drawstyle="steps-post")
ax.plot(ccp_alphas,test_scores,marker="o",label="Test",drawstyle="steps-post")
ax.legend()
plt.show()