In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, log_loss
import itertools

In [2]:
# Import data
df0 = pd.read_stata('/Users/nbs/Documents/Georgetown/Semester 5/1 Courses/GBUS 401/1 Project/gbus_401_project/Data_Final/gbus_401_project_master.dta')

# Clean up
df = df0.replace(['False', 'True'], [0, 1])

# Convert year, school_id to dummies for TWFE
fe_catvars = df[['year', 'school_id']]
df = pd.get_dummies(df, prefix=['y', 'sid'], columns=['year', 'school_id'], drop_first=True) # First column is dropped to prevent collinearity
df = df.join(fe_catvars)

# Model 1: Linear Regression

In [40]:
# List of variables to use
varlist = []
varlist = ['admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl', 'year']

# Fixed-effect dummies
for i in list(df.columns):
    if ('sid_' in i) or ('y_' in i):
        varlist.append(i)

# Define dataset for Model 1
df1 = df[varlist]
df1 = df1.dropna(axis='index') # Drop missing

# Define features and outcome
y = df1[['admit']]
X = df1.drop(['admit', 'year'], axis=1)

In [36]:
# Define model
model = LinearRegression(n_jobs=-1)
model.fit(X, y)

# Predict admit
y_hat = np.array([i for i in model.predict(X)])

# Print outputs
print('Coefficients')
[print(a, ':', round(b, 3)) for a, b in zip(model.feature_names_in_[0:6], model.coef_.flatten()[0:6])]
print('')

print('Intercept:', round(model.intercept_.item(), 3), '\n')

print('Goodness of Fit')
print('Cross Entropy:', round(log_loss(y, y_hat), 3)) # issue caused by FE for some reason???
print('R^2', round(model.score(X, y), 3))
print('MSE:', round(mean_squared_error(y, y_hat), 3))

Coefficients
gpa : 0.372
lsat : 0.038
urm : 0.151
fee_waived : 0.047
non_trad : -0.012
intl : -0.055

Intercept: -6.04 

Goodness of Fit
Cross Entropy: 0.436
R^2 0.445
MSE: 0.12


In [12]:

features = ['gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']

# Get all combinations of features to test
feature_combos = []
for j in range(1, len(features) + 1):
    feature_combos.append(list(itertools.combinations(features, j)))
feature_combos = list(itertools.chain.from_iterable(feature_combos))
#print(list(feature_combos), sep='/n')

results = ''.join(str(item) for item in feature_combos)
print(results)

('gpa',)('lsat',)('urm',)('fee_waived',)('non_trad',)('intl',)('gpa', 'lsat')('gpa', 'urm')('gpa', 'fee_waived')('gpa', 'non_trad')('gpa', 'intl')('lsat', 'urm')('lsat', 'fee_waived')('lsat', 'non_trad')('lsat', 'intl')('urm', 'fee_waived')('urm', 'non_trad')('urm', 'intl')('fee_waived', 'non_trad')('fee_waived', 'intl')('non_trad', 'intl')('gpa', 'lsat', 'urm')('gpa', 'lsat', 'fee_waived')('gpa', 'lsat', 'non_trad')('gpa', 'lsat', 'intl')('gpa', 'urm', 'fee_waived')('gpa', 'urm', 'non_trad')('gpa', 'urm', 'intl')('gpa', 'fee_waived', 'non_trad')('gpa', 'fee_waived', 'intl')('gpa', 'non_trad', 'intl')('lsat', 'urm', 'fee_waived')('lsat', 'urm', 'non_trad')('lsat', 'urm', 'intl')('lsat', 'fee_waived', 'non_trad')('lsat', 'fee_waived', 'intl')('lsat', 'non_trad', 'intl')('urm', 'fee_waived', 'non_trad')('urm', 'fee_waived', 'intl')('urm', 'non_trad', 'intl')('fee_waived', 'non_trad', 'intl')('gpa', 'lsat', 'urm', 'fee_waived')('gpa', 'lsat', 'urm', 'non_trad')('gpa', 'lsat', 'urm', 'intl

In [1]:
# All possible features
features = ['gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']

# Get all combinations of features to test
feature_combos = []
for j in range(1, len(features) + 1):
    feature_combos.append(list(itertools.combinations(features, j)))
feature_combos = list(itertools.chain.from_iterable(feature_combos))
#print(type(list(feature_combos[0])))

# Get year and FEs
fes = df1.columns[7:].values.tolist()
#print(type(fes))

# Get year
y = ['admit']
#print(type(y))

NameError: name 'itertools' is not defined

In [30]:
# Cross validation; cite: # https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

mr2s = []
mentropies = []

for i in feature_combos:

    # Define data set
    i = list(i)

    flist = []

    for j in i:
        flist.append(i)

    for j in fes:
        flist.append(j)
    
    for j in y:
        flist.append(j)

    #print(flist)
    dfi = df1[flist]

    # Define X and y
    y = dfi[['admit']]
    X = dfi.drop(['admit', 'year'], axis=1)
    
    # Split data set
    year_list = sorted(dfi['year'].unique())
    splits = {'train': [], 'test': []}

    for j, year in enumerate(year_list[:-1]):

        train_year = year_list[:j + 1]
        test_year = [year_list[j + 1]]

        #print('Train:', train_year, 'Test:',test_year)
        
        splits['train'].append(dfi.loc[dfi.year.isin(train_year), :])
        splits['test'].append(dfi.loc[dfi.year.isin(test_year), :])

    # Estimate test statistics

    r2s = []
    entropies = []

    for k in range(len(year_list) - 1):

        X_train, X_test = splits['train'][k], splits['test'][k]
        y_train, y_test = splits['train'][k][['admit']], splits['test'][k][['admit']]

        model.fit(X_train, y_train)

        r2 = model.score(X_test, y_test)
        r2s.append(r2)

        y_test_hat = model.predict(X_test)
        #entropy = log_loss(y, y_test_hat) # WHY WON'T THIS WORK?????
        #entropies.append(entropy)

    mr2s.append(np.mean(r2s))
    #mentropies.append(np.mean(entropies))

In [37]:
print(mr2s)
print(np.max(mr2s))
print(mr2s.index(np.max(mr2s)))

# Clearly a mistake. What happened?

[0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396, 0.9997913849425396,

In [37]:
# TRIAL 2




# All possible features
features = ['gpa', 'lsat']

# Get all combinations of features to test
feature_combos = []
for j in range(0, len(features) + 1):
    feature_combos.append(list(itertools.combinations(features, j)))
feature_combos = list(itertools.chain.from_iterable(feature_combos))
feature_combos.remove(feature_combos[0])
print(feature_combos)

# Get year and FEs
fes = df1.columns[7:].values.tolist()
#print(type(fes))

# Split data set
year_list = sorted(df1['year'].unique())
splits = {'train': [], 'test': []}

for j, year in enumerate(year_list[:-1]):

    train_year = year_list[:j+1]
    test_year = [year_list[j+1]]

    print('Train:', train_year, 'Test:',test_year)
    
    splits['train'].append(df1.loc[df1.year.isin(train_year), :])
    splits['test'].append(df1.loc[df1.year.isin(test_year), :])

print(type(splits))

[('gpa',), ('lsat',), ('gpa', 'lsat')]
Train: [2004] Test: [2005]
Train: [2004, 2005] Test: [2006]
Train: [2004, 2005, 2006] Test: [2007]
Train: [2004, 2005, 2006, 2007] Test: [2008]
Train: [2004, 2005, 2006, 2007, 2008] Test: [2009]
Train: [2004, 2005, 2006, 2007, 2008, 2009] Test: [2010]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010] Test: [2011]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] Test: [2012]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012] Test: [2013]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013] Test: [2014]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014] Test: [2015]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015] Test: [2016]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016] Test: [2017]
Train: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017] Test: [2018]
Train: [2004, 2005, 2006, 2007, 2008,

In [33]:
a = splits['test'][5]['admit']
print(type(a))
print(len(a.index))

<class 'pandas.core.series.Series'>
22273


In [54]:
print(df1.columns)

features = df1.columns.tolist()
print(features[0:7])

print(r2s)

Index(['admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl', 'year',
       'y_2005', 'y_2006',
       ...
       'sid_3091300', 'sid_3173300', 'sid_3191300', 'sid_3374300',
       'sid_3559300', 'sid_3691400', 'sid_4096300', 'sid_4131400',
       'sid_4143500', 'sid_4242101'],
      dtype='object', length=228)
['admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']
[-9.598488531963925e+27, -3.914678749758287e+24, -1.5986446878362752e+24, -1.1991572161029628e+23, -6.937396698779049e+21, -1.519038839214575e+23, -9.107016786707644e+17, -3.3601078338024474e+21, 0.9999999924702316, 0.9999327515123093, 0.08549638677558768, 0.08549638677558768, 0.05451987464824737, 0.054085747022119435, 0.062288381402391235, 0.06648377830882635, 0.07122904437369748, 0.05944998432631954, 0.000717859316780034, -0.05036247072189304, -0.04214531827431278, -0.03983454941279341, 0.0029376016968991614, 0.033368426574371, 0.03250308732160723, 0.05525921034918568, 0.045262171490364045, -0.0783869

In [52]:
# Cross validation; cite: # https://stackoverflow.com/questions/58069691/how-to-create-a-train-test-split-of-time-series-data-by-year

mr2s = []
mentropies = []

model = LinearRegression(n_jobs=-1)

for j in range(len(year_list) - 1):

    X_train, X_test = splits['train'][j][features[1:7]], splits['test'][j][features[1:7]]

    y_train, y_test = splits['train'][j][features[0]], splits['test'][j][features[0]]

    model.fit(X_train, y_train)

    r2 = model.score(X_test, y_test)

    r2s.append(r2)

    y_test_hat = model.predict(X_test)
    #ntropy = log_loss(y, y_test_hat) # WHY WON'T THIS WORK?????
    #entropies.append(entropy)

mr2s.append(np.mean(r2s))
print('Features:', 'Mean R^2:', np.mean(r2s))
#mentropies.append(np.mean(entropies))

Features: school_id Mean R^2: -1.9600579537596502e+26


# Model 3: Decision Tree

See here for source: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py

In [None]:
# Split data into testing (25%) and training (75%)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Run decision tree
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Train decision tree using effective alphas
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)

# Remove trivial tree with one node
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]

# Plot maximum depth vs. alpha
max_depths = [clf.tree_.max_depth for clf in clfs]

fig1 = plt.figure(dpi=150)
plt.scatter(ccp_alphas, max_depths)
plt.plot(ccp_alphas,max_depths, drawstyle="steps-post")
plt.xlabel("Alpha")
plt.ylabel("Maximum Depth")
plt.title("Tree Depth Decreases as Alpha Increases")
plt.show()

# Plot accuracy vs. alpha
train_scores = [clf.score(X_train, y_train) for clf in clfs] # What is the score?
test_scores = [clf.score(X_test, y_test) for clf in clfs]

fig, ax = plt.subplots(dpi=150)
ax.set_xlabel("Alpha")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy and Alpha for Training and Testing Data")
ax.plot(ccp_alphas,train_scores,marker="o",label="Train",drawstyle="steps-post")
ax.plot(ccp_alphas,test_scores,marker="o",label="Test",drawstyle="steps-post")
ax.legend()
plt.show()