In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, log_loss
import itertools
from sklearn.metrics import r2_score

In [12]:
# Import data
df = pd.read_stata('/Users/nbs/Documents/Georgetown/Semester 5/1 Courses/GBUS 401/1 Project/gbus_401_project/Data_Final/gbus_401_project_master.dta')

# Extract variables of interest
df = df[['year', 'school_id', 'admit', 'gpa', 'lsat', 'urm', 'fee_waived', 'non_trad', 'intl']]

# Drop missing observations
df = df.dropna(axis='index') # Drop missing
df = df[df.year > 2010]
df = df[df.year < 2023]

# Convert year, school_id to dummies for TWFE
df_year = df[['year']]
df = pd.get_dummies(df, prefix=['y', 'sid'], columns=['year', 'school_id'], drop_first=True) # First column is dropped to prevent collinearity
df = df.join(df_year)

# Clean up
df = df.replace(['False', 'True'], [0, 1])

# Linear Regression

In [13]:
# Define features and outcome
y = df[['admit']]
X = df.drop(['admit'], axis=1)

### Run Model on Full Data Set

In [14]:
# Define model
model = LinearRegression(n_jobs=-1)
model.fit(X, y)

# Predict admit
y_hat = np.array([i for i in model.predict(X)])

# Print outputs
print('Coefficients')
[print(a, ':', round(b, 3)) for a, b in zip(model.feature_names_in_[0:6], model.coef_.flatten()[0:6])]
print('')

print('Intercept:', round(model.intercept_.item(), 3), '\n')

print('Goodness of Fit')
print('Cross Entropy:', round(log_loss(y, y_hat), 3)) # issue caused by FE for some reason???
print('R^2', round(model.score(X, y), 3))
print('MSE:', round(mean_squared_error(y, y_hat), 3))

Coefficients
gpa : 0.376
lsat : 0.034
urm : 0.134
fee_waived : 0.054
non_trad : -0.018
intl : -0.068

Intercept: -2300349703.156 

Goodness of Fit
Cross Entropy: 0.411
R^2 0.45
MSE: 0.114


### Cross Validation

In [15]:
# Convert to time series
pd.to_datetime(df.year).dt.year
df.set_index('year', inplace=True)
df.sort_index(inplace=True)

# Define X and y
X = df.drop(labels=['admit'], axis=1)
y = df[['admit']]

tss = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)

In [16]:
model = LinearRegression(n_jobs=-1)

for i, (train_index, test_index) in enumerate(tss.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    print('Score:', model.score(X_test, y_test))

    y_test_hat = model.predict(X_test)
    print('R2:', r2_score(y_test, y_test_hat))

Fold 0:
  Train: index=[    0     1     2 ... 39277 39278 39279]
  Test:  index=[39280 39281 39282 ... 78555 78556 78557]
Score: -7.699101446533047e+21
R2: -7.699101446533047e+21
Fold 1:
  Train: index=[    0     1     2 ... 78555 78556 78557]
  Test:  index=[ 78558  78559  78560 ... 117833 117834 117835]
Score: -3.687961752335117e+17
R2: -3.687961752335117e+17
Fold 2:
  Train: index=[     0      1      2 ... 117833 117834 117835]
  Test:  index=[117836 117837 117838 ... 157111 157112 157113]
Score: -4.015204357651543e+18
R2: -4.015204357651543e+18
Fold 3:
  Train: index=[     0      1      2 ... 157111 157112 157113]
  Test:  index=[157114 157115 157116 ... 196389 196390 196391]
Score: -3.924897823835091e+20
R2: -3.924897823835091e+20
Fold 4:
  Train: index=[     0      1      2 ... 196389 196390 196391]
  Test:  index=[196392 196393 196394 ... 235667 235668 235669]
Score: 0.4084778634542571
R2: 0.4084778634542571
