In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pyprojroot import here

In [2]:
# Resolve the CSV location with pyprojroot's `here`, then load it into a DataFrame.
evals_csv = here("pandas/data/evals-mod-adj.csv")
df = pd.read_csv(evals_csv)
# Last expression: list the available columns.
df.columns

Index(['Unnamed: 0', 'score', 'rank', 'ethnicity', 'gender', 'language', 'age',
       'cls_perc_eval', 'cls_did_eval', 'cls_students', 'cls_level',
       'cls_profs', 'cls_credits', 'bty_f1lower', 'bty_f1upper', 'bty_f2upper',
       'bty_m1lower', 'bty_m1upper', 'bty_m2upper', 'bty_avg',
       'tenure_eligible'],
      dtype='object')

In [3]:
# Simple linear regression: score explained by bty_avg plus an intercept.
# add_constant supplies the intercept column (shown as `const` in the summary).
X = sm.add_constant(df['bty_avg'])
model = sm.OLS(endog=df['score'], exog=X)
results_simple = model.fit()

In [4]:
# Show the full OLS summary for the regression of score on bty_avg (with intercept).
results_simple.summary()

0,1,2,3
Dep. Variable:,score,R-squared:,0.035
Model:,OLS,Adj. R-squared:,0.033
Method:,Least Squares,F-statistic:,16.73
Date:,"Mon, 22 Mar 2021",Prob (F-statistic):,5.08e-05
Time:,08:36:35,Log-Likelihood:,-366.22
No. Observations:,463,AIC:,736.4
Df Residuals:,461,BIC:,744.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.8803,0.076,50.961,0.000,3.731,4.030
bty_avg,0.0666,0.016,4.090,0.000,0.035,0.099

0,1,2,3
Omnibus:,33.007,Durbin-Watson:,1.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.796
Skew:,-0.709,Prob(JB):,3.76e-09
Kurtosis:,3.006,Cond. No.,14.9


In [5]:
# summary2() exposes its tables as a list; entry 1 is the coefficient table
# (Coef., Std.Err., t, P>|t|, confidence bounds).
simple_summary = results_simple.summary2()
LRresult = simple_summary.tables[1]
LRresult

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,3.880333,0.076143,50.960964,1.564021e-191,3.730702,4.029963
bty_avg,0.066637,0.016291,4.09044,5.081508e-05,0.034623,0.098651


In [6]:
# Adjusted R-squared of the single-predictor (bty_avg) model.
results_simple.rsquared_adj

0.03292999149286491

In [7]:
# One-hot encode gender into numeric indicator columns.
# dtype=int is requested explicitly: pandas >= 2.0 returns bool dummies by default,
# and a design matrix mixing float and bool columns is cast to object dtype,
# which statsmodels OLS rejects.
gender_dummies = pd.get_dummies(df['gender'], dtype=int)
# Select by name rather than relying on the positional order of the dummy columns,
# so 'female'/'male' can never be silently swapped (a missing level raises KeyError).
df[['female', 'male']] = gender_dummies[['female', 'male']]

In [8]:
# Multiple regression: score explained by bty_avg and the male indicator,
# with an intercept column added by add_constant.
X = sm.add_constant(df[['bty_avg', 'male']])
model = sm.OLS(endog=df['score'], exog=X)
results = model.fit()

In [9]:
# Take the coefficient table (entry 1 of summary2().tables) for the
# bty_avg + male model, then display the estimated coefficients.
full_summary = results.summary2()
result_table = full_summary.tables[1]
result_table['Coef.']

const      3.747335
bty_avg    0.074155
male       0.172389
Name: Coef., dtype: float64

In [10]:
# t-statistics for each coefficient in the bty_avg + male model.
result_table['t']

const      44.265879
bty_avg     4.562841
male        3.432607
Name: t, dtype: float64

Holding `bty_avg` constant, males receive a significantly higher score on average: the `male` coefficient is about 0.17 (t ≈ 3.43).

In [11]:
# Adjusted R-squared of the simple model (bty_avg only), shown again as the baseline.
results_simple.rsquared_adj

0.03292999149286491

In [12]:
# Adjusted R-squared of the model with bty_avg and male as predictors.
results.rsquared_adj

0.055032774550898944

Here we can see that adding gender as a predictor is helpful in predicting score: the adjusted R² rises from about 0.033 to about 0.055.

In [13]:
# Formula interface: rank is a string column, so it is expanded into
# treatment-coded indicator terms (rank[T.tenure track], rank[T.tenured]).
rank_formula = 'score ~ bty_avg + rank'
mod = smf.ols(rank_formula, data=df)
res = mod.fit()
# Display only the coefficient table of the fit.
res.summary2().tables[1]

Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,3.981539,0.09078,43.859135,2.9432189999999995e-166,3.803143,4.159936
rank[T.tenure track],-0.160697,0.073951,-2.173012,0.03029015,-0.306021,-0.015372
rank[T.tenured],-0.126219,0.062662,-2.014299,0.04456101,-0.249359,-0.00308
bty_avg,0.067825,0.01655,4.098296,4.921486e-05,0.035303,0.100347


rank, ethnicity, gender, language, age, cls_perc_eval, cls_did_eval, cls_students, cls_level, cls_profs, cls_credits, bty_avg

In [14]:
# Response (score) followed by the predictor columns kept for the next step.
# NOTE(review): the name suggests forward selection - confirm against later cells.
fwds_columns = [
    'score',
    'rank', 'ethnicity', 'gender', 'language', 'age',
    'cls_perc_eval', 'cls_did_eval', 'cls_students',
    'cls_level', 'cls_profs', 'cls_credits',
    'bty_avg',
]
fwds = df[fwds_columns]