In [203]:
%matplotlib inline
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
import re

from sklearn.cross_validation import train_test_split

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import mean_squared_error

import statsmodels.formula.api as smf

In [182]:
# Read the main data file; header=0 takes the column names from its first row.
scorecard_csv = 'MERGED2011_PP.csv'
df = pd.read_csv(scorecard_csv, header=0)

In [183]:
# Read the data dictionary (one row per documented entry; first row = column names).
dictionary_csv = 'CollegeScorecardDataDictionary-09-12-2015.csv'
dd = pd.read_csv(dictionary_csv, header=0)

In [184]:
# Number of dictionary rows under each 'dev-category' label.
dev_category = dd['dev-category']
dev_category.value_counts()

completion    1023
academics      228
school         165
repayment      130
student         94
earnings        73
cost            65
aid             40
admissions      25
root             5
dtype: int64

In [185]:
# Number of dictionary rows attributed to each value of the 'SOURCE' column.
source_col = dd['SOURCE']
source_col.value_counts()

NSLDS       1179
IPEDS        433
Treasury     105
FSA            3
dtype: int64

In [186]:
# Variable names that the dictionary files under the 'earnings' category.
is_earnings = dd['dev-category'] == 'earnings'
earning_cols = dd.loc[is_earnings, 'VARIABLE NAME']
print(len(earning_cols))

73


In [187]:
# Variable names that the dictionary files under the 'repayment' category.
is_repayment = dd['dev-category'] == 'repayment'
repayment_cols = dd.loc[is_repayment, 'VARIABLE NAME']
print(len(repayment_cols))

130


In [188]:
# Columns that are always dropped.  The first entry is 'UNITID' preceded by the
# three bytes \xef\xbb\xbf (the UTF-8 byte-order mark); presumably that is how
# the first CSV header is named after a plain read_csv -- confirm against df.columns.
removed_cols = ['\xef\xbb\xbfUNITID', 'OPEID','opeid6', 'ZIP', 'INSTNM', 'CITY', 'sch_deg', 'st_fips']

total_len = df.shape[0]
# A column is discarded when more than this share of its rows is unusable.
max_bad_fraction = 1 / 4.0

for col in df.columns:
    # Unusable = missing, plus (for object columns) the literal marker
    # 'PrivacySuppressed'.  Series.sum() adds up the boolean masks inside
    # pandas instead of the builtin sum() visiting every element in Python.
    bad_count = df[col].isnull().sum()
    if df[col].dtype == 'object':
        bad_count += (df[col] == 'PrivacySuppressed').sum()

    if bad_count > total_len * max_bad_fraction:
        removed_cols.append(col)

print(len(removed_cols))

1348


In [189]:
# De-duplicate the columns flagged so far and add every earnings and repayment
# variable named in the data dictionary.
removed_cols_set = (
    set(removed_cols)
    | set(earning_cols.values)
    | set(repayment_cols.values)
)

print(len(removed_cols_set))

1393


In [190]:
# Columns rescued from removal: ADM_RATE and the two *_earn_wne_p10 columns
# (a later cell pulls those two out as Y1 and Y2).
cols_to_include = {'ADM_RATE', 'mn_earn_wne_p10','md_earn_wne_p10'}
removed_cols_set = removed_cols_set.difference(cols_to_include)
print(len(removed_cols_set))

1390


In [191]:
# Materialise the final removal set as a list and drop those columns from df.
removed_cols = list(removed_cols_set)
df = df.drop(removed_cols, axis=1)
print(df.shape)

(7675, 339)


In [192]:
# Keep only the rows that have no missing value in any remaining column.
df = df.dropna()
print(df.shape)

(2136, 339)


In [193]:
def is_privacy_surpressed(row):
    """Return True when any value in ``row`` equals 'PrivacySuppressed'.

    Parameters
    ----------
    row : pandas.Series
        One record; iterating it yields the cell values.  This is what
        ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    bool
        True if at least one value is the string 'PrivacySuppressed',
        False otherwise (including for an empty row).
    """
    # Iterate the values directly: the labels were never used, and this avoids
    # Series.iteritems(), which recent pandas releases no longer provide.
    return any(value == 'PrivacySuppressed' for value in row)
    
# Boolean mask, one entry per row: True where the row contains the
# 'PrivacySuppressed' marker.  Only the unflagged rows are kept.
privacy_surpressed = df.apply(is_privacy_surpressed, axis=1)
df = df.loc[~privacy_surpressed]
print(df.shape)

(854, 339)


In [194]:
# Collect categorical variable names from the data dictionary.  A row whose
# 'NAME OF DATA ELEMENT' is a float NaN is treated as belonging to the most
# recent row that did have a name, and that earlier row's variable is recorded.
# (Presumably such rows list the coded values of the variable -- confirm
# against the dictionary file.)  'STABBR' is included unconditionally.
cat_vars = {'STABBR'}
previous_var = None
for index, row in dd.iterrows():
    element_name = row['NAME OF DATA ELEMENT']
    is_continuation = (type(element_name) == float) and np.isnan(element_name)
    if not is_continuation:
        # A named row starts a new variable; remember it for the rows that follow.
        previous_var = row['VARIABLE NAME']
        continue
    cat_vars.add(previous_var)

print(len(cat_vars))
cat_vars

24


{'AANAPII',
 'ANNHI',
 'CCBASIC',
 'CCSIZSET',
 'CCUGPROF',
 'CONTROL',
 'CURROPER',
 'DISTANCEONLY',
 'HBCU',
 'HIGHDEG',
 'HSI',
 'LOCALE',
 'MENONLY',
 'NANTI',
 'PBI',
 'PREDDEG',
 'RELAFFIL',
 'STABBR',
 'TRIBAL',
 'WOMENONLY',
 'locale2',
 'main',
 'region',
 'st_fips'}

In [195]:
# Categorical variables that are still present as columns of df.
for col in cat_vars & set(df.columns):
    print(col)

CONTROL
STABBR
DISTANCEONLY
PREDDEG
HIGHDEG
region
CURROPER
main


In [196]:
# One-hot encode each categorical column still in df: the indicator columns
# (named '<col>_<value>') are appended on the right and the original column
# is removed.
for col in cat_vars & set(df.columns):
    dummies = pd.get_dummies(df[col], prefix=col)
    df = pd.concat([df.drop(col, axis=1), dummies], axis=1)

print(df.shape)

(854, 408)


In [248]:
# Work on a numeric copy of df (convert_numeric=True asks pandas to coerce
# object columns to numbers), then discard columns whose standard deviation
# is exactly zero.
df_numeric = df.copy().convert_objects(convert_numeric=True)
col_std = df_numeric.std()
constant_cols = col_std[col_std == 0].index.values
df_numeric = df_numeric.drop(constant_cols, axis=1)

# Pull out the two candidate targets before removing them from the features.
target_cols = ['mn_earn_wne_p10','md_earn_wne_p10']
Y1 = df_numeric['mn_earn_wne_p10'].values
Y2 = df_numeric['md_earn_wne_p10'].values
df_numeric = df_numeric.drop(target_cols, axis=1)

# Standardise every remaining feature column.
# NOTE(review): the z-scores are computed over all rows, before the later
# train/test split, so the held-out rows contribute to the scaling -- confirm
# this is acceptable.
X = ss.zscore(df_numeric)
print(X.shape)

(854, 402)


In [261]:
# Hold out 30% of the rows for testing, using Y2 as the target; the fixed
# random_state keeps the split the same from run to run.
split = train_test_split(X, Y2, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = split

In [254]:
# Baseline: ordinary least squares fitted on the training rows, scored by
# mean squared error on the held-out rows.
vanilla_lr = LinearRegression().fit(X_train, Y_train)
Y_pred = vanilla_lr.predict(X_test)
vanilla_mse = mean_squared_error(Y_test, Y_pred)
print(vanilla_mse)

53181833.606


In [265]:
# Grid search over the Ridge penalty strength: alpha = 2**-10, ..., 2**19.
alphas = [2**i for i in range(-10,20)]
gs_params = {'alpha': alphas}
gc = GridSearchCV(estimator=Ridge(), param_grid=gs_params)
ridge_model = gc.fit(X_train, Y_train)

# Score the fitted search object on the held-out rows.
Y_pred = ridge_model.predict(X_test)
ridge_mse = mean_squared_error(Y_test, Y_pred)
print(ridge_mse)

15759644.2889


In [264]:
# Grid search over the Lasso penalty strength: alpha = 2**-10, ..., 2**19.
alphas = [2**i for i in range(-10,20)]
gs_params = {'alpha': alphas}
gc = GridSearchCV(estimator=Lasso(), param_grid=gs_params)
lasso_model = gc.fit(X_train, Y_train)

# Score the fitted search object on the held-out rows.
Y_pred = lasso_model.predict(X_test)
lasso_mse = mean_squared_error(Y_test, Y_pred)
print(lasso_mse)

15461008.1617


In [180]:
# Columns of df whose name contains 'earn', ignoring case.
# NOTE(review): this cell's execution count (180) is lower than that of the
# cell that loads df (182), so the recorded output may not reflect the current
# df -- re-run to confirm.
[col for col in df.columns if 'earn' in col.lower()]

[]

In [118]:
# Is there any column whose name contains 'UNITID'?
# NOTE(review): this cell's execution count (118) is lower than that of the
# cell that loads df (182), so the recorded output may not reflect the current
# df -- re-run to confirm.
any('UNITID' in col for col in df.columns)

True

In [28]:
# Sorted list of the columns whose name ends in 'p' followed by one digit.
# NOTE(review): r'p\d$' accepts exactly one digit after the 'p', so names
# ending in 'p10' are not listed -- confirm that is intended.
# NOTE(review): this cell's execution count (28) is lower than that of the
# cell that loads df (182), so the recorded output may not reflect the current
# df -- re-run to confirm.
x = sorted(col for col in df.columns if re.search(r'p\d$', col))
print(len(x))
for m in x:
    print(m)

46
count_nwne_p6
count_nwne_p7
count_nwne_p8
count_nwne_p9
count_wne_inc1_p6
count_wne_inc2_p6
count_wne_inc3_p6
count_wne_indep0_inc1_p6
count_wne_indep0_p6
count_wne_indep1_p6
count_wne_male0_p6
count_wne_male1_p6
count_wne_p6
count_wne_p7
count_wne_p8
count_wne_p9
gt_25k_p6
gt_25k_p7
gt_25k_p8
gt_25k_p9
md_earn_wne_p6
md_earn_wne_p8
mn_earn_wne_inc1_p6
mn_earn_wne_inc2_p6
mn_earn_wne_inc3_p6
mn_earn_wne_indep0_inc1_p6
mn_earn_wne_indep0_p6
mn_earn_wne_indep1_p6
mn_earn_wne_male0_p6
mn_earn_wne_male1_p6
mn_earn_wne_p6
mn_earn_wne_p7
mn_earn_wne_p8
mn_earn_wne_p9
pct10_earn_wne_p6
pct10_earn_wne_p8
pct25_earn_wne_p6
pct25_earn_wne_p8
pct75_earn_wne_p6
pct75_earn_wne_p8
pct90_earn_wne_p6
pct90_earn_wne_p8
sd_earn_wne_p6
sd_earn_wne_p7
sd_earn_wne_p8
sd_earn_wne_p9
