In [1]:
# Setup
import warnings; warnings.simplefilter('ignore')
# set this to your working directory
WORKING_DIR = '/home/elliott/Dropbox/_Ash_Teaching/2018-09 - Bocconi - Text Data and ML/code'
import os
os.chdir(WORKING_DIR)
%matplotlib notebook

import pandas as pd
df1 = pd.read_csv('death-penalty-cases.csv')
Xraw = pd.read_pickle('X.pkl')
vocab = pd.read_pickle('vocab.pkl')

In [2]:
###
# OLS Regression
###

# list of words from our vectorizer; spaces inside n-grams are replaced by
# underscores so each entry can be used as a single term in a formula
vocab = [w.replace(' ', '_') for w in vocab]

# convert frequency counts to dataframe
# (toarray() yields a plain ndarray; todense() returns the legacy np.matrix type)
df4 = pd.DataFrame(Xraw.toarray(),
                   columns=vocab)

# import statsmodels package for R-like regression formulas
import statsmodels.formula.api as smf

# add metadata
df4['Y'] = df1['citeCount'] # cites to this opinion
# NOTE: the two "fixed effect" columns are attached to the frame but are not
# part of the regression formula used in the loop below.
df4['courtfe'] = df1['court_id']   # court fixed effect
df4['yearfe'] = df1['year']        # year fixed effect

# empty lists for t-statistics and coefficients
tstats, betas = [], []

for xvar in vocab: # loop through the words in vocab
    # Words containing a digit or the token 'hellip' are not regressed
    # (presumably because they are not usable as formula terms -- confirm);
    # they get a placeholder 0 so tstats/betas stay aligned with vocab.
    if any(c.isdigit() for c in xvar) or 'hellip' in xvar:
        tstats.append(0)
        betas.append(0)
        continue
    # univariate regression of citation count on the word's frequency
    model = smf.ols('Y ~ %s' % xvar, data=df4)
    result = model.fit()
    # Position 0 is the intercept, position 1 the slope on xvar. The result
    # Series are indexed by term name, so select by position with .iloc;
    # plain integer keys on a label-indexed Series are deprecated in pandas.
    tstats.append(result.tvalues.iloc[1])
    betas.append(result.params.iloc[1])

# save estimates
pd.to_pickle(tstats,'tstats.pkl')
pd.to_pickle(betas,'betas.pkl')

# zip up words and t-statistics
stats = list(zip(vocab,tstats))
stats.sort(key = lambda x: x[1], reverse=True) # sort by second item (tstats)
# show the ten most positive and the ten most negative t-statistics
stats[:10] + stats[-10:]

[('texas', 15.157060829812988),
 ('views', 14.162723136391637),
 ('penalty_quot', 13.987539799853783),
 ('death_penalty_quot', 13.882195791855306),
 ('death_penalty_law', 13.796881088968187),
 ('penalty_law', 13.786978075466232),
 ('vote_death_penalty', 13.200971205191937),
 ('vote_death', 13.198037293642914),
 ('vote', 12.90649508072566),
 ('views_death', 12.742941138262612),
 ('act', -12.866619826449174),
 ('aedpa', -12.908893663389573),
 ('antiterrorism', -13.023546041592605),
 ('antiterrorism_effective', -13.047299668279841),
 ('antiterrorism_effective_death', -13.047299668279841),
 ('death_penalty_act', -13.492546471661809),
 ('penalty_act', -13.492546471661809),
 ('effective', -13.662906539833477),
 ('effective_death', -14.225782953608991),
 ('effective_death_penalty', -14.225782953608991)]

In [3]:
# Overfitting
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.linear_model import LinearRegression

# Fix the random seed so the synthetic data -- and every score computed from
# it in the following cells -- is reproducible across kernel restarts.
np.random.seed(42)

# simulate m points from a noisy quadratic: y = 0.5*x^2 + x + 2 + N(0, 1)
m = 100
X = 6 * np.random.rand(m,1) - 3           # x uniform on [-3, 3), shape (m, 1)
y = 0.5 * X ** 2 + X + 2 + np.random.randn(m,1)
y = y.ravel()                             # flatten to 1-D as expected by sklearn

from sklearn.preprocessing import PolynomialFeatures
# degree-2 expansion; with a single input column this is just (1, x, x^2) --
# interaction terms only appear when there is more than one feature
poly_2 = PolynomialFeatures(degree=2)
X_poly_2 = poly_2.fit_transform(X)


# deliberately absurd degree, used in a later cell to demonstrate overfitting
poly_300 = PolynomialFeatures(degree=300)
X_poly_300 = poly_300.fit_transform(X)


# baseline: 3-fold cross-validated score of a straight-line fit on raw x
lin_reg = LinearRegression()
cross_val_score(lin_reg, X, y, cv=3, n_jobs=3).mean()

0.50905345213711373

In [4]:
# mean 3-fold CV score (estimator default, R^2 for LinearRegression) of the
# linear model fitted on the degree-2 polynomial features
cross_val_score(estimator=lin_reg, X=X_poly_2, y=y, cv=3, n_jobs=3).mean()

0.80327148960479178

In [5]:
# same evaluation on the degree-300 expansion: the model has far more columns
# than observations, so expect a very poor (possibly hugely negative) score
cross_val_score(estimator=lin_reg, X=X_poly_300, y=y, cv=3, n_jobs=3).mean()

-1.631660878146161e+17

In [17]:
# Lasso
from sklearn.linear_model import Lasso
# L1-penalised linear regression with penalty strength alpha
lasso_reg = Lasso(alpha=0.1)
# cv=3 is given explicitly so the score is comparable with the cv=3 linear
# baselines; the library default moved from 3 to 5 folds in scikit-learn 0.22.
cross_val_score(lasso_reg, X, y, cv=3).mean()

0.50889492877877351

In [16]:
# Ridge
# (SGDRegressor is imported here but not used in this cell)
from sklearn.linear_model import Ridge, SGDRegressor
# L2-penalised linear regression with penalty strength alpha
ridge_reg = Ridge(alpha=1)
# cv=3 is given explicitly so the score is comparable with the cv=3 linear
# baselines; the library default moved from 3 to 5 folds in scikit-learn 0.22.
cross_val_score(ridge_reg, X, y, cv=3).mean()

0.50917225428549584

In [14]:
###
# Elastic Net
###
from sklearn.linear_model import ElasticNetCV
# Grid-search the penalty strength (alpha) and the L1/L2 mix (l1_ratio) by
# cross-validation. cv=3 is pinned so the selected hyper-parameters do not
# depend on the installed scikit-learn version (the default number of folds
# changed from 3 to 5 in 0.22) and match the 3-fold setup used elsewhere.
enet_reg = ElasticNetCV(alphas=[.0001, .001, .01, .1, 1],
                        l1_ratio=[.0001, .001, .01, .1, .5, .9, .99, 1],
                        cv=3)
enet_reg.fit(X,y)
# hyper-parameters chosen by the internal cross-validation
enet_reg.alpha_, enet_reg.l1_ratio_

(0.01, 0.0001)

In [15]:
# Outer cross-validation of the self-tuning elastic net (the estimator re-runs
# its own hyper-parameter search inside every training fold). cv=3 is given
# explicitly to match the cv=3 linear baselines; the library default moved
# from 3 to 5 folds in scikit-learn 0.22.
cross_val_score(enet_reg, X, y, cv=3).mean()

0.50930439262658389

In [18]:
# Scaling with Sparsity
from sklearn.preprocessing import StandardScaler
# with_mean=False skips centering, so only the per-column scale is changed
# and zero entries of the sparse matrix stay zero
sparse_scaler = StandardScaler(with_mean=False)
sparse_scaler.fit(Xraw)
X_sparse = sparse_scaler.transform(Xraw)



In [11]:
# display the repr of the scaled matrix (shape, dtype, number of stored entries)
X_sparse

<32567x472 sparse matrix of type '<class 'numpy.float64'>'
	with 460029 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the default L2 penalty (C=1);
# class_weight='balanced' reweights classes inversely to their frequency.
logistic = LogisticRegression(class_weight='balanced', C=1)

# 3-fold cross-validation predicting the 'state' column from the scaled word
# frequencies, restricted to the first 1000 documents
scores = cross_val_score(estimator=logistic,
                         X=X_sparse[:1000],
                         y=df1['state'][:1000],
                         n_jobs=3,
                         cv=3)

# mean and spread of the per-fold scores
(scores.mean(), scores.std())

(0.4130743391509582, 0.0073986758019537807)