In [17]:
from sklearn.linear_model import LogisticRegression 
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import pandas as pd
import numpy as np 

In [18]:
# Load the cleaned GSS extract and keep only survey years after 2005.
data = pd.read_csv('../Data/gss_subset_cleaned.csv')
data = data.loc[data['year'] > 2005]

# Remove the columns the original analysis labels subjective or unusable, then
# discard every row that still has a missing value. Each step returns a new
# frame instead of mutating the filtered selection in place, which avoids
# pandas' chained-assignment (SettingWithCopy) ambiguity.
data = (
    data.drop(['paeduc', 'maeduc', 'speduc', 'income', 'satjob', 'satfin', 'goodlife',
               'health', 'year', 'hompop', 'earnrs', 'weekswrk', 'polviews'], axis=1)
        .dropna()
)

In [19]:
# Build the design matrix and the target.
# X: every remaining column except 'happy', one-hot encoded; drop_first=True
#    omits the first level of each encoded column.
# y: boolean Series, True where 'happy' equals 1. The original author's note
#    says this marks an *unhappy* respondent.
#    NOTE(review): confirm the coding of 'happy' in the cleaned file.
X = pd.get_dummies(data.drop(labels='happy', axis=1), drop_first=True)
y = data['happy'].eq(1)

In [43]:
# Search space for the grid search:
#   C             - powers of ten from 10^-2 through 10^2 (five values)
#   fit_intercept - with and without an intercept term
#   solver        - four LogisticRegression solvers
C_range = np.power(10.0, np.arange(-2, 3))
param_grid = dict(
    C=C_range,
    fit_intercept=[True, False],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag'],
)

In [44]:
# Base estimator handed to the grid search; C, fit_intercept and solver are
# overridden per candidate. random_state is pinned because the search space
# includes the 'sag' solver, which shuffles the data using the estimator's
# random state -- without a seed the cross-validated scores vary between runs.
log_reg = LogisticRegression(n_jobs=-1, random_state=0)

In [45]:
# Exhaustive search over param_grid, scoring each candidate by ROC AUC with
# 10-fold cross-validation. The fit call is left as the cell's last expression
# so the fitted search object is displayed.
grid = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=10,
)
grid.fit(X, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02]), 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [47]:
# Show the best estimator found by the search, then the per-candidate
# cross-validation summary (mean, std and params for each grid point).
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(grid.best_estimator_)
grid.grid_scores_

LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
          fit_intercept=False, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


[mean: 0.69141, std: 0.02062, params: {'C': 0.01, 'solver': 'newton-cg', 'fit_intercept': True},
 mean: 0.69145, std: 0.02079, params: {'C': 0.01, 'solver': 'lbfgs', 'fit_intercept': True},
 mean: 0.69135, std: 0.02076, params: {'C': 0.01, 'solver': 'liblinear', 'fit_intercept': True},
 mean: 0.69135, std: 0.02072, params: {'C': 0.01, 'solver': 'sag', 'fit_intercept': True},
 mean: 0.69132, std: 0.02077, params: {'C': 0.01, 'solver': 'newton-cg', 'fit_intercept': False},
 mean: 0.69133, std: 0.02074, params: {'C': 0.01, 'solver': 'lbfgs', 'fit_intercept': False},
 mean: 0.69131, std: 0.02077, params: {'C': 0.01, 'solver': 'liblinear', 'fit_intercept': False},
 mean: 0.69131, std: 0.02075, params: {'C': 0.01, 'solver': 'sag', 'fit_intercept': False},
 mean: 0.69310, std: 0.02279, params: {'C': 0.10000000000000001, 'solver': 'newton-cg', 'fit_intercept': True},
 mean: 0.69324, std: 0.02279, params: {'C': 0.10000000000000001, 'solver': 'lbfgs', 'fit_intercept': True},
 mean: 0.69321, std: