In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

In [2]:
# import data and format it
data = pd.read_csv('../Data/gss_subset_cleaned.csv')
data = data[data['year']> 2005]
# drop highly correlated columns and unusable columns identified in earlier analysis
data.drop(['paeduc', 'maeduc', 'speduc', 'income', 'satjob', 'goodlife', 
           'health', 'year', 'hompop', 'earnrs'], axis=1, inplace=True)
# drop less important features identified through random forest model, and drop subjective features
data.drop(['babies', 'preteen', 'teens', 'divorce', 'dwelling', 'sex', 
           'satfin', 'weekswrk', 'polviews'], axis=1, inplace=True)
# drop NA
data.dropna(inplace=True)

In [3]:
# set X and y
X = pd.get_dummies(data.drop('happy', axis=1), drop_first=True)
y = data['happy'] == 1 # y is "true" when individual is unhappy

In [4]:
# look at % in each class (baseline accuracy)
y.value_counts()/y.count()

False    0.85787
True     0.14213
Name: happy, dtype: float64

In [5]:
# SVM with default parameters
svc = svm.SVC()
cross_val_score(svc, X, y, scoring = 'roc_auc')

array([ 0.58773915,  0.57919891,  0.58257219])

In [6]:
# linear SVC
svc = svm.SVC(kernel='linear')
cross_val_score(svc, X, y, scoring = 'roc_auc')

array([ 0.529038  ,  0.59179651,  0.57902106])

In [5]:
# define param_grid for gridsearch
param_grid = {
    'gamma': [.001,.01,.1,1,10,100],
    'C': [.01,.1,1,10],
    'kernel': ['linear', 'rbf']
}

In [7]:
# gridsearch optimal parameters
svc = svm.SVC()
gs_svc = GridSearchCV(svc, param_grid, n_jobs=-1, verbose=1, scoring = 'roc_auc')
gs_svc.fit(X, y)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  3.4min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'kernel': ['linear', 'rbf'], 'C': [0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [8]:
print gs_svc.best_score_
print gs_svc.best_estimator_

0.597389851148
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


### Score of 0.59 is much worse than our better Logistic Regression and Random Forest models, so SVM does not work well for this problem