In [7]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

# define random state to use
rs = 25

In [8]:
# import saved data from clean-up done in Random Forest modeling process
data = pd.read_csv('../Data/gss_subset_for_modeling.csv')

In [9]:
# set X and y
X = pd.get_dummies(data.drop('happy', axis=1), drop_first=True)
y = data['happy'] == 1 # y is "true" when individual is unhappy

In [10]:
# look at % in each class (baseline accuracy)
y.value_counts()/y.count()

False    0.857725
True     0.142275
Name: happy, dtype: float64

In [11]:
# do train_test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
        stratify = y, test_size=0.2, random_state=rs)

In [12]:
# SVM with default parameters
svc = svm.SVC()
cross_val_score(svc, X_train, y_train, scoring = 'roc_auc')

array([ 0.55500526,  0.56688425,  0.55736048])

In [13]:
# linear SVC
svc = svm.SVC(kernel='linear')
cross_val_score(svc, X_train, y_train, scoring = 'roc_auc')

array([ 0.54263686,  0.54623807,  0.51044321])

In [14]:
# define param_grid for gridsearch to see if it is able to optimize for better score
param_grid = {
    'gamma': [.001,.01,.1,1,10,100],
    'C': [.01,.1,1,10],
    'kernel': ['linear', 'rbf']
}

In [15]:
# gridsearch optimal parameters
svc = svm.SVC()
gs_svc = GridSearchCV(svc, param_grid, n_jobs=-1, verbose=1, scoring = 'roc_auc')
gs_svc.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  2.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'kernel': ['linear', 'rbf'], 'C': [0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=1)

In [16]:
print gs_svc.best_estimator_
print gs_svc.best_score_
print gs_svc.score(X_test, y_test)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.578663269328




0.589171822428


### Score of 0.58 is much worse than our Logistic Regression and Random Forest models: SVM does not work well for this problem