In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.grid_search import GridSearchCV

# Single seed shared by every stochastic step that accepts `random_state`
# (the train/test split and the random forest), so that re-runs are repeatable.
rs = 25

In [2]:
# Load the cleaned GSS subset and reduce it to the modelling frame.
# The pipeline is written as one non-mutating chain: the original filtered with a
# boolean mask and then called drop/dropna with inplace=True on that slice, which
# can raise SettingWithCopyWarning and makes intermediate state hard to inspect.
data = pd.read_csv('../Data/gss_subset_cleaned.csv')
data = (
    # keep only rows whose survey year is after 2005
    data[data['year'] > 2005]
    # drop highly correlated columns and unusable columns identified in earlier analysis
    .drop(['paeduc', 'maeduc', 'speduc', 'income', 'satjob', 'goodlife',
           'health', 'year', 'hompop', 'earnrs'], axis=1)
    # drop less important features from first run-through of RF
    .drop(['babies', 'preteen', 'teens', 'divorce', 'dwelling', 'sex'], axis=1)
    # discard every remaining row that still has a missing value
    .dropna()
)

In [3]:
# target: boolean flag that is True wherever the 'happy' code is greater than 1
# NOTE(review): the meaning of the individual 'happy' codes is not visible here — confirm.
y = data['happy'] > 1
# predictors: all other columns, dummy-encoded with the first level of each
# categorical column dropped
X = pd.get_dummies(data.drop(['happy'], axis=1), drop_first=True)

In [4]:
# class balance: fraction of observations that fall in each target class
y.value_counts(normalize=True)

True     0.858185
False    0.141815
Name: happy, dtype: float64

In [5]:
# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class proportions; seeded with `rs` for repeatability.
# `.values` replaces the deprecated `as_matrix()` (removed in pandas 1.0) and
# returns the same NumPy array on old and new pandas alike.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, stratify=y, test_size=0.2, random_state=rs)

In [6]:
# base random forest handed to the grid search: seeded with `rs`, n_jobs=-1 for parallel fitting
rf = RandomForestClassifier(n_jobs=-1, random_state=rs)

In [57]:
# Hyper-parameter grid for the random-forest grid search
# (2 * 3 * 3 * 3 * 3 * 3 = 486 combinations in total).
# 'sqrt' is used instead of 'auto' for max_features: for a classifier the two are
# the same setting, but 'auto' was deprecated and later removed from scikit-learn,
# whereas 'sqrt' is accepted by every release.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [12, 15, 18],
    'n_estimators': [12, 15, 18],
    'min_samples_split': [40, 50, 60],
    'min_samples_leaf': [5, 10, 20],
    'max_features': [5, 10, 'sqrt'],
}

In [58]:
# exhaustive grid search over `param_grid`, wrapped around the base forest;
# cv and scoring are left at the library defaults
gsrf = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)

In [59]:
# Run the search on the training split only. The grid holds 2*3*3*3*3*3 = 486
# parameter combinations, each cross-validated (cv is left at the library default),
# so this is likely the most expensive cell in the notebook.
gsrf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=25, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_leaf': [5, 10, 20], 'n_estimators': [12, 15, 18], 'max_features': [5, 10, 'auto'], 'criterion': ['gini', 'entropy'], 'min_samples_split': [40, 50, 60], 'max_depth': [12, 15, 18]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [60]:
# Show the forest refit with the best-scoring parameter combination found by the search
gsrf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=5, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
            oob_score=False, random_state=25, verbose=0, warm_start=False)

In [61]:
# Score the refit best model on the held-out test split. No `scoring` was given to
# the search, so this is presumably the classifier's default mean accuracy — confirm
# against the scikit-learn version in use.
# NOTE(review): compare this number with the majority-class share from the class-balance
# cell (~0.858); an accuracy at or below that level means the default 0.5 cut-off is
# doing no better than always predicting the majority class.
gsrf.score(X_test, y_test)

0.85720640569395012

In [62]:
# per-class probabilities for every test row (one column per class)
gsrf_proba = gsrf.predict_proba(X_test)
# custom decision rule: predict True only when the second-column probability
# exceeds 0.9 (presumably the column for the True class — confirm via gsrf.classes_)
gsrf_pred = np.greater(gsrf_proba[:, 1], 0.9)

In [63]:
# Evaluate the thresholded predictions against the test labels.
# print(...) with a single parenthesised argument produces the same output on
# Python 2 and Python 3, whereas the bare `print x` statement is a syntax error on 3.
print(classification_report(y_test, gsrf_pred))
# confusion matrix layout: rows are the actual classes, columns the predicted classes
print(confusion_matrix(y_test, gsrf_pred))

             precision    recall  f1-score   support

      False       0.23      0.84      0.36       319
       True       0.95      0.53      0.69      1929

avg / total       0.85      0.58      0.64      2248

[[ 268   51]
 [ 897 1032]]


In [64]:
# pair every encoded feature name with its importance from the refit best forest
features = X.columns
importances = gsrf.best_estimator_.feature_importances_
feat_importances = pd.DataFrame({'importance': importances}, index=features)

In [65]:
# rank the features from most to least important
feat_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
satfin,0.254712
age,0.122917
educ,0.102473
weekswrk,0.07527
polviews,0.071848
sibs,0.065506
marital_married,0.06472
adults,0.048811
childs,0.043557
dwelown_owns,0.032845
