In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
try:
    # scikit-learn >= 0.18: the splitting and search helpers live in model_selection
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # older scikit-learn releases that predate model_selection
    from sklearn.cross_validation import train_test_split
    from sklearn.grid_search import GridSearchCV

# single seed reused for the train/test split and the random forest
# so that repeated runs give the same split and the same trees
rs = 25

In [2]:
# load the cleaned GSS subset and shape it for modelling in one pass:
#   1. keep only rows whose survey year is after 2005
#   2. remove the highly correlated / unusable columns identified in earlier analysis
#   3. discard every row that still has a missing value
data = (
    pd.read_csv('../Data/gss_subset_cleaned.csv')
    .query('year > 2005')
    .drop(['paeduc', 'maeduc', 'speduc', 'income', 'satjob', 'goodlife',
           'health', 'year', 'hompop', 'earnrs'], axis=1)
    .dropna()
)

In [3]:
# feature matrix: every column except the target, with categorical
# columns expanded into dummy indicators
X = data.drop('happy', axis=1).pipe(pd.get_dummies)
# binary target: True where the `happy` code is greater than 1
y = data['happy'].gt(1)

In [14]:
# share of each target class, to see how imbalanced y is
class_counts = y.value_counts()
class_counts / y.count()

True     0.857576
False    0.142424
Name: happy, dtype: float64

In [4]:
# hold out 20% of the rows for testing, stratified on y so that both splits
# keep the same class balance; `.values` replaces the deprecated
# `as_matrix()` (removed in pandas 1.0) and yields the same NumPy arrays
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values,
    stratify=y, test_size=0.2, random_state=rs)

In [5]:
# base random forest: seeded for repeatability and allowed to use every core;
# the remaining hyper-parameters are supplied by the grid search
rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=rs,
)

In [6]:
# hyper-parameter grid for the search: only the split criterion and the
# minimum leaf size vary; depth, forest size and minimum split size are fixed
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[22],
    n_estimators=[15],
    min_samples_split=[50],
    min_samples_leaf=[1, 5, 10],
)

In [7]:
# exhaustive search over param_grid using the seeded forest, run in parallel
# NOTE(review): scoring is left at its default (accuracy for a classifier);
# given how imbalanced y is, confirm that accuracy is the metric to select on
gsrf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    n_jobs=-1,
)

In [8]:
# run the cross-validated search on the training split only
gsrf.fit(X=X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=25, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [15], 'min_samples_split': [50], 'criterion': ['gini', 'entropy'], 'max_depth': [22], 'min_samples_leaf': [1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [9]:
# show the forest that was refit with the best-scoring parameter combination
gsrf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=22, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
            oob_score=False, random_state=25, verbose=0, warm_start=False)

In [10]:
# score the refit best model on the held-out test split; with no `scoring`
# given to the search this is the classifier's default score (mean accuracy)
# NOTE(review): the recorded value is about the same as the majority-class
# share of y, so accuracy alone says little here - check per-class metrics
gsrf.score(X_test, y_test)

0.85668350941662841

In [22]:
# probability cutoff above which an observation is labelled True
# NOTE(review): this cutoff is judged on the test split itself - consider
# choosing it on a validation fold instead
decision_threshold = 0.8
gsrf_proba = gsrf.predict_proba(X_test)
# column 1 is presumably the True class (classes are kept in sorted order);
# confirm against the fitted estimator's classes_
positive_proba = gsrf_proba[:, 1]
gsrf_pred = positive_proba > decision_threshold

In [23]:
# per-class precision / recall / F1 for the thresholded predictions;
# the parenthesised call works on both Python 2 and Python 3, whereas the
# bare `print` statement is a SyntaxError on Python 3
print(classification_report(y_test, gsrf_pred))

             precision    recall  f1-score   support

      False       0.31      0.57      0.40       310
       True       0.92      0.79      0.85      1867

avg / total       0.83      0.76      0.78      2177

