In [73]:
from sklearn.ensemble import RandomForestClassifier as RFC
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score as CVS
from sklearn.metrics import roc_auc_score as RAS

In [24]:
df_d = pd.read_csv('data/final/demand_all_features.csv', parse_dates=['date'], infer_datetime_format=True)


In [17]:
df_d['hour'] = df_d['hour'].astype('category')
df_d['cluster'] = df_d['cluster'].astype('category')

In [28]:
jd1 = pd.get_dummies(df_d['hour'])
jd2 = pd.get_dummies(df_d['cluster'])

df_d = pd.concat([df_d, jd1, jd2], axis=1)      
df_d.drop(['hour', 'cluster','Night',0], inplace=True, axis=1)

In [47]:
df_d = df_d.fillna(0)
df_d.ix[df_d['count'] > 0, 'count'] = 1

In [48]:
rf = RFC(n_jobs=-1, n_estimators=100)

In [88]:
train = df_d.ix[df_d['date'].map(lambda x: x.month).isin([1,2,3,4,5,6,7]), ~df_d.columns.isin(['ecosystem','terminal'])]
test = df_d.ix[df_d['date'].map(lambda x: x.month).isin([8]), ~df_d.columns.isin(['ecosystem','terminal'])]

In [89]:
train_x, train_y = train.drop(['count','date'], axis=1).values , train['count'].values
test_x, test_y = test.drop(['count','date'], axis=1).values , test['count'].values

In [90]:
rf.score(test_x, test_y)

0.72395833333333337

In [76]:
RAS(test_y, rf.predict(test_x))

0.72438144801530013

In [81]:
CVS(rf, train_x, train_y, cv=5)

array([ 0.66445822,  0.61857947,  0.64358481,  0.64633799,  0.62793017])

In [91]:
feature_names = df_d.drop(['count', 'date'], axis=1).columns
importances = rf.feature_importances_
important_names = feature_names[importances > np.mean(importances)]
print important_names

Index([u'terminal', u'TEMP', u'PCP01', u'VSB', u'DEWP', u'SLP', u'STP',
       u'dockcount', u'Commute_from_work'],
      dtype='object')


In [92]:
from sklearn.metrics import confusion_matrix

In [93]:
confusion_matrix(test_y, rf.predict(test_x))

array([[2235,  830],
       [1025, 2630]])