In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

In [3]:
train = pd.read_csv('train_dataset.csv')

In [4]:
train.head()

Unnamed: 0,id,location,fault_severity,event_type 1,event_type 10,event_type 11,event_type 12,event_type 13,event_type 14,event_type 15,...,feature 90 volume,feature 91 volume,feature 92 volume,feature 93 volume,feature 94 volume,feature 95 volume,feature 96 volume,feature 97 volume,feature 98 volume,feature 99 volume
0,14121,location 118,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9320,location 91,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,14394,location 152,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8218,location 931,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,14804,location 120,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
test = pd.read_csv('test_dataset.csv')

In [6]:
train.shape

(7381, 843)

In [7]:
test.shape

(11171, 842)

In [8]:
sample = pd.read_csv('sample_submission.csv')

In [9]:
sample.head()

Unnamed: 0,id,predict_0,predict_1,predict_2
0,11066,0,1,0
1,18000,0,1,0
2,16964,0,1,0
3,4795,0,1,0
4,3392,0,1,0


In [10]:
param = {'n_estimators': list(range(30, 80))}

In [11]:
rfc = RandomForestClassifier(n_jobs=-1)

In [12]:
gs = GridSearchCV(rfc, param)

In [13]:
gs.fit(train.loc[:, 'event_type 1':], train.loc[:, 'fault_severity'])

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [14]:
gs.best_params_

{'n_estimators': 46}

In [16]:
param = {'min_samples_split': list(range(2, 10))}

In [17]:
rfc = RandomForestClassifier(n_estimators=46, n_jobs=-1)

In [19]:
gs.fit(train.loc[:, 'event_type 1':], train.loc[:, 'fault_severity'])

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=46, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [20]:
gs.best_params_

{'min_samples_split': 8}

In [21]:
gs.best_score_

0.74827259178973038

In [23]:
rfc = RandomForestClassifier(n_estimators=65, min_samples_split=8)

In [24]:
rfc.fit(train.loc[:, 'event_type 1':], train.loc[:, 'fault_severity'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=65, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
result = rfc.predict(test.loc[:, 'event_type 1':])

In [27]:
result.shape

(11171,)

In [28]:
rfc.classes_

array([0, 1, 2])

In [29]:
prob = rfc.predict_proba(test.loc[:, 'event_type 1':])

In [30]:
prob.shape

(11171, 3)

In [31]:
rfc_res = pd.DataFrame({
    'id': sample.loc[:, 'id'].values, 
    'predict_0': prob[:, 0],
    'predict_1': prob[:, 1],
    'predict_2': prob[:, 2]})

In [32]:
rfc_res.head()

Unnamed: 0,id,predict_0,predict_1,predict_2
0,11066,1.0,0.0,0.0
1,18000,0.101329,0.013493,0.885179
2,16964,1.0,0.0,0.0
3,4795,0.598084,0.382568,0.019347
4,3392,0.24072,0.75928,0.0


In [34]:
rfc_res.to_csv('randomforest.csv', index=False)