In [3]:
from itertools import islice

import pandas as pd
import numpy as np
from sklearn import cross_validation, linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

In [4]:
# Read at most the first 300 lines of the sample file; each line is parsed on
# its own with pd.read_json in the next cell.
# islice simply stops at end-of-file, so a file with fewer than 300 lines no
# longer raises StopIteration, and it avoids the Python-2-only xrange.
with open('match_history_sample.json') as f:
    arr = list(islice(f, 300))

In [5]:
# Parse every line into its own DataFrame. A list is built (instead of calling
# map) so that `t` is a real, re-iterable sequence on Python 3 as well as on
# Python 2; a lazy map object would be exhausted after its first use.
t = [pd.read_json(line) for line in arr]

In [6]:
# Stack the per-line frames into a single frame with a fresh 0..n-1 index.
df = pd.concat(t,ignore_index=True)

In [8]:
# Expand the 'matches' column into its own frame (presumably one dict per
# match, giving one column per key -- confirm against the JSON).
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own; the concat
# cell has to be re-run first to restore the frame that has a 'matches' column.
df = pd.DataFrame(df['matches'].tolist())

In [9]:
# Keep only the rows whose 'human_players' value is exactly 10.
df_10player = df.loc[df['human_players'] == 10]

In [10]:
def match_features(players, n_heroes=112):
    """Encode the hero picks of one match as a flat 0/1 vector.

    The vector has ``2 * n_heroes`` slots. For the first five entries of
    ``players`` slot ``hero_id - 1`` is set to 1; for the last five entries
    slot ``n_heroes + hero_id - 1`` is set to 1.

    Parameters
    ----------
    players : sequence of 10 mappings
        Each mapping must provide an integer ``'hero_id'``. Presumably the
        first five entries are the Radiant team and the last five the Dire
        team -- TODO confirm against the match data.
    n_heroes : int, optional
        Number of slots reserved per group of five players. Defaults to 112,
        which yields the original 224-element vector.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(2 * n_heroes,)`` containing only 0 and 1.
    """
    # Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
    vec = np.zeros(2 * n_heroes, dtype=int)
    offsets = np.array([0] * 5 + [n_heroes] * 5, dtype=int)
    # A list comprehension (not map) so the addition also works on Python 3,
    # where map returns an iterator that cannot be added to an array.
    hero_slots = np.array([p['hero_id'] - 1 for p in players], dtype=int)
    # NOTE(review): hero_id is not range-checked. A hero_id of 0 produces index
    # -1, which wraps around to another slot, and an id above n_heroes spills
    # into the other group's half (or raises IndexError for the second group).
    vec[offsets + hero_slots] = 1
    return vec

def matches_features(df):
    """Build the model inputs for a frame of matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'players'`` column, whose cells are each passed to
        ``match_features``, and a ``'radiant_win'`` column.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of ``df``; the columns are the elements of the vector
        returned by ``match_features``.
    y : pandas.Series
        The ``'radiant_win'`` column of ``df``.
    """
    df_feature = df['players'].apply(match_features)
    # Reuse df's index so that X and y are aligned by label as well as by
    # position. Without it X would get a fresh 0..n-1 index while y keeps the
    # (possibly filtered, gappy) index of df.
    X = pd.DataFrame(df_feature.tolist(), index=df.index)
    y = df['radiant_win']
    return X, y

In [11]:
# Build the feature matrix X and the radiant_win target y from the filtered matches.
X, y = matches_features(df_10player)

## Random Forest

In [16]:
# Split the data for the grid search. The fixed seed makes the split, and
# therefore the grid-search scores, repeatable; 50 matches the random_state
# given to the forest.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for the grid search,
# and X_test / y_test are not used in the visible cells -- confirm this is
# intended (e.g. to keep the search fast) rather than a swapped fraction.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.8, random_state=50)

In [13]:
# Candidate hyper-parameter values for the random-forest grid search
# (3 x 3 x 3 = 27 combinations).
tuned_parameters = dict(
    n_estimators=[10, 100, 1000],
    max_features=[5, 'auto', None],
    min_samples_leaf=[1, 50, 150],
)
                    

In [14]:
# Base estimator for the grid search: fixed seed, n_jobs=-1 to use all cores.
clf_rf = RandomForestClassifier(
    random_state=50,
    n_jobs=-1,
)

In [15]:
# 5-fold cross-validated search over every combination in tuned_parameters.
clf = GridSearchCV(estimator=clf_rf, param_grid=tuned_parameters, cv=5)

In [17]:
# Run the grid search on the training split. The repr below shows refit=True,
# so the best setting is refitted on all of X_train once the search is done.
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=50, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 100, 1000], 'max_features': [5, 'auto', None], 'min_samples_leaf': [1, 50, 150]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [18]:
# Parameter combination with the best mean cross-validation score.
clf.best_params_

{'max_features': 'auto', 'min_samples_leaf': 50, 'n_estimators': 100}

In [19]:
# Mean cross-validated score of the best parameter combination
# (scoring=None, i.e. the estimator's own default score).
clf.best_score_

0.57214187327823696

In [20]:
# The forest configured with the best parameter combination.
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=50, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=50, verbose=0, warm_start=False)

In [23]:
# Mean and standard deviation of the CV score for every combination tried.
clf.grid_scores_

[mean: 0.51911, std: 0.01306, params: {'max_features': 5, 'n_estimators': 10, 'min_samples_leaf': 1},
 mean: 0.55303, std: 0.01075, params: {'max_features': 5, 'n_estimators': 100, 'min_samples_leaf': 1},
 mean: 0.56680, std: 0.01256, params: {'max_features': 5, 'n_estimators': 1000, 'min_samples_leaf': 1},
 mean: 0.55561, std: 0.01197, params: {'max_features': 5, 'n_estimators': 10, 'min_samples_leaf': 50},
 mean: 0.55975, std: 0.00442, params: {'max_features': 5, 'n_estimators': 100, 'min_samples_leaf': 50},
 mean: 0.55647, std: 0.00456, params: {'max_features': 5, 'n_estimators': 1000, 'min_samples_leaf': 50},
 mean: 0.53030, std: 0.00411, params: {'max_features': 5, 'n_estimators': 10, 'min_samples_leaf': 150},
 mean: 0.52531, std: 0.00018, params: {'max_features': 5, 'n_estimators': 100, 'min_samples_leaf': 150},
 mean: 0.52531, std: 0.00018, params: {'max_features': 5, 'n_estimators': 1000, 'min_samples_leaf': 150},
 mean: 0.51636, std: 0.01105, params: {'max_features': 'auto', '

In [24]:
# Score a much larger forest (10000 trees, min_samples_leaf=50) with 5-fold CV
# on the full data set. Seeded with the same random_state as clf_rf so the fold
# scores are repeatable from run to run.
rf = RandomForestClassifier(n_estimators=10000, min_samples_leaf=50,
                            n_jobs=-1, random_state=50)
cross_validation.cross_val_score(rf, X, y, cv=5)

array([ 0.58512653,  0.57927354,  0.58667585,  0.58340506,  0.58626033])

In [25]:
# Same 5-fold CV on the full data set with 20000 trees, to check whether more
# trees still change the score. Seeded with the same random_state as clf_rf so
# the fold scores are repeatable from run to run.
rf = RandomForestClassifier(n_estimators=20000, min_samples_leaf=50,
                            n_jobs=-1, random_state=50)
cross_validation.cross_val_score(rf, X, y, cv=5)

array([ 0.58495438,  0.57858495,  0.5865037 ,  0.58374935,  0.58660468])