In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import roc_auc_score
import time
from datetime import datetime



### Вспомогательные функции

In [2]:
# Build a feature matrix: strip the match-outcome columns and the target from X.
def get_sample(X, target_col=None):
    """Return a copy of ``X`` without match-outcome columns and the target column.

    The dropped columns describe how the match ended (duration, tower and
    barracks status), so they must not be used as features.

    Parameters
    ----------
    X : pandas.DataFrame
        Source table. It is not modified; a new frame is returned.
    target_col : str, optional
        Name of the label column to remove. When omitted, the notebook-level
        variable ``target`` is used, so existing ``get_sample(df)`` calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        ``X`` minus whichever of the outcome/target columns it contains.
    """
    if target_col is None:
        # Resolved at call time: ``target`` is assigned in a later cell.
        target_col = target

    outcome_columns = [
        'duration',
        'tower_status_radiant',
        'tower_status_dire',
        'barracks_status_radiant',
        'barracks_status_dire',
    ]
    # errors='ignore' lets the same helper run on frames that lack some or all
    # of these columns instead of raising KeyError.
    return X.drop(outcome_columns + [target_col], axis=1, errors='ignore')

### Загружаем данные

In [9]:
# Read the train and test feature tables, both indexed by match_id.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='match_id')
    for csv_path in ('data/features.csv', 'data/features_test.csv')
)

### Подготовка данных

In [10]:
# Name of the label column; get_sample() also reads this notebook-level variable.
target = 'radiant_win'

# Labels come from the raw frame, before the target column is stripped out.
y_train = df_train[target]

# Feature matrix with missing values replaced by the sentinel -999.
X_train = get_sample(df_train).fillna(-999)

# Cross-validation splitter: 5 shuffled folds with a fixed seed.
n_samples = len(X_train)
kf = KFold(n_samples, n_folds=5, shuffle=True, random_state=42)

### Подбор оптимального числа деревьев

In [None]:
# Candidate ensemble sizes and the mean cross-validated AUC-ROC for each one.
estimators = [10, 20, 30, 50, 100, 250]
scores = []

for n in estimators:
    print('estimators = %s' % n)
    model = GradientBoostingClassifier(n_estimators=n, random_state=42)

    # Time the whole cross-validated evaluation for this ensemble size.
    start_time = datetime.now()
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=kf, scoring='roc_auc', n_jobs=-1
    )
    mean_cv_score = np.mean(fold_scores)
    elapsed = datetime.now() - start_time

    print('Time elapsed: %s, cv_score: %s' % (str(elapsed), mean_cv_score))
    scores.append(mean_cv_score)

estimators = 10
Time elapsed: 0:00:07.833742, cv_score: 0.6648506879750012
estimators = 20
Time elapsed: 0:00:12.850638, cv_score: 0.6824618768044435
estimators = 30
Time elapsed: 0:00:17.821494, cv_score: 0.6899923040820886
estimators = 50
Time elapsed: 0:00:28.188176, cv_score: 0.697409524655909
estimators = 100
Time elapsed: 0:00:51.460582, cv_score: 0.7062552062171374
estimators = 250


In [None]:
# Plot the mean cross-validation score against the number of trees.
# Only ensemble sizes that actually have a recorded score are drawn: if the
# tuning loop was interrupted, `scores` is shorter than `estimators` and
# plt.plot would raise a ValueError on the length mismatch.
evaluated = estimators[:len(scores)]
plt.plot(evaluated, scores)
plt.xlabel('n_estimators')
plt.ylabel('score')
plt.show()

Кросс-валидация для градиентного бустинга с 30 деревьями заняла 0:00:17.82 (значение AUC-ROC = 0.69).