In [2]:
import pandas
import sklearn
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
import time
import datetime
from sklearn.cross_validation import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

Загрузка и предобработка данных:

In [3]:
features = pandas.read_csv('D:\\features.csv')

nulls = []
for i in range(0, features.columns.size):
    if(features.isnull().any()[i] == True):
        nulls = nulls + [features.columns[i]]

features = features.fillna(0)
y = features['radiant_win']
finish_data = ["duration", "radiant_win", "tower_status_radiant", "tower_status_dire", "barracks_status_dire", 
               "barracks_status_radiant"]
for i in finish_data:
    del features[i]

X = features.as_matrix()

Список признаков, имеющих пропуски:

In [24]:
nulls

['first_blood_time',
 'first_blood_team',
 'first_blood_player1',
 'first_blood_player2',
 'radiant_bottle_time',
 'radiant_courier_time',
 'radiant_flying_courier_time',
 'radiant_first_ward_time',
 'dire_bottle_time',
 'dire_courier_time',
 'dire_flying_courier_time',
 'dire_first_ward_time']

Градиентный бустинг на 10, 20, 30 и 50 деревьях:

In [74]:
kf = KFold(n = y.size, n_folds = 5, shuffle = True)
n_trees = [10, 20, 30, 50]

for i in n_trees:
    clf = GradientBoostingClassifier(n_estimators = i)
    acc = np.arange(5, dtype = np.float)
    k = 0
    start_time = datetime.datetime.now()
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]
        acc[k] = roc_auc_score(y_test, y_pred)
        k = k + 1
    print 'Number of trees:', i
    print 'Time elapsed:', datetime.datetime.now() - start_time
    print 'Accuracy:', acc.sum() / acc.size

Number of trees: 10
Time elapsed: 0:00:37.839000
Accuracy: 0.664510618674
Number of trees: 20
Time elapsed: 0:01:10.799000
Accuracy: 0.682445695009
Number of trees: 30
Time elapsed: 0:01:44.403000
Accuracy: 0.689098257857
Number of trees: 50
Time elapsed: 0:02:57.970000
Accuracy: 0.697154100919


Создание обучающей выборки для двух случаев: обучения по всем признакам (матрица Х) и для обучения на части признаков, исключая категориальные (матрица Х1)

In [7]:
X = features.as_matrix()
X = StandardScaler().fit_transform(X)
categ = ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero',
         'd5_hero']
features1 = features.copy()
for i in categ:
    del features1[i]
    
X1 = features1.as_matrix()
X1 = StandardScaler().fit_transform(X1)

Обучение и результаты для полной матрицы признаков:

In [10]:
X = features.as_matrix()
X = StandardScaler().fit_transform(X)
kf = KFold(n = y.size, n_folds = 5, shuffle = True)
param = [0.001, 0.01, 0.1, 1, 10, 100]
for i in param:
    clf = LogisticRegression(C = i)
    acc = np.arange(5, dtype = np.float)
    k = 0
    start_time = datetime.datetime.now()
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]
        acc[k] = roc_auc_score(y_test, y_pred)
        k = k + 1
    print 'C value:', i
    print 'Time elapsed:', datetime.datetime.now() - start_time
    print 'Accuracy:', acc.sum() / acc.size

C value: 0.001
Time elapsed: 0:00:11.163000
Accuracy: 0.716475779013
C value: 0.01
Time elapsed: 0:00:15.578000
Accuracy: 0.716657407178
C value: 0.1
Time elapsed: 0:00:16.378000
Accuracy: 0.716629973006
C value: 1
Time elapsed: 0:00:16.240000
Accuracy: 0.716625981932
C value: 10
Time elapsed: 0:00:16.875000
Accuracy: 0.716625652393
C value: 100
Time elapsed: 0:00:17.324000
Accuracy: 0.716625570826


Обучение и результаты для неполной матрицы признаков:

In [11]:
X = X1
kf = KFold(n = y.size, n_folds = 5, shuffle = True)
param = [0.001, 0.01, 0.1, 1, 10, 100]
for i in param:
    clf = LogisticRegression(C = i)
    acc = np.arange(5, dtype = np.float)
    k = 0
    start_time = datetime.datetime.now()
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]
        acc[k] = roc_auc_score(y_test, y_pred)
        k = k + 1
    print 'C value:', i
    print 'Time elapsed:', datetime.datetime.now() - start_time
    print 'Accuracy:', acc.sum() / acc.size

C value: 0.001
Time elapsed: 0:00:10.656000
Accuracy: 0.716261261303
C value: 0.01
Time elapsed: 0:00:14.198000
Accuracy: 0.716408590554
C value: 0.1
Time elapsed: 0:00:13.066000
Accuracy: 0.716376568043
C value: 1
Time elapsed: 0:00:13.118000
Accuracy: 0.716370593845
C value: 10
Time elapsed: 0:00:13.318000
Accuracy: 0.716370015492
C value: 100
Time elapsed: 0:00:13.538000
Accuracy: 0.716369989019


Нахождение количества уникальных идентификаторов героев:

In [120]:
heroes = features[categ]
del heroes['lobby_type']
np.unique(heroes)


array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112], dtype=int64)

Хотя размер массива равен 108, последний идентификатор равен 112, что наводит на мысль, что идентификаторов на самом деле 112, а не 108. Отсутствие же некоторых идентификаторов в массиве можно объяснить тем, что никто не выбирал этих 4 героев в тех матчах, которые попали в обучающую выборку.

Создание "мешка слов" и обучение на новом формате:

In [12]:
N = 112
X_pick = np.zeros((features.shape[0], N))
for i, match_id in enumerate(features.index):
    for p in xrange(5):
        X_pick[i, features.ix[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, features.ix[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [13]:
X = features.as_matrix()
X = StandardScaler().fit_transform(X)
kf = KFold(n = y.size, n_folds = 5, shuffle = True)
param = [0.001, 0.01, 0.1, 1, 10, 100]
for i in param:
    clf = LogisticRegression(C = i)
    acc = np.arange(5, dtype = np.float)
    k = 0
    start_time = datetime.datetime.now()
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]
        acc[k] = roc_auc_score(y_test, y_pred)
        k = k + 1
    print 'C value:', i
    print 'Time elapsed:', datetime.datetime.now() - start_time
    print 'Accuracy:', acc.sum() / acc.size

C value: 0.001
Time elapsed: 0:00:11.220000
Accuracy: 0.716298988847
C value: 0.01
Time elapsed: 0:00:15.430000
Accuracy: 0.716457563171
C value: 0.1
Time elapsed: 0:00:15.613000
Accuracy: 0.716423803707
C value: 1
Time elapsed: 0:00:15.641000
Accuracy: 0.716419524549
C value: 10
Time elapsed: 0:00:15.437000
Accuracy: 0.71641888359
C value: 100
Time elapsed: 0:00:15.310000
Accuracy: 0.716418787084


Наилучший результат по метрике AUC_ROC дает логистическая регрессия с параметром С = 0.01. Интересно, что более "правильная" обработка категориальных признаков не улучшает результат. Таким образом, итоговый алгоритм обучим на данных без обработки категориальных признаков, они все равно не дают вклада.

In [14]:
X = features.as_matrix()
X = StandardScaler().fit_transform(X)
clf = LogisticRegression(C = 0.01)
clf.fit(X, y)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [141]:
features_test = pandas.read_csv('D:\\features_test.csv')
features_test = features_test.fillna(0)

X_test = features_test.as_matrix()
X_test = StandardScaler().fit_transform(X_test)

probas = clf.predict_proba(X_test)

In [146]:
probas

array([[ 0.45630651,  0.54369349],
       [ 0.33700614,  0.66299386],
       [ 0.6528789 ,  0.3471211 ],
       ..., 
       [ 0.72428396,  0.27571604],
       [ 0.62218943,  0.37781057],
       [ 0.44980786,  0.55019214]])

In [145]:
probas.max()

0.99236820206422449

In [147]:
probas.min()

0.0076317979357755153