Загружаем данные

In [1]:
import pandas

features = pandas.read_csv('d:/features.csv')
features = features.drop('match_id', 1)

Ищем пропущенные данные

In [2]:
total = features.shape[0]
for column, n in features.count().iteritems():
    if n < total:
        print(column, '-', n)

first_blood_time - 77677
first_blood_team - 77677
first_blood_player1 - 77677
first_blood_player2 - 53243
radiant_bottle_time - 81539
radiant_courier_time - 96538
radiant_flying_courier_time - 69751
radiant_first_ward_time - 95394
dire_bottle_time - 81087
dire_courier_time - 96554
dire_flying_courier_time - 71132
dire_first_ward_time - 95404


Пропуски есть в полях:
first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time

first_blood_time, first_blood_team не заполнены, если событие first_blood не успело произойти за первые 5 минут
Пропуски в полях с суффиксом time могут означать то, что данное событие не наступило во время матча

Заполняем пропуски

In [3]:
for column in features:
    features[column].fillna(0, inplace=True)

result = features[
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']]

features = features.drop(
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']
    , 1)

y = result['radiant_win']

Запускаем кросс-валидацию, меряем результат и время обучения

In [5]:
import datetime
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score

for n in [10, 20, 30, 40, 50, 70, 100]:
    start_time = datetime.datetime.now()
    model = GradientBoostingClassifier(n_estimators=n)
    print('N:', n)
    print('Score:', np.mean(cross_val_score(model, features, y, scoring='roc_auc', cv=KFold(5, True))))
    print('Time elapsed:', datetime.datetime.now() - start_time)

N: 10


Score: 0.664538817418
Time elapsed: 0:00:36.428747
N: 20


Score: 0.682312198885
Time elapsed: 0:01:07.615881
N: 30


Score: 0.689047050245
Time elapsed: 0:01:41.391829
N: 40


Score: 0.694394424428
Time elapsed: 0:02:24.932685
N: 50


Score: 0.697258937725
Time elapsed: 0:02:46.624546
N: 70


Score: 0.701989434606
Time elapsed: 0:03:51.518913
N: 100


Score: 0.706657421038
Time elapsed: 0:05:24.972388


Результат:

N: 10
Score: 0.664538817418
Time elapsed: 0:00:36.428747
N: 20
Score: 0.682312198885
Time elapsed: 0:01:07.615881
N: 30
Score: 0.689047050245
Time elapsed: 0:01:41.391829
N: 40
Score: 0.694394424428
Time elapsed: 0:02:24.932685
N: 50
Score: 0.697258937725
Time elapsed: 0:02:46.624546
N: 70
Score: 0.701989434606
Time elapsed: 0:03:51.518913
N: 100
Score: 0.706657421038
Time elapsed: 0:05:24.972388

Для увеличения скорости при большом количестве деревьев можно проводить обучение на подвыборке
Увеличение числа деревьев положительно сказывается на качестве

Подготовка данных и использованием "мешка слов" по героям. Очень долгая операция

In [None]:
import pandas

features = pandas.read_csv('d:/features.csv')

for i in range(112):
    features['hero_%d' % i] = 0

for i in features.index:
    for p in range(5):
        rHeroId = features['r%d_hero' % (p + 1)][i] - 1
        dHeroId = features['d%d_hero' % (p + 1)][i] - 1
        features.loc[i, 'hero_%d' % rHeroId] = 1
        features.loc[i, 'hero_%d' % dHeroId] = -1
    if (i % 1000) == 0:
        print(i)

features.to_csv('d:/features_new.csv')

Запуск на необработаных данных

In [13]:
import datetime
import pandas
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

features = pandas.read_csv('d:/features.csv')
features = features.drop('match_id', 1)

for column in features:
    features[column].fillna(0, inplace=True)

result = features[
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
     'barracks_status_dire']]

features = features.drop(
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
     'barracks_status_dire'], 1)

y = result['radiant_win']

features = StandardScaler().fit_transform(features)

start_time = datetime.datetime.now()
for c in [0.0001, 0.001, 0.01, 0.1, 1, 10]:
    model = LogisticRegression(C=c)
    print('C:', c)
    print('Score:', np.mean(cross_val_score(model, features, y, scoring='roc_auc', cv=KFold(5, True))))
    print('Time elapsed:', datetime.datetime.now() - start_time)

C: 0.0001


Score: 0.711219438464
Time elapsed: 0:00:07.587276
C: 0.001


Score: 0.716294750484
Time elapsed: 0:00:21.687790
C: 0.01


Score: 0.71628257256
Time elapsed: 0:00:40.120541
C: 0.1


Score: 0.716361585729
Time elapsed: 0:00:59.634154
C: 1


Score: 0.716440172266
Time elapsed: 0:01:17.472883
C: 10


Score: 0.716395627349
Time elapsed: 0:01:39.049593


Результат:

C: 0.0001
Score: 0.711356722518
Time elapsed: 0:00:07.535012
C: 0.001
Score: 0.716257272205
Time elapsed: 0:00:21.240223
C: 0.01
Score: 0.716402881464
Time elapsed: 0:00:39.361165
C: 0.1
Score: 0.716561214872
Time elapsed: 0:00:59.234872
C: 1
Score: 0.716362982712
Time elapsed: 0:01:18.596599
C: 10
Score: 0.716420149552
Time elapsed: 0:01:37.920477

Логистическая регрессия работает существенно быстрее и дает чуть лучший результат

In [14]:
import datetime
import pandas
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

features = pandas.read_csv('d:/features.csv')

features = features.drop('match_id', 1)
features = features.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero',
                          'd3_hero', 'd4_hero', 'd5_hero'], 1)

for column in features:
    features[column].fillna(0, inplace=True)

result = features[
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
     'barracks_status_dire']]

features = features.drop(
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
     'barracks_status_dire'], 1)

y = result['radiant_win']

features = StandardScaler().fit_transform(features)

start_time = datetime.datetime.now()
model = LogisticRegression(C=0.1)
print('Score:', np.mean(cross_val_score(model, features, y, scoring='roc_auc', cv=KFold(5, True))))
print('Time elapsed:', datetime.datetime.now() - start_time)

Score: 0.716482274514
Time elapsed: 0:00:17.961548


Удаление категориальных признаков не сказывается на качестве модели. Алгоритм не может использовать информацию из этих признаков в таком виде

Определяем количество различных идентификаторов героев

In [22]:
features = pandas.read_csv('d:/features.csv')
print(features['r1_hero'].nunique())
print(min(features['r1_hero'].values), '-', max(features['r1_hero'].values))
print('Except:')
for i in range(1, max(features['r1_hero'].values)):
    if i not in features['r1_hero'].values:
        print(i)


108
1 - 112
Except:
24
107
108
111


Результат:

108
1 - 112
Except:
24
107
108
111

In [15]:
import datetime
import pandas
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

features = pandas.read_csv('d:/features_new.csv')

features = features.drop(['match_id', 'Unnamed: 0'], 1)
features = features.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero',
                          'd3_hero', 'd4_hero', 'd5_hero'], 1)

for column in features:
    features[column].fillna(0, inplace=True)

result = features[
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
     'barracks_status_dire']]

features = features.drop(
    ['duration', 'radiant_win', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
     'barracks_status_dire'], 1)

y = result['radiant_win']

scaler = StandardScaler()
features = scaler.fit_transform(features)

start_time = datetime.datetime.now()
model = LogisticRegression(C=0.1)
print('Score:', np.mean(cross_val_score(model, features, y, scoring='roc_auc', cv=KFold(5, True))))
print('Time elapsed:', datetime.datetime.now() - start_time)

Score: 0.751928859719
Time elapsed: 0:00:34.616399


Результат:
Score: 0.751928859719
Time elapsed: 0:00:34.616399


Обучаем модель на всех данных
Считаем минимальное и максимальное значение прогноза

In [16]:
model = LogisticRegression(C=0.1)
model.fit(features, y)

features_test = pandas.read_csv('d:/features_test_new.csv')
features_test = features_test.drop(['Unnamed: 0', 'match_id'], 1)
features_test = features_test.drop(
    ['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero',
     'd3_hero', 'd4_hero', 'd5_hero'], 1)

for column in features_test:
    features_test[column].fillna(0, inplace=True)

features_test = scaler.fit_transform(features_test)
y_test = model.predict_proba(features_test)

maxX, maxD, minX, minD = 0, 0, 1, 1

for y in y_test:
    if abs(y[0] - y[1]) > maxD:
        maxD = abs(y[0] - y[1])
        maxX = max(y[0], y[1])
    if abs(y[0] - y[1]) < minD:
        minD = abs(y[0] - y[1])
        minX = max(y[0], y[1])

print(minX, maxX)

0.500019866427 0.996471254335


Результат:
0.500019866427 - минимальное значение прогноза
0.996471254335 - максимальное значение прогноза