Подход 1: градиентный бустинг.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, make_scorer
import time
import datetime

In [2]:
df = pd.read_csv("features.csv", index_col = "match_id")
df

Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time,duration,radiant_win,tower_status_radiant,tower_status_dire,barracks_status_radiant,barracks_status_dire
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,4,2,2,-52.0,2874,1,1796,0,51,0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,4,3,1,-5.0,2463,1,1974,0,63,1
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,4,3,1,13.0,2130,0,0,1830,0,63
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,4,2,0,27.0,1459,0,1920,2047,50,63
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,3,3,0,-16.0,2449,0,4,1974,3,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114402,1450265551,1,47,4,1706,1198,17,0,1,8,...,4,3,0,-24.0,2032,0,1792,1975,48,63
114403,1450277704,0,43,4,1793,1416,17,0,1,5,...,3,2,0,-17.0,1734,1,2038,6,63,3
114404,1450291848,1,98,4,1399,540,1,0,0,5,...,1,3,1,-15.0,2906,0,1796,1846,51,63
114405,1450292986,1,100,3,1135,766,6,0,2,6,...,3,3,1,-42.0,951,0,2039,2047,63,63


Выделим признаки, не связанные с итогами матча в переменную Х. Целевая переменная содержится в столбце radiant_win, выделим её в у.

In [3]:
X = df.loc[:,"start_time":"dire_first_ward_time"]
y = df.loc[:,"radiant_win"]

Посмотрим какие признаки имеют пропуски в своих данных. По всей видимости, отсутствие значений в этих признаках связано с тем, что не во всех матчах первое убийство или покупка курьера происходили за первые 5 минут. Отсутствующие значения заменим нулями.

In [4]:
print(X.count()[X.count() != 97230])
X = X.fillna(0)

first_blood_time               77677
first_blood_team               77677
first_blood_player1            77677
first_blood_player2            53243
radiant_bottle_time            81539
radiant_courier_time           96538
radiant_flying_courier_time    69751
radiant_first_ward_time        95394
dire_bottle_time               81087
dire_courier_time              96554
dire_flying_courier_time       71132
dire_first_ward_time           95404
dtype: int64


Замерим время кросс-валидации для градиентного бустинга с 10, 20 и 30 деревьями. Метрика качества AUC-ROC.

In [5]:
kf = KFold(n_splits= 5, shuffle = True)
for i in range(10, 40, 10):
    clf = GradientBoostingClassifier(n_estimators= i)
    start_time = datetime.datetime.now()
    res = np.mean(cross_val_score(estimator = clf, X = X, y = y, cv = kf, scoring = make_scorer(roc_auc_score)))
    print("result: {}, trees: {}, time: {} ".format(res, i, datetime.datetime.now() - start_time))

result: 0.6072594682614725, trees: 10, time: 0:01:31.432863 
result: 0.6258430270848787, trees: 20, time: 0:03:00.994154 
result: 0.6319969771873465, trees: 30, time: 0:04:27.549021 


Использование более 30 деревьев сильно замедлит обучение. Чтоб ускорить этот процесс можно сократить количество объектов и уменьшить глубину деревьев.

In [6]:
from sklearn.model_selection import ShuffleSplit
splitter = ShuffleSplit(n_splits = 1, train_size = 0.5)
for train, test in splitter.split(X):
    X_short = X.to_numpy()[train]
    y_short = y.to_numpy()[train]
    
kf = KFold(n_splits= 5, shuffle = True)
for i in range(10, 100, 10):
    clf = GradientBoostingClassifier(n_estimators= i, max_depth = 2)
    start_time = datetime.datetime.now()
    res = np.mean(cross_val_score(estimator = clf, X = X_short, y = y_short, cv = kf, scoring = make_scorer(roc_auc_score)))
    print("result: {}, trees: {}, time: {} ".format(res, i, datetime.datetime.now() - start_time))

result: 0.5935994662288578, trees: 10, time: 0:00:34.637892 
result: 0.617666870793317, trees: 20, time: 0:01:08.846090 
result: 0.6261347951930633, trees: 30, time: 0:01:43.973466 
result: 0.630585447732204, trees: 40, time: 0:02:16.750917 
result: 0.6353370339451131, trees: 50, time: 0:02:54.549803 
result: 0.6372560477314239, trees: 60, time: 0:03:27.784130 
result: 0.6384045752769429, trees: 70, time: 0:04:08.927931 
result: 0.6396300323960918, trees: 80, time: 0:04:51.030437 
result: 0.640797661242594, trees: 90, time: 0:05:36.088810 


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
for coef in np.geomspace(0.0001, 1000, 8):
    LRclf = LogisticRegression(penalty = 'l2', C = coef)
    start_time = datetime.datetime.now()
    res = np.mean(cross_val_score(estimator = LRclf, X = X_scaled, y = y, cv = kf, scoring = make_scorer(roc_auc_score)))
    print("result: {}, C = {}, time: {}".format(res, coef, datetime.datetime.now() - start_time))

result: 0.6497630520605198, C = 0.0001, time: 0:00:02.941211
result: 0.6534532025538347, C = 0.001, time: 0:00:06.158180
result: 0.6532686965738749, C = 0.01, time: 0:00:08.765943
result: 0.6533839200548215, C = 0.1, time: 0:00:08.237208
result: 0.6534886520646228, C = 1.0, time: 0:00:09.771485
result: 0.6537385974160473, C = 10.0, time: 0:00:09.949043
result: 0.6536109792986619, C = 100.0, time: 0:00:09.770117
result: 0.6540144487029924, C = 1000.0, time: 0:00:09.775766


In [8]:
X_scaled = X.drop(['lobby_type', 'r1_hero','r2_hero','r3_hero','r4_hero','r5_hero','d1_hero','d2_hero','d3_hero','d4_hero','d5_hero'], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_scaled)
for coef in np.geomspace(0.0001, 1000, 8):
    LRclf = LogisticRegression(penalty = 'l2', C = coef)
    start_time = datetime.datetime.now()
    res = np.mean(cross_val_score(estimator = LRclf, X = X_scaled, y = y, cv = kf, scoring = make_scorer(roc_auc_score)))
    print("result: {}, C = {}, time: {}".format(res, coef, datetime.datetime.now() - start_time))

result: 0.6495311766872511, C = 0.0001, time: 0:00:02.889119
result: 0.6530670133354406, C = 0.001, time: 0:00:06.056470
result: 0.6536587971535308, C = 0.01, time: 0:00:08.628171
result: 0.6536476848571627, C = 0.1, time: 0:00:08.947779
result: 0.6535426330015672, C = 1.0, time: 0:00:07.634308
result: 0.6537262746408554, C = 10.0, time: 0:00:07.546420
result: 0.6534482667231354, C = 100.0, time: 0:00:08.926855
result: 0.6535430302562271, C = 1000.0, time: 0:00:08.754776


При исключении категориальных признаков из выборки качество почти не изменилось.
Сформируем "мешок слов".

In [9]:
00:08
print("Максимальный индекс героев =", heroes)
X_full = np.zeros((X.shape[0], heroes))
for i, match_id in enumerate(X.index):
    for p in range(5):
        X_full[i, X.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_full[i, X.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

X_full = np.concatenate((X_scaled, X_full), axis = 1)

Максимальный индекс героев = 112


In [10]:
LRclf = LogisticRegression(penalty = 'l2', C = 0.1, max_iter=500)
start_time = datetime.datetime.now()
res = np.mean(cross_val_score(estimator = LRclf, X = X_full, y = y, cv = kf, scoring = make_scorer(roc_auc_score)))
print("result: {}, C = {}, time {}".format(res, 0.1, datetime.datetime.now() - start_time))

result: 0.6817806383915742, C = 0.1, time 0:00:26.320256


In [11]:
X_test = pd.read_csv("features_test.csv", index_col = "match_id")
X_test = X_test.fillna(0)
X_test_scaled = X_test.drop(['lobby_type', 'r1_hero','r2_hero','r3_hero','r4_hero','r5_hero','d1_hero','d2_hero','d3_hero','d4_hero','d5_hero'], axis=1)
X_test_scaled = StandardScaler().fit_transform(X_test_scaled)
X_test_heroes = np.zeros((X_test.shape[0], 112))
for i, match_id in enumerate(X_test.index):
    for p in range(5):
        X_test_heroes[i, X_test.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_test_heroes[i, X_test.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

X_test_scaled = np.concatenate((X_test_scaled, X_test_heroes), axis = 1)

In [17]:
LRclf = LogisticRegression(penalty = 'l2', C = 0.1, max_iter=500)
LRclf.fit(X_full, y)
pred = LRclf.predict_proba(X_test_scaled)[:,1]
print(pred.min(), pred.max())


0.008576941101942013 0.9964600348392587


In [18]:
X_test["radiant_win"] = pred
X_test["radiant_win"].to_csv("result")