In [1]:
import pandas
import numpy as np
import math
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import datetime

train = pandas.read_csv('features.csv', index_col='match_id',sep=',')
test = pandas.read_csv('features_test.csv', index_col='match_id',sep=',')
train.head()
test.head()

# Удаление итогов игры
train.drop(['duration', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire']
           , axis=1, inplace=True)

# Целевая переменная
y_train = train['radiant_win']

# Переменные для обучения и тест
del train['radiant_win']

# Пропуски
print(train.info())
print(train.count())
f = open('text.txt', 'w')
for index in train.count():
    f.write(str(index) + '\n')
f.close()

#Заполнение 0 пропущенных значений
X_train_0 = train.fillna(0)
X_test_0 = test.fillna(0)

#Заполнение пропущенных значений средним
X_train_mean = train.fillna(train.mean(axis=0))
X_test_mean = test.fillna(test.mean(axis=0))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97230 entries, 0 to 114406
Columns: 102 entries, start_time to dire_first_ward_time
dtypes: float64(12), int64(90)
memory usage: 76.4 MB
None
start_time                  97230
lobby_type                  97230
r1_hero                     97230
r1_level                    97230
r1_xp                       97230
                            ...  
dire_tpscroll_count         97230
dire_boots_count            97230
dire_ward_observer_count    97230
dire_ward_sentry_count      97230
dire_first_ward_time        95404
Length: 102, dtype: int64


## 1. Градиентный бустинг над деревьями на имеющейся матрице "объекты-признаки"
### 1.1 Заполнение пропусков нулем

In [2]:
# Разделение выборки для кроссвалидации
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores_0 = []
# Кол-ва деревьев для модели
nums = [10, 20, 30, 50, 90, 210]
for n in nums:
    print('Trees: ', str(n))
    model = GradientBoostingClassifier(n_estimators=n, random_state=42)
    start_time = datetime.datetime.now()
    model_scores_0 = cross_val_score(model, X_train_0, y_train, cv=cv, scoring='roc_auc')
    print('Time spent:', datetime.datetime.now() - start_time)
    print(model_scores_0)
    scores_0.append(np.mean(model_scores_0))
    print('')
print('')
print('scores_log')
print (np.round(scores_0,2))

Trees:  10
Time spent: 0:00:34.988935
[0.66383799 0.66635457 0.66360048 0.66529818 0.66516222]

Trees:  20
Time spent: 0:01:09.393350
[0.68083889 0.68272733 0.67969876 0.6834932  0.6855512 ]

Trees:  30
Time spent: 0:01:43.197012
[0.68892093 0.68934663 0.68712298 0.69180598 0.69283583]

Trees:  50
Time spent: 0:02:53.971739
[0.69627399 0.69747879 0.69470891 0.69921915 0.69979097]

Trees:  90
Time spent: 0:05:17.220051
[0.70428355 0.70489618 0.70314489 0.70636257 0.70710108]

Trees:  210
Time spent: 0:12:06.620521
[0.71280325 0.71332323 0.71319499 0.71585593 0.71541683]


scores_log
[0.66 0.68 0.69 0.7  0.71 0.71]


### 1.2 Заполнение пропусков средним

In [3]:
cv_mean = KFold(n_splits=5, shuffle=True, random_state=42)
scores_mean = []
for n in nums:
    print('Trees: ', str(n))
    model_2 = GradientBoostingClassifier(n_estimators=n, random_state=42)
    start_time = datetime.datetime.now()
    model_scores_mean = cross_val_score(model_2, X_train_mean, y_train, cv=cv_mean, scoring='roc_auc')
    print('Time spent:', datetime.datetime.now() - start_time)
    print(model_scores_mean)
    scores_mean.append(np.mean(model_scores_mean))
    print('')
print('')
print('scores_log')
print (np.round(scores_mean,2))

Trees:  10
Time spent: 0:00:35.417463
[0.6628393  0.6690325  0.65778594 0.66141503 0.66504144]

Trees:  20
Time spent: 0:01:10.034536
[0.6809074  0.68456697 0.67803313 0.68191302 0.68453627]

Trees:  30
Time spent: 0:01:45.243503
[0.68679836 0.69049064 0.68466605 0.68999469 0.69272758]

Trees:  50
Time spent: 0:02:53.542708
[0.69548905 0.69712892 0.69344154 0.69845041 0.70048886]

Trees:  90
Time spent: 0:05:11.545837
[0.70276518 0.70472837 0.70250238 0.70626657 0.70825045]

Trees:  210
Time spent: 0:11:59.121584
[0.71262915 0.71314172 0.71273406 0.71571663 0.71633667]


scores_log
[0.66 0.68 0.69 0.7  0.7  0.71]


## 2. Логистическая регрессия

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
X_train_log = StandardScaler().fit_transform(train.fillna(0))
X_test_log = test.fillna(0)
cv_log = KFold(n_splits=5, shuffle=True, random_state=42)

c_use=[0.00001,0.0001,0.001,0.01,0.1,1,10,100]
scores_log=[]
for n in c_use:
    print('Parametr C: ', str(n))
    model_log = LogisticRegression(C=n, random_state=42)
    start_time = datetime.datetime.now()
    model_scores_log = cross_val_score(model_log, X_train_log, y_train, cv=cv_log, scoring='roc_auc', n_jobs=-1)
    print('Time spent:', datetime.datetime.now() - start_time)
    print(model_scores_log)
    scores_log.append(np.mean(model_scores_log))
    print('')
print('')
print('scores_log')
print (np.round(scores_log,6))
max_score = max(scores_log)
max_score_index = scores_log.index(max_score)
print('C: ', c_use[max_score_index], 'score: ', max_score)


Parametr C:  1e-05
Time spent: 0:00:00.903834
[0.69314159 0.6948168  0.69569824 0.69513759 0.6969822 ]

Parametr C:  0.0001
Time spent: 0:00:00.836017
[0.70954847 0.71039283 0.71169451 0.71176631 0.71335968]

Parametr C:  0.001
Time spent: 0:00:01.190113
[0.71449309 0.71577054 0.71625906 0.71697421 0.71831839]

Parametr C:  0.01
Time spent: 0:00:01.504782
[0.71464464 0.71617921 0.71624672 0.71735016 0.7183287 ]

Parametr C:  0.1
Time spent: 0:00:01.530984
[0.71461975 0.71617653 0.71619111 0.71737604 0.71827129]

Parametr C:  1
Time spent: 0:00:01.626288
[0.71461609 0.71617193 0.71618261 0.71737575 0.71826417]

Parametr C:  10
Time spent: 0:00:01.583365
[0.71461633 0.71617159 0.7161817  0.71737581 0.71826403]

Parametr C:  100
Time spent: 0:00:01.696478
[0.71461604 0.71617162 0.71618159 0.7173758  0.718264  ]


scores_log
[0.695155 0.711352 0.716363 0.71655  0.716527 0.716522 0.716522 0.716522]
C:  0.01 score:  0.7165498862352037


In [4]:
# Выборка для обучения
X_train = train.fillna(0)
del X_train['lobby_type']
for n in range(1, 6):
    del X_train['r{}_hero'.format(n)]
    del X_train['d{}_hero'.format(n)]
scaler = StandardScaler()
X_train_log_top = scaler.fit_transform(X_train)

# Выборка для теста
X_test = test.fillna(0)
del X_test['lobby_type']
for n in range(1, 6):
    del X_test['r{}_hero'.format(n)]
    del X_test['d{}_hero'.format(n)]
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

cv_kf = KFold(n_splits=5, shuffle=True, random_state=42)

c_use=[0.01,0.1,1,10,100]
scores_log_cl=[]
for n in c_use:
    print('Parametr C: ', str(n))
    model_log_cl = LogisticRegression(C=n, random_state=42)
    start_time = datetime.datetime.now()
    model_scores_log_cl = cross_val_score(model_log_cl, X_train_log_top, y_train, cv=cv_kf, scoring='roc_auc', n_jobs=-1)
    print('Time spent:', datetime.datetime.now() - start_time)
    print(model_scores_log_cl)
    scores_log_cl.append(np.mean(model_scores_log_cl))
    print('')
print('')
print('scores_log_cl:')
print (scores_log_cl)

max_score = max(scores_log_cl)
max_score_index = scores_log_cl.index(max_score)
print('C: ', c_use[max_score_index], 'score: ', max_score)


Parametr C:  0.01
Time spent: 0:00:01.550043
[0.7145036  0.71621974 0.71627984 0.71725108 0.71854174]

Parametr C:  0.1
Time spent: 0:00:01.493833
[0.71448111 0.71620877 0.71622672 0.71726946 0.71848302]

Parametr C:  1
Time spent: 0:00:01.478214
[0.71447746 0.71620674 0.71621937 0.71727383 0.71847432]

Parametr C:  10
Time spent: 0:00:01.398679
[0.71447697 0.71620771 0.71621935 0.71727417 0.71847447]

Parametr C:  100
Time spent: 0:00:01.662047
[0.71447697 0.71620728 0.71621932 0.71727431 0.71847421]


scores_log_cl:
[0.7165592000076536, 0.7165338144534041, 0.7165303443778955, 0.7165305328738455, 0.7165304163651]
C:  0.01 score:  0.7165592000076536


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

heroes = pandas.read_csv('heroes.csv')
print('Всего героев в игре:', len(heroes))
X_train = train.fillna(0)

# N — количество различных героев в выборке
X_pick = np.zeros((X_train.shape[0], len(heroes)))

for i, match_id in enumerate(X_train.index):
    for p in range(5):
        X_pick[i, X_train.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, X_train.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1


X_hero = pandas.DataFrame(X_pick, index=X_train.index)

X_test = test.fillna(0)
X_pick = np.zeros((X_test.shape[0], len(heroes)))
for i, match_id in enumerate(X_test.index):
    for p in range(5):
        X_pick[i, X_test.loc[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
        X_pick[i, X_test.loc[match_id, 'd%d_hero' % (p + 1)] - 1] = -1

X_test_hero = pandas.DataFrame(X_pick, index=X_test.index)

scaler = StandardScaler()
X_train = pandas.DataFrame(scaler.fit_transform(X_train), index=X_train.index)
X_test = pandas.DataFrame(scaler.transform(X_test), index=X_test.index)

X_train_new = pandas.concat([X_train, X_hero], axis=1)
X_test_new = pandas.concat([X_test, X_test_hero], axis=1)

cv_kf_new = KFold(n_splits=5, shuffle=True, random_state=42)

c_use=[0.00001,0.0001,0.001,0.01,0.1]
scores_log_cl_new=[]
m_r_scrs=[]
for n in c_use:
    start_time = datetime.datetime.now()
    print('C =', str(n))
    model_log_cl_new = LogisticRegression(C=n, random_state=42)
    model_scores_new = cross_val_score(model_log_cl_new, X_train_new, y_train, cv=cv_kf_new, scoring='roc_auc', n_jobs=-1)
    print(model_scores_new)
    print('Time spent ', datetime.datetime.now() - start_time)
    scores_log_cl_new.append(np.mean(model_scores_new))
    print('')
    model_log_cl_new.fit(X_train_new,y_train)
    modeling_radiant=model_log_cl_new.predict_proba(X_test_new)
    print('modeling_radiant')
    print(modeling_radiant)
    m_r_scrs.append(modeling_radiant)
    print('')
print('')
max_score_new = max(scores_log_cl_new)
max_score_index_new = scores_log_cl_new.index(max_score_new)
print('C: ', c_use[max_score_index_new], 'score_max: ', max_score_new)
min_score_new = min(scores_log_cl_new)
min_score_index_new = scores_log_cl_new.index(min_score_new)
print('C: ', c_use[min_score_index_new], 'score_min: ', min_score_new)



Всего героев в игре: 112
C = 1e-05
[0.69728252 0.69882339 0.69957651 0.69932984 0.70103521]
Time spent  0:00:02.757242

modeling_radiant
[[0.49598206 0.50401794]
 [0.44759084 0.55240916]
 [0.54545176 0.45454824]
 ...
 [0.59778068 0.40221932]
 [0.54144285 0.45855715]
 [0.45560663 0.54439337]]

C = 0.0001
[0.72365596 0.72407344 0.72458071 0.72596336 0.72684296]
Time spent  0:00:02.449781

modeling_radiant
[[0.44036783 0.55963217]
 [0.3791194  0.6208806 ]
 [0.62559981 0.37440019]
 ...
 [0.71166352 0.28833648]
 [0.56233839 0.43766161]
 [0.45040777 0.54959223]]

C = 0.001
[0.74472246 0.74668392 0.74398016 0.74886659 0.74736764]
Time spent  0:00:03.462358

modeling_radiant
[[0.26612746 0.73387254]
 [0.30892166 0.69107834]
 [0.74264037 0.25735963]
 ...
 [0.77034    0.22966   ]
 [0.49649748 0.50350252]
 [0.51054876 0.48945124]]

C = 0.01
[0.74947872 0.75274052 0.74909741 0.75544233 0.75171293]
Time spent  0:00:04.894576

modeling_radiant
[[0.18664013 0.81335987]
 [0.25709057 0.74290943]
 [0.79

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
