In [None]:
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the training and test feature tables, both indexed by the 'match_id' column.
data, test = (
    pd.read_csv(csv_name, index_col='match_id')
    for csv_name in ('features.csv', 'features_test.csv')
)

In [None]:
#-------------------Removing unneeded features-----------------------

# Split the target off the training frame; pop() returns the column and
# removes it from `data` in place.
y = data.pop('radiant_win')    # Target variable

# Drop, in place, every remaining training column that the test set lacks.
not_in_test = data.columns[~data.columns.isin(test.columns)].tolist()
data.drop(columns=not_in_test, inplace=True)

In [None]:
#-------------------Handling missing values-------------------

# Look at train and test together so a column with gaps in either set is listed.
# merged_data is built before any filling below, so it keeps the original NaNs.
merged_data = pd.concat([data, test])
cols_with_nan = merged_data.columns[merged_data.isnull().any()]
print(cols_with_nan)    # Features that contain missing values

# Fill through the DataFrame itself. The chained form
# `frame['first_blood_team'].fillna(-1, inplace=True)` operates on a temporary
# Series: pandas 2.x warns about it and with Copy-on-Write it leaves the frame
# unchanged, so the column would end up filled with 0 by the second call.
# 'first_blood_team' gets -1; every other missing value becomes 0.
for frame in (data, test):
    frame.fillna({'first_blood_team': -1}, inplace=True)
    frame.fillna(0, inplace=True)

In [None]:
#--------------------Number of hero identifiers---------------------------

# Column names r1_hero..r5_hero followed by d1_hero..d5_hero.
hero_columns = ['%s%d_hero' % (side, slot) for side in 'rd' for slot in range(1, 6)]

# Largest hero id seen in any of those columns across train + test
# (0 if no column holds a larger value).
heroes = max([0] + [merged_data[name].unique().max() for name in hero_columns])
heroes

In [None]:
#-----------------Building the classifier------------------------

#--------------------Gradient boosting------------------------

# Fixed seed: without it KFold(shuffle=True) produces a different shuffle on
# every cross_val_score call, so the models being compared would be scored on
# different folds and the numbers would change from run to run.
SEED = 42

# 5-fold shuffled splitter; `cv` is reused by the logistic-regression cells.
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Cross-validated ROC AUC and wall-clock time for an increasing number of trees.
for k in [10, 20, 30, 40, 50]:
    print(k, 'trees')
    cls = GradientBoostingClassifier(n_estimators=k, random_state=SEED)
    start_time = datetime.datetime.now()
    cross_scores = cross_val_score(cls, data, y, cv=cv, scoring='roc_auc')
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('Mean score', np.around(cross_scores.mean(), decimals=2))
    print('Min score', np.around(cross_scores.min(), decimals=2))
    print('Max score {}\n'.format(np.around(cross_scores.max(), decimals=2)))

In [None]:
#--------------------Logistic regression------------------------

# Standardise the training features; `scaler` and `scaled_data` stay defined
# at notebook level after this cell.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Sweep C over the powers of ten 1e-5 .. 1e5 and report the cross-validated
# ROC AUC together with the time each fit took.
c_grid = 10.0 ** np.arange(-5, 6)
for c in c_grid:
    print('Regularization parameter', c)
    cls = LogisticRegression(C=c)
    started = datetime.datetime.now()
    cross_scores = cross_val_score(cls, scaled_data, y, cv=cv, scoring='roc_auc')
    elapsed = datetime.datetime.now() - started
    print('Time elapsed:', elapsed)
    print('Mean score', cross_scores.mean())
    print('Min score', np.around(cross_scores.min(), decimals=2))
    print('Max score', np.around(cross_scores.max(), decimals=2), end='\n\n')

In [98]:
#--------------------Logistic regression after feature processing------------------------

# Uses names from earlier cells: merged_data (train rows followed by test rows),
# heroes (largest hero id), data, test, y and cv.

# Columns that are replaced by the bag-of-heroes encoding / removed below.
hero_cols = ['%s%d_hero' % (side, slot) for side in 'rd' for slot in range(1, 6)]
categorical_cols = ['lobby_type'] + hero_cols

# Bag of heroes: one column per hero id; +1 where a radiant player picked the
# hero, -1 where a dire player did. Hero id h goes to column h - 1.
# Whole columns are assigned at once instead of a per-row .loc lookup, which is
# slow and returns a Series (not a scalar) when a match_id occurs twice.
n_train = data.shape[0]
n_matches = merged_data.shape[0]
row_idx = np.arange(n_matches)
X_pick = np.zeros((n_matches, heroes))
for p in range(1, 6):
    X_pick[row_idx, merged_data['r%d_hero' % p].to_numpy() - 1] = 1
    X_pick[row_idx, merged_data['d%d_hero' % p].to_numpy() - 1] = -1

# Remove the categorical columns from both frames. Rebinding with
# errors='ignore' (instead of an in-place drop) keeps the cell re-runnable.
data = data.drop(columns=categorical_cols, errors='ignore')
test = test.drop(columns=categorical_cols, errors='ignore')

# Re-fit the scaler on the reduced training frame. Reusing the scaler and
# scaled_data from the previous cell would keep the scaled categorical columns
# in the training matrix and make scaler.transform(test) fail, because test now
# has fewer columns than that scaler was fitted on.
scaler = StandardScaler()
scaled_data = np.hstack((scaler.fit_transform(data), X_pick[:n_train]))
scaled_test = np.hstack((scaler.transform(test), X_pick[n_train:]))

# Same C sweep as for the plain logistic regression, now on the new features.
for c in np.power(10.0, np.arange(-5, 6)):
    print('Regularization parameter', c)
    cls = LogisticRegression(C=c)
    start_time = datetime.datetime.now()
    cross_scores = cross_val_score(cls, scaled_data, y, cv=cv, scoring='roc_auc')
    print('Time elapsed:', datetime.datetime.now() - start_time)
    print('Mean score', cross_scores.mean())
    print('Min score', np.around(cross_scores.min(), decimals=2))
    print('Max score {}\n'.format(np.around(cross_scores.max(), decimals=2)))

# Final model on the full training set, then win probabilities for the test set.
# NOTE(review): C=10 is hard-coded; presumably picked from the sweep above - confirm.
cls = LogisticRegression(C=10)
cls.fit(scaled_data, y)
pred = cls.predict_proba(scaled_test)[:, 1]   # probability of the second class in cls.classes_
print('Min predicton', np.around(pred.min(), decimals=3))
print('Max predicton', np.around(pred.max(), decimals=3))

Index(['first_blood_time', 'first_blood_team', 'first_blood_player1',
       'first_blood_player2', 'radiant_bottle_time', 'radiant_courier_time',
       'radiant_flying_courier_time', 'radiant_first_ward_time',
       'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time',
       'dire_first_ward_time'],
      dtype='object')
112


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Min predicton 0.009
Max predicton 0.997
