In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Settings a reader might want to tune.
# NOTE(review): SEED and N_ESTIMATORS are not referenced by the cells visible
# here; presumably they feed a later model-fitting step — confirm before removing.
SEED = 42
N_ESTIMATORS = 1_500

# Folder the train/test CSV files are read from (relative to the working directory).
PATH_TO_DATA = Path('../data')

In [4]:
def _read_indexed_csv(file_name):
    """Read `file_name` from PATH_TO_DATA, using `match_id_hash` as the index."""
    return pd.read_csv(PATH_TO_DATA / file_name, index_col='match_id_hash')


# Target: the `radiant_win` column with True/False mapped to 1/0.
y_train = _read_indexed_csv('train_targets.csv')['radiant_win'].map({True: 1, False: 0})

train_df = _read_indexed_csv('train_features.csv')
test_df = _read_indexed_csv('test_features.csv')

# Train rows are stacked on top of test rows; the train row count is kept so
# the combined frame can be split back by position.
train_size = len(train_df)
full_df = pd.concat([train_df, test_df], sort=False)

In [5]:
# Restrict the combined frame to the columns whose name contains '_hero_'.
hero_columns = list(filter(lambda name: '_hero_' in name, full_df.columns))
full_df = full_df.loc[:, hero_columns]
full_df.head()

Unnamed: 0_level_0,r1_hero_id,r2_hero_id,r3_hero_id,r4_hero_id,r5_hero_id,d1_hero_id,d2_hero_id,d3_hero_id,d4_hero_id,d5_hero_id
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
a400b8f29dece5f4d266f49f1ae2e98a,11,78,14,59,77,12,21,60,84,34
b9c57c450ce74a2af79c9ce96fac144d,15,96,27,63,89,58,14,1,56,92
6db558535151ea18ca70a6892197db41,101,51,44,49,53,18,67,47,40,17
46a0ddce8f7ed2a8d9bd5edcbb925682,14,99,101,26,41,18,98,8,69,86
b1b35ff97723d9b7ade1c9c3cf48f770,42,69,27,104,65,23,22,35,72,1


In [7]:
# Cast every remaining column to str so the values become string labels
# (these labels end up in the dummy column names built from this frame).
full_df = full_df.astype(str)

# Positional split: the first `train_size` rows came from the train file,
# everything after them from the test file.
train_df, test_df = full_df.iloc[:train_size], full_df.iloc[train_size:]


In [8]:
# Replace the ten per-slot hero id columns (r1..r5, d1..d5) with per-team
# indicator columns: `r_hero_<id>` / `d_hero_<id>` hold how many of that team's
# five slots contain hero <id>.
#
# NOTE: this cell consumes the raw `*_hero_id` columns, so it can only be run
# once per kernel session; re-run the loading cells first to repeat it.
raw_hero_columns = []
team_indicators = []
for team in 'r', 'd':
    hero_columns = [f'{team}{i}_hero_id' for i in range(1, 6)]

    # Explicit integer dtype: recent pandas versions return booleans from
    # get_dummies by default, which would turn the sum below into a logical OR
    # and write True/False instead of 0/1.
    slot_dummies = [pd.get_dummies(full_df[c], dtype='uint8') for c in hero_columns]

    # A hero id that never occurs in one of the slots has no dummy column for
    # that slot. Adding frames with different column sets aligns them and
    # fills the whole column for such a hero with NaN, so every slot is first
    # reindexed to the union of hero ids (missing columns filled with 0).
    hero_ids = sorted(set().union(*(dummies.columns for dummies in slot_dummies)))
    team_counts = sum(
        dummies.reindex(columns=hero_ids, fill_value=0) for dummies in slot_dummies
    ).astype('uint8')

    team_indicators.append(team_counts.add_prefix(f'{team}_hero_'))
    raw_hero_columns.extend(hero_columns)

# Build the result in one step instead of concatenating and dropping in place
# inside the loop: remaining columns first, then radiant, then dire indicators.
full_df = pd.concat([full_df.drop(columns=raw_hero_columns), *team_indicators], axis=1)

train_df = full_df.iloc[:train_size, :]
test_df = full_df.iloc[train_size:, :]

In [9]:
# Preview the first five rows of the train part of the encoded frame.
train_df.head(5)

Unnamed: 0_level_0,r_hero_1,r_hero_10,r_hero_100,r_hero_101,r_hero_102,r_hero_103,r_hero_104,r_hero_105,r_hero_106,r_hero_107,...,d_hero_90,d_hero_91,d_hero_92,d_hero_93,d_hero_94,d_hero_95,d_hero_96,d_hero_97,d_hero_98,d_hero_99
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6db558535151ea18ca70a6892197db41,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
b1b35ff97723d9b7ade1c9c3cf48f770,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Preview the first five rows of the combined (train + test) encoded frame.
full_df.head(5)

Unnamed: 0_level_0,r_hero_1,r_hero_10,r_hero_100,r_hero_101,r_hero_102,r_hero_103,r_hero_104,r_hero_105,r_hero_106,r_hero_107,...,d_hero_90,d_hero_91,d_hero_92,d_hero_93,d_hero_94,d_hero_95,d_hero_96,d_hero_97,d_hero_98,d_hero_99
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6db558535151ea18ca70a6892197db41,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
b1b35ff97723d9b7ade1c9c3cf48f770,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Save the combined frame (train rows followed by test rows, index included)
# to a CSV in the current working directory.
full_df.to_csv(Path('hero_names.csv'))