In [1]:
# LB Score = 0.83793

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [3]:
df_train_features = pd.read_csv('data/train_features.csv', index_col='match_id_hash')
df_test_features = pd.read_csv('data/test_features.csv', index_col='match_id_hash')

In [4]:
df_targets = pd.read_csv('data/train_targets.csv')

In [5]:
split_index = df_train_features.shape[0]

In [6]:
df = pd.concat([df_train_features, df_test_features])

In [7]:
df_targets.head()

Unnamed: 0,match_id_hash,game_time,radiant_win,duration,time_remaining,next_roshan_team
0,a400b8f29dece5f4d266f49f1ae2e98a,155,False,992,837,
1,b9c57c450ce74a2af79c9ce96fac144d,658,True,1154,496,
2,6db558535151ea18ca70a6892197db41,21,True,1503,1482,Radiant
3,46a0ddce8f7ed2a8d9bd5edcbb925682,576,True,1952,1376,
4,b1b35ff97723d9b7ade1c9c3cf48f770,453,False,2001,1548,


In [8]:
y_train = df_targets['radiant_win']

In [9]:
new_features = pd.DataFrame()

In [10]:
r_kills_columns = [col for col in df if col.endswith('kills')][:5]
d_kills_columns = [col for col in df if col.endswith('kills')][5:]
new_features['r_kills_mean'] = df[r_kills_columns].mean(axis = 1)
new_features['r_kills_std'] = df[r_kills_columns].std(axis = 1)
new_features['r_kills_total'] = df[r_kills_columns].sum(axis = 1)
new_features['d_kills_mean'] = df[d_kills_columns].mean(axis = 1)
new_features['d_kills_std'] = df[d_kills_columns].std(axis = 1)
new_features['d_kills_total'] = df[d_kills_columns].sum(axis = 1)
new_features['kills_diff'] = new_features['d_kills_total'] - new_features['r_kills_total']

In [11]:
df.shape

(49675, 245)

In [12]:
r_gold_columns = [col for col in df if col.endswith('gold')][:5]
d_gold_columns = [col for col in df if col.endswith('gold')][5:]
new_features['r_gold_mean'] = df[r_gold_columns].mean(axis = 1)
new_features['r_gold_std'] = df[r_gold_columns].std(axis = 1)
new_features['r_gold_total'] = df[r_gold_columns].sum(axis = 1)
new_features['d_gold_mean'] = df[d_gold_columns].mean(axis = 1)
new_features['d_gold_std'] = df[d_gold_columns].std(axis = 1)
new_features['d_gold_total'] = df[d_gold_columns].sum(axis = 1)
new_features['gold_prop'] = new_features['d_gold_total'] / new_features['r_gold_total']
new_features['gold_prop1'] = new_features['r_gold_total'] / new_features['d_gold_total']

In [13]:
r_stuns_columns = [col for col in df if col.endswith('stuns')][:5]
d_stuns_columns = [col for col in df if col.endswith('stuns')][5:]

In [14]:
new_features['r_stuns_total'] = df[r_stuns_columns].sum(axis = 1) + 1
new_features['d_stuns_total'] = df[d_stuns_columns].sum(axis = 1) + 1


In [15]:
r_max_health_columns = [col for col in df if col.endswith('max_health')][:5]
d_max_health_columns = [col for col in df if col.endswith('max_health')][5:]
new_features['d_max_health_total'] = df[r_max_health_columns].sum(axis = 1)
new_features['r_max_health_total'] = df[d_max_health_columns].sum(axis = 1)
new_features['max_health_prop'] = new_features['r_max_health_total'] / new_features['d_max_health_total']


In [16]:
r_max_mana_columns = [col for col in df if col.endswith('max_mana')][:5]
d_max_mana_columns = [col for col in df if col.endswith('max_mana')][5:]
new_features['d_max_mana_total'] = df[r_max_mana_columns].sum(axis = 1)
new_features['r_max_mana_total'] = df[d_max_mana_columns].sum(axis = 1)
new_features['max_mana_prop'] = new_features['r_max_mana_total'] / new_features['d_max_mana_total']

In [17]:
new_features['d_xp_total'] = df[[col for col in df if col.endswith('xp')][:5]].sum(axis = 1) + 1
new_features['r_xp_total'] = df[[col for col in df if col.endswith('xp')][5:]].sum(axis = 1) + 1

In [18]:
new_features['d_xp_std'] = df[[col for col in df if col.endswith('xp')][:5]].std(axis = 1)
new_features['r_xp_std'] = df[[col for col in df if col.endswith('xp')][5:]].std(axis = 1)

In [19]:
new_features['xp_prop'] = new_features['d_xp_total'] / new_features['r_xp_total']

In [20]:
new_features['d_obs_total'] = df[[col for col in df if col.endswith('obs_placed')][:5]].sum(axis = 1)
new_features['r_obs_total'] = df[[col for col in df if col.endswith('obs_placed')][5:]].sum(axis = 1)
#new_features['obs_prop'] = new_features['d_obs_total']/ new_features['r_obs_total']

In [21]:
r_towers_killed_columns = [col for col in df if col.endswith('towers_killed')][:5]
d_towers_killed_columns = [col for col in df if col.endswith('towers_killed')][5:]
new_features['d_towers'] = df[d_towers_killed_columns].sum(axis = 1) + 1
new_features['r_towers'] = df[r_towers_killed_columns].sum(axis = 1) + 1
new_features['towers_prop'] = new_features['d_towers'] / new_features['r_towers']

In [22]:
r_level_columns = [col for col in df if col.endswith('level')][:5]
d_level_columns = [col for col in df if col.endswith('level')][5:]
new_features['r_max_level'] = df[r_level_columns].max(axis = 1)
new_features['d_max_level'] = df[d_level_columns].max(axis = 1)
new_features['max_level_prop'] = new_features['d_max_level'] / new_features['r_max_level']

In [23]:
new_features['r_mean_level'] = df[r_level_columns].mean(axis = 1)
new_features['d_mean_level'] = df[d_level_columns].mean(axis = 1)
#new_features['mean_level_prop'] = new_features['r_mean_level'] / new_features['d_mean_level']

In [24]:
new_features['r_min_level'] = df[r_level_columns].min(axis = 1)
new_features['d_min_level'] = df[d_level_columns].min(axis = 1)
#new_features['min_level_prop'] = new_features['r_min_level'] / new_features['d_min_level']

In [25]:
r_roshans_killes_columns = [col for col in df if col.endswith('roshans_killed')][:5]
d_roshans_killes_columns = [col for col in df if col.endswith('roshans_killed')][5:]
new_features['r_roshans'] = df[r_roshans_killes_columns].sum(axis = 1) + 1
new_features['d_roshans'] = df[d_roshans_killes_columns].sum(axis = 1) + 1
#new_features['roshans_prop'] = new_features['r_roshans'] / new_features['d_roshans']

In [26]:
r_x_columns = [col for col in df if col.endswith('x')][:5]
d_x_columns = [col for col in df if col.endswith('x')][5:]

In [27]:
new_features['r_x_mean'] = df[r_x_columns].mean(axis = 1)
new_features['d_x_mean'] = df[d_x_columns].mean(axis = 1)
new_features['r_x_max'] = df[r_x_columns].max(axis = 1)
new_features['d_x_max'] = df[d_x_columns].max(axis = 1)
new_features['r_x_min'] = df[r_x_columns].min(axis = 1)
new_features['d_x_min'] = df[d_x_columns].min(axis = 1)

In [28]:
r_y_columns = [col for col in df if col.endswith('y')][:5]
d_y_columns = [col for col in df if col.endswith('y')][5:]

In [29]:
new_features['r_y_mean'] = df[r_y_columns].mean(axis = 1)
new_features['d_y_mean'] = df[d_y_columns].mean(axis = 1)

In [30]:
new_features['r_y_max'] = df[r_y_columns].max(axis = 1)
new_features['d_y_max'] = df[d_y_columns].max(axis = 1)
new_features['r_y_min'] = df[r_y_columns].min(axis = 1)
new_features['d_y_min'] = df[d_y_columns].min(axis = 1)

In [31]:
# coordinates of radiant start point
r_x_min = df[['r1_x', 'r2_x','r3_x','r4_x','r5_x']].min(axis = 1)
r_y_min = df[['r1_y', 'r2_y','r3_y','r4_y','r5_y']].min(axis = 1)

In [32]:
r_x_max = df[['r1_x', 'r2_x','r3_x','r4_x','r5_x']].max(axis = 1)
r_y_max = df[['r1_y', 'r2_y','r3_y','r4_y','r5_y']].max(axis = 1)

In [33]:
new_features['r1_dist'] = np.sqrt((df['r1_y'] - r_y_min)**2 + (df['r1_x'] - r_x_min)**2)
new_features['r2_dist'] = np.sqrt((df['r2_y'] - r_y_min)**2 + (df['r2_x'] - r_x_min)**2)
new_features['r3_dist'] = np.sqrt((df['r3_y'] - r_y_min)**2 + (df['r3_x'] - r_x_min)**2)
new_features['r4_dist'] = np.sqrt((df['r4_y'] - r_y_min)**2 + (df['r4_x'] - r_x_min)**2)
new_features['r5_dist'] = np.sqrt((df['r5_y'] - r_y_min)**2 + (df['r5_x'] - r_x_min)**2)

In [34]:
r_avg_dist = new_features[['r1_dist', 'r2_dist', 'r3_dist', 'r4_dist', 'r5_dist']].mean(axis = 1)

In [35]:
new_features['d1_dist'] = np.sqrt((df['d1_y'] - r_y_max)**2 + (df['d1_x'] - r_x_max)**2)
new_features['d2_dist'] = np.sqrt((df['d2_y'] - r_y_max)**2 + (df['d2_x'] - r_x_max)**2)
new_features['d3_dist'] = np.sqrt((df['d3_y'] - r_y_max)**2 + (df['d3_x'] - r_x_max)**2)
new_features['d4_dist'] = np.sqrt((df['d4_y'] - r_y_max)**2 + (df['d4_x'] - r_x_max)**2)
new_features['d5_dist'] = np.sqrt((df['d5_y'] - r_y_max)**2 + (df['d5_x'] - r_x_max)**2)

In [36]:
d_avg_dist = new_features[['d1_dist', 'd2_dist', 'd3_dist', 'd4_dist', 'd5_dist']].mean(axis = 1)

In [37]:
new_features['dist_prop'] =  d_avg_dist

In [38]:
#r_runes = df[[col for col in df if col.endswith('rune_pickups')][:5]].sum(axis = 1) + 1
#d_runes = df[[col for col in df if col.endswith('rune_pickups')][5:]].sum(axis = 1) + 1
#new_features['runes_prop'] = d_runes / r_runes

In [39]:
df.head()

Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d5_stuns,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,7,...,0.0,0,0,0,0,0.0,0,0,0,0
6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,1,...,8.664527,3,1,3,0,0.0,0,0,2,0
b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,0,...,0.0,2,1,2,0,0.25,0,0,0,0


In [40]:
#columns = df.columns[-53:]

In [41]:
new_features = new_features.drop(['r1_dist', 'r2_dist', 'r3_dist','r4_dist','r5_dist',
                                  'd1_dist', 'd2_dist', 'd3_dist','d4_dist','d5_dist',
                                  'd_xp_total'], axis = 1)

In [42]:
X_train = new_features[:split_index]

In [43]:
X_test = new_features[split_index:]

In [44]:
y_train = df_targets['radiant_win'] * 1

In [45]:
X_train.shape, y_train.shape

((39675, 54), (39675,))

In [46]:
rf = RandomForestClassifier(n_estimators=100, max_depth=15, min_samples_leaf = 5, random_state=17)

In [47]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=17, verbose=0,
                       warm_start=False)

In [48]:
cross_val_score(rf, X_train, y_train, cv = 5, n_jobs = -1, scoring='roc_auc').mean()

0.8189803655571639

In [49]:
# 0.8189803655571639 - max level prop

In [50]:
# 0.8185534881414543 - max tower prop

In [51]:
# 0.8184250096324922 - max mana prop

In [52]:
# 0.8182711584824215

In [53]:
# 0.8184619965576416 - max health proportion

In [54]:
# 0.8181779266124078 - xp_gold removed

In [55]:
# 0.8180850738287315 - xp_proportion

In [56]:
#0.8138925440421548 - d_dist

In [57]:
# 0.8124434204241302

In [58]:
def prepare_submit(model, x_test, file_name = 'submission'):
    y_test_pred= model.predict_proba(X_test)[:, 1]
    df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred}, 
                                 index=df_test_features.index)
    df_submission.to_csv('first_pred_all_data.csv')

KeyError: 'r1_'