In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

Import the tables:

In [2]:
train_db = pd.read_csv("train_db.csv")
test_db = pd.read_csv("test_db.csv")

In [3]:
print(train_db.shape, test_db.shape)

(327828, 103) (81958, 102)


Define id column and target column names:

In [4]:
id_col = "match_id"
target_col = "radiant_win"

Define player stats we will use as features:

In [5]:
player_stat_columns = ["xp", "gold"]
player_indexes = [str(i) for i in range(1, 6)]
player_teams = ["r", "d"]

In [6]:
all_players_columns = []
for t in player_teams:
    for pi in player_indexes:
        for stat in player_stat_columns:
            all_players_columns.append(t + pi + '_' + stat)
            
print(len(all_players_columns))
print(all_players_columns[:10])

20
['r1_xp', 'r1_gold', 'r2_xp', 'r2_gold', 'r3_xp', 'r3_gold', 'r4_xp', 'r4_gold', 'r5_xp', 'r5_gold']


In [7]:
upd_train_db = train_db[[id_col, target_col] + all_players_columns]
upd_test_db = test_db[[id_col] + all_players_columns]

Let's take a look what do we got in **hero_stats**:

In [8]:
heroes_info = pd.read_json("hero_stats.json").T.reset_index(drop=True)
print(heroes_info.shape)
heroes_info.head(1)

(117, 29)


Unnamed: 0,id,name,localized_name,primary_attr,attack_type,roles,img,icon,base_health,base_health_regen,...,str_gain,agi_gain,int_gain,attack_range,projectile_speed,attack_rate,move_speed,turn_rate,cm_enabled,legs
0,1,npc_dota_hero_antimage,Anti-Mage,agi,Melee,"[Carry, Escape, Nuker]",/apps/dota2/images/heroes/antimage_full.png?,/apps/dota2/images/heroes/antimage_icon.png,200,0.25,...,1.3,3.2,1.8,150,0,1.4,310,0.5,True,2


Will ignore that for now :)

In [9]:
upd_train_db

Unnamed: 0,match_id,radiant_win,r1_xp,r1_gold,r2_xp,r2_gold,r3_xp,r3_gold,r4_xp,r4_gold,...,d1_xp,d1_gold,d2_xp,d2_gold,d3_xp,d3_gold,d4_xp,d4_gold,d5_xp,d5_gold
0,0.0,1,611.0,608.0,1900.0,1494.0,688.0,1219.0,1610.0,1869.0,...,773.0,629.0,1158.0,1056.0,889.0,1322.0,1245.0,1360.0,1924.0,1156.0
1,1.0,0,1911.0,1743.0,1242.0,1413.0,853.0,543.0,1537.0,953.0,...,984.0,760.0,1835.0,1674.0,1507.0,1282.0,1029.0,1135.0,1242.0,1022.0
2,2.0,0,886.0,1127.0,2290.0,2098.0,1150.0,1035.0,793.0,694.0,...,732.0,1035.0,1079.0,1103.0,1243.0,974.0,2097.0,1505.0,1919.0,1592.0
3,3.0,1,1539.0,669.0,2638.0,1750.0,1138.0,1202.0,836.0,1046.0,...,1177.0,1225.0,1250.0,1023.0,1398.0,788.0,903.0,876.0,788.0,932.0
4,4.0,1,1947.0,1378.0,494.0,541.0,951.0,989.0,1367.0,790.0,...,1398.0,620.0,1892.0,1157.0,896.0,551.0,1290.0,1116.0,862.0,1026.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327823,409781.0,0,1144.0,1312.0,743.0,631.0,929.0,872.0,1973.0,1666.0,...,1633.0,1201.0,824.0,908.0,1067.0,1585.0,966.0,1019.0,2515.0,2179.0
327824,409782.0,1,1074.0,598.0,884.0,871.0,838.0,724.0,1612.0,1967.0,...,1320.0,749.0,1442.0,1206.0,1713.0,1797.0,635.0,841.0,346.0,756.0
327825,409783.0,1,1065.0,1451.0,1049.0,1113.0,1962.0,1289.0,1945.0,1723.0,...,1154.0,967.0,1625.0,1217.0,514.0,623.0,2110.0,1311.0,821.0,1067.0
327826,409784.0,1,1140.0,880.0,1228.0,1001.0,1190.0,1591.0,1693.0,1393.0,...,1873.0,1198.0,1457.0,970.0,899.0,1166.0,940.0,624.0,546.0,672.0


Prepare to sum all players gold and exp:

In [10]:
columns_to_process = ["xp", "gold"]
radiant_player_prefixes = ["r" + str(i) + "_" for i in range(1, 6)]
dire_player_prefixes = ["d" + str(i) + "_" for i in range(1, 6)]

In [11]:
upd_train_db = train_db[[id_col, target_col] + all_players_columns]
upd_test_db = test_db[[id_col] + all_players_columns]

Sum them and store as new features:

In [12]:
used_columns = []
pd.options.mode.chained_assignment = None

for db in [upd_train_db, upd_test_db]:
    for c in columns_to_process:
        radiant_columns = [pr + c for pr in radiant_player_prefixes]
        dire_columns = [pr + c for pr in dire_player_prefixes]
        
        db["r_" + c + "_sum"] = db[radiant_columns].sum(axis=1)
        db["d_" + c + "_sum"] = db[dire_columns].sum(axis=1)

        used_columns += radiant_columns + dire_columns


    db.drop(columns=used_columns, inplace=True)

Some small preprocessing:

In [13]:
upd_train_db.fillna(0, inplace=True)
upd_test_db.fillna(0, inplace=True)

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
stsc = StandardScaler()

In [16]:
X_train = upd_train_db.drop(columns=[target_col]).values
Y_train = upd_train_db[target_col].values.reshape(-1)

X_test = upd_test_db.values

X_train = stsc.fit_transform(X_train)
X_test = stsc.fit_transform(X_test)

In [17]:
X_train.shape, X_test.shape

((327828, 5), (81958, 5))

Import the model and metrics:

In [18]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

Estimate the score:

In [19]:
lr = LogisticRegression(penalty="l2", n_jobs=-1)

print(np.mean(cross_val_score(lr, X_train, Y_train, cv=5, scoring='roc_auc', n_jobs=-1)))

0.6510500360775561


Train on the whole dataset:

In [20]:
lr.fit(X_train, Y_train)

LogisticRegression(n_jobs=-1)

Make prediction and dump it to the file:

In [21]:
predictions = lr.predict_proba(X_test)[:, 1]

submit_file = upd_test_db[[id_col]].copy()
submit_file.loc[:, target_col] = predictions
submit_file.to_csv("baseline_submission.csv", index=None)

Upload the file to kaggle.com and observe the score.