In [14]:
# First attempt at logistic regression, step by step.

import pandas as pd
import numpy as np
import os
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv('../../../../GitHub_Data/Dota2Matches/match_outcomes.csv')
df.head()

Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad
0,1636204962,34549,0,0,-51743434,-120875154,1437014585,12,1,0
1,1636204962,0,61598,138825,0,207232,1437014585,12,0,1
2,1636322679,0,-44943233,-240360907,19599,0,1437019968,12,0,0
3,1636322679,-97530201,0,0,0,-116349387,1437019968,12,1,1
4,1637385965,0,0,0,104738,0,1437052551,12,1,0


In [16]:
# Messy. Let's just work with player info. drop parser_version and start_time, for starters
df.drop(['parser_version', 'start_time'], axis=1, inplace=True)

In [17]:
df.columns

Index(['match_id', 'account_id_0', 'account_id_1', 'account_id_2',
       'account_id_3', 'account_id_4', 'win', 'rad'],
      dtype='object')

In [18]:
df.shape

(1828588, 8)

In [19]:
df = df.melt(id_vars=['match_id','win','rad'], \
    value_vars=['account_id_0', 'account_id_1', 'account_id_2', 'account_id_3', 'account_id_4'], \
        var_name='PlayerNumber', value_name='PlayerAccountID')

df.head()

Unnamed: 0,match_id,win,rad,PlayerNumber,PlayerAccountID
0,1636204962,1,0,account_id_0,34549
1,1636204962,0,1,account_id_0,0
2,1636322679,0,0,account_id_0,0
3,1636322679,1,1,account_id_0,-97530201
4,1637385965,1,0,account_id_0,0


In [20]:
dfPlayerRatings = pd.read_csv('../../../../GitHub_Data/Dota2Matches/player_ratings.csv')
dfPlayerRatings.head()

Unnamed: 0,account_id,total_wins,total_matches,trueskill_mu,trueskill_sigma
0,236579,14,24,27.868035,5.212361
1,-343,1,1,26.544163,8.065475
2,-1217,1,1,26.521103,8.114989
3,-1227,1,1,27.248025,8.092217
4,-1284,0,1,22.931016,8.092224


In [21]:
df = pd.merge(df,dfPlayerRatings,how='left', left_on='PlayerAccountID', right_on='account_id')
df.head()

Unnamed: 0,match_id,win,rad,PlayerNumber,PlayerAccountID,account_id,total_wins,total_matches,trueskill_mu,trueskill_sigma
0,1636204962,1,0,account_id_0,34549,34549,10,17,25.49682,5.27716
1,1636204962,0,1,account_id_0,0,0,1608398,3315071,25.0,8.333333
2,1636322679,0,0,account_id_0,0,0,1608398,3315071,25.0,8.333333
3,1636322679,1,1,account_id_0,-97530201,-97530201,1,1,26.880745,8.118755
4,1637385965,1,0,account_id_0,0,0,1608398,3315071,25.0,8.333333


In [22]:
# so the blank slot doesn't really mean anything. All those 0s are just empty spots where somebody dropped.
# We'll get rid of them.

df.drop(['account_id'], axis=1, inplace=True)
df = df[df['PlayerAccountID'] != 0]
df.head()

Unnamed: 0,match_id,win,rad,PlayerNumber,PlayerAccountID,total_wins,total_matches,trueskill_mu,trueskill_sigma
0,1636204962,1,0,account_id_0,34549,10,17,25.49682,5.27716
3,1636322679,1,1,account_id_0,-97530201,1,1,26.880745,8.118755
6,1637623870,1,0,account_id_0,-123447796,22,47,24.86715,3.543024
7,1637623870,0,1,account_id_0,-108454938,28,46,32.02879,3.659563
8,1637739731,0,0,account_id_0,320093,9,15,26.680175,5.869468


In [25]:
# The question becomes - how to create a 'team' from the individuals?
# For now, let's just add the trueskill_mu for each player and ignore the sigma
# why add and not average? mostly because this is just a test and i'm curious of the results

df.drop(['trueskill_sigma', 'PlayerNumber', 'PlayerAccountID', 'total_wins', 'total_matches'], axis=1, inplace=True)
df.head()

Unnamed: 0,match_id,win,rad,trueskill_mu
0,1636204962,1,0,25.49682
3,1636322679,1,1,26.880745
6,1637623870,1,0,24.86715
7,1637623870,0,1,32.02879
8,1637739731,0,0,26.680175


In [39]:
dfRad = df[df['rad'] == 1].copy()
dfDire = df[df['rad'] == 0].copy()

print(dfRad.head())
print(dfDire.head())

      match_id  win  rad  trueskill_mu
3   1636322679    1    1     26.880745
7   1637623870    0    1     32.028790
11  1637843670    0    1     29.610686
14  1638064585    0    1     22.376919
20  1638310272    0    1     21.488357
      match_id  win  rad  trueskill_mu
0   1636204962    1    0     25.496820
6   1637623870    1    0     24.867150
8   1637739731    0    0     26.680175
10  1637843670    1    0     24.473918
15  1638064585    1    0     23.776348


In [40]:
dfRad.rename(columns={'win':'Rad_Win'}, inplace = True)
dfRad.head()

Unnamed: 0,match_id,Rad_Win,rad,trueskill_mu
3,1636322679,1,1,26.880745
7,1637623870,0,1,32.02879
11,1637843670,0,1,29.610686
14,1638064585,0,1,22.376919
20,1638310272,0,1,21.488357


In [41]:
dfRad.rename(columns={'trueskill_mu': 'radiant_trueskill_mu'},inplace=True)
dfRad.drop(['rad'], axis = 1, inplace=True)

In [42]:
dfRad.head()

Unnamed: 0,match_id,Rad_Win,radiant_trueskill_mu
3,1636322679,1,26.880745
7,1637623870,0,32.02879
11,1637843670,0,29.610686
14,1638064585,0,22.376919
20,1638310272,0,21.488357


In [43]:
dfDire.rename(columns={'trueskill_mu': 'dire_trueskill_mu'},inplace=True)
dfDire.drop(['win','rad'], axis = 1, inplace=True)
dfDire.head()

Unnamed: 0,match_id,dire_trueskill_mu
0,1636204962,25.49682
6,1637623870,24.86715
8,1637739731,26.680175
10,1637843670,24.473918
15,1638064585,23.776348


In [58]:
dfPreparedData = pd.merge(dfRad,dfDire,how='inner', on='match_id')
dfPreparedData = pd.pivot_table(dfPreparedData, index=['match_id'], \
    aggfunc={'Rad_Win': np.max, 'radiant_trueskill_mu': np.sum, 'dire_trueskill_mu': np.sum})
dfPreparedData.head(10)

Unnamed: 0_level_0,Rad_Win,dire_trueskill_mu,radiant_trueskill_mu
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1636204962,0,228.219917,224.925992
1636322679,1,152.576754,175.683275
1637385965,0,57.211613,47.955511
1637623870,0,545.070286,482.454634
1637739731,1,400.457043,458.195754
1637843670,0,489.345735,482.74652
1637995977,1,286.503921,292.917945
1638064585,0,413.603454,342.660388
1638252636,0,201.927567,191.590289
1638310272,0,664.601491,580.206825


In [59]:
dfPreparedData.shape

(827280, 3)

In [None]:
# Good enough progress for the night. I want to add additional features to this dataframe in a second effort.