# EDA round 1
- Get a feel for the match data through descriptive statistics and visualizations

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
# Files located: C:\Users\zange\OneDrive\Documents\GitHub_Data\Dota2Matches
print(os.listdir('../../../GitHub_Data/Dota2Matches/'))

['ability_ids.csv', 'ability_upgrades.csv', 'chat.csv', 'cluster_regions.csv', 'hero_names.csv', 'item_ids.csv', 'match.csv', 'match_outcomes.csv', 'objectives.csv', 'patch_dates.csv', 'players.csv', 'player_ratings.csv', 'player_time.csv', 'purchase_log.csv', 'teamfights.csv', 'teamfights_players.csv', 'test_labels.csv', 'test_player.csv', 'yasp_sample.json']


In [5]:
# Let's start with match_outcomes
df = pd.read_csv('../../../GitHub_Data/Dota2Matches/match_outcomes.csv')

In [6]:
df.head()

Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad
0,1636204962,34549,0,0,-51743434,-120875154,1437014585,12,1,0
1,1636204962,0,61598,138825,0,207232,1437014585,12,0,1
2,1636322679,0,-44943233,-240360907,19599,0,1437019968,12,0,0
3,1636322679,-97530201,0,0,0,-116349387,1437019968,12,1,1
4,1637385965,0,0,0,104738,0,1437052551,12,1,0


In [7]:
# So the 'account id' belongs to individual players, 5 per side. The match will not always have all 5 players.
# From the data docs, it looks like match_outcomes.csv should be 900k+ rows. It was used to help calculate players' trueskill,
# which is an algorithm developed by Microsoft (https://www.microsoft.com/en-us/research/project/trueskill-ranking-system/) for matchmaking.
df.describe()

Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad
count,1828588.0,1828588.0,1828588.0,1828588.0,1828588.0,1828588.0,1828588.0,1828588.0,1828588.0,1828588.0
mean,1820957000.0,-31042590.0,-31340120.0,-31334540.0,-31372900.0,-31714020.0,1443280000.0,12.73469,0.4999962,0.5
std,68199590.0,62329880.0,62142500.0,62035050.0,62178720.0,62473500.0,2375735.0,0.5869132,0.5000001,0.5
min,1636205000.0,-299194000.0,-299194000.0,-298658400.0,-298722900.0,-298584900.0,1437015000.0,12.0,0.0,0.0
25%,1794028000.0,-9770656.0,-20001270.0,-21175140.0,-20481390.0,-24419740.0,1442253000.0,12.0,0.0,0.0
50%,1830662000.0,0.0,0.0,0.0,0.0,0.0,1443583000.0,13.0,0.0,0.5
75%,1868779000.0,56417.0,54793.0,55803.0,55239.25,52339.0,1444931000.0,13.0,1.0,1.0
max,1930335000.0,330511.0,330513.0,330512.0,330511.0,330510.0,1447312000.0,14.0,1.0,1.0


In [14]:
# Looks correct. 1.8 millionish rows, with 2 rows for each match (one radiant, one dire). But there are a ton of zeros.
# How many of these rows do not have any 0s in account ids? Let's find out.
df['FiveMemberTeam'] = df['account_id_0']*df['account_id_1']*df['account_id_2']*df['account_id_3']*df['account_id_4'] # Any zeros and this will be zero
df['IsFiveMemberTeam'] = df['FiveMemberTeam'].apply(lambda x: 1 if x != 0 else 0) # Probably a one-step way to do these two lines but this works
df.drop('FiveMemberTeam',inplace=True,axis=1) # Don't need the calculation anymore
df.head()

Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad,IsFiveMemberTeam
0,1636204962,34549,0,0,-51743434,-120875154,1437014585,12,1,0,0
1,1636204962,0,61598,138825,0,207232,1437014585,12,0,1,0
2,1636322679,0,-44943233,-240360907,19599,0,1437019968,12,0,0,0
3,1636322679,-97530201,0,0,0,-116349387,1437019968,12,1,1,0
4,1637385965,0,0,0,104738,0,1437052551,12,1,0,0


In [15]:
dfFiveMemberTeams = df.query('IsFiveMemberTeam != 0')
dfFiveMemberTeams.head()

Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad,IsFiveMemberTeam
6,1637623870,-123447796,68408,-100048908,-16784805,320715,1437058007,12,1,0,1
10,1637843670,-142035,-63684257,72554,213175,-45490226,1437063709,12,1,0,1
20,1638310272,-192964935,-52821457,-35062517,-125451710,-86836166,1437080046,12,0,1,1
21,1638310272,-73473909,41655,-112770436,50800,50799,1437080046,12,1,0,1
28,1638542275,245558,8011,237755,-125264109,-118223194,1437093590,12,1,0,1


In [32]:
print("Rows with fewer than 5 players on a team: ",int((df.shape[0]-dfFiveMemberTeams.shape[0])))
print("Percent rows with vewer than 5 players on a team:", (df.shape[0]-dfFiveMemberTeams.shape[0])/df.shape[0])
dfFiveMemberTeams.describe()

Rows with fewer than 5 players on a team:  1442554
Percent rows with vewer than 5 players on a team: 0.7888895694382769


Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad,IsFiveMemberTeam
count,386034.0,386034.0,386034.0,386034.0,386034.0,386034.0,386034.0,386034.0,386034.0,386034.0,386034.0
mean,1819611000.0,-38268920.0,-39262350.0,-38733650.0,-39222220.0,-40847660.0,1443234000.0,12.723076,0.519247,0.492894,1.0
std,69094790.0,64823400.0,64448620.0,64127260.0,64457050.0,65485670.0,2407179.0,0.593747,0.49963,0.49995,0.0
min,1637624000.0,-299194000.0,-299194000.0,-298307800.0,-298584900.0,-297728600.0,1437058000.0,12.0,0.0,0.0,1.0
25%,1789662000.0,-71407070.0,-76723460.0,-74183820.0,-76439340.0,-81563880.0,1442113000.0,12.0,0.0,0.0,1.0
50%,1829638000.0,31099.5,28582.0,28786.0,28877.0,26936.0,1443539000.0,13.0,1.0,0.0,1.0
75%,1869158000.0,126270.8,124977.0,123964.0,124943.8,123320.8,1444949000.0,13.0,1.0,1.0,1.0
max,1930334000.0,330511.0,330513.0,330510.0,330510.0,330510.0,1447311000.0,14.0,1.0,1.0,1.0


In [25]:
# Wow there are a ton of matches without 5 players on at least one of the teams.
# But it gets worse of course. These are just matches where one of the two teams has 5 players. What about 5v5?
# To do that, we need to trim dfFiveMemberTeams down to those with duplicate match_id
dfFiveVsFiveMatches = dfFiveMemberTeams[dfFiveMemberTeams.duplicated(['match_id'],False)]
dfFiveVsFiveMatches.head()

Unnamed: 0,match_id,account_id_0,account_id_1,account_id_2,account_id_3,account_id_4,start_time,parser_version,win,rad,IsFiveMemberTeam
20,1638310272,-192964935,-52821457,-35062517,-125451710,-86836166,1437080046,12,0,1,1
21,1638310272,-73473909,41655,-112770436,50800,50799,1437080046,12,1,0,1
86,1643354133,-55584447,65286,170795,59243,-150925818,1437258769,12,1,0,1
87,1643354133,-2844091,-80836184,-83755151,-100324587,-114283369,1437258769,12,0,1,1
102,1643773534,2859,11797,-117791787,-108452107,21215,1437279914,12,1,1,1


In [38]:
# Lost another big chunk of rows, but we're down to just the duplicated sets. 
print("Five v Five rows: ", dfFiveVsFiveMatches.shape[0])
print("Five either way rows: ", dfFiveMemberTeams.shape[0])
print("Total count of Five v Five matches: ", int(dfFiveVsFiveMatches.shape[0]/2))
print("Percentage of 5 vs. 5 matches against original set: ", np.round(((dfFiveVsFiveMatches.shape[0]/2)/(df.shape[0]/2))*100,2))

Five v Five rows:  126202
Five either way rows:  386034
Total count of Five v Five matches:  63101
Percentage of 5 vs. 5 matches against original set:  6.9


In [31]:
# 6.9? 6.9 percent? Y'all pathetic, leaving matches early. I bet most of those people who leave are on the losing side.
# Could find out by looking at the average number of players on winning teams vs. losing teams. For another time, I need to each lunch.