# Data Transformation

## Basic imports

In [1]:
import numpy as np
import pandas as pd

import ast

## Load Data

In [2]:
# Load the raw TI9 table from the CSV that sits one directory above the notebook
ti9_path = "../ti9_full.csv"
ti = pd.read_csv(ti9_path)
print(f"Shape of dataframe: {ti.shape}")
# Show 5 random entries; the fixed seed keeps the preview identical on every Restart & Run All
ti.sample(5, random_state=0)

Shape of dataframe: (1950, 17)


Unnamed: 0,match_id,radiant_score,dire_score,radiant_xp_adv,radiant_gold_adv,rad_team_id,rad_tag,dire_team_id,dire_team_id.1,dire_tag,hero_damage,hero_healing,obs_placed,kda,player_slot,region_x,region_y
395,4969752692,25,30,"[0, -161, 180, 374, 321, 289, 875, 1243, 1805,...","[0, 300, 423, 654, 864, 830, 911, 1311, 1491, ...",726228,VG,39,39,EG,43372,0,0.0,5,dire,China,North America
513,4977207666,21,24,"[0, 12, -336, -183, -83, -232, -519, 191, -462...","[0, -440, -589, -673, -634, -627, -797, -104, ...",6209804,RNG,111474,111474,Alliance,11514,5855,0.0,2,radiant,China,Europe
836,4978587076,29,29,"[0, 322, 88, 443, 830, 1328, -166, -84, 536, 5...","[0, 260, 300, 755, 1208, 1447, 1095, 698, 1089...",39,EG,1838315,1838315,Secret,11958,0,4.0,3,dire,North America,Europe
1403,4970976863,8,30,"[0, -97, -129, 49, 92, 178, 423, 490, 874, 57,...","[0, -136, -319, -185, 131, 454, 381, 780, -15,...",7203342,CHAOS,2626685,2626685,KG,13274,0,0.0,0,radiant,Europe,China
541,4973348952,31,27,"[0, 348, -364, -672, -793, -166, -763, -371, -...","[0, 44, -271, -1000, -1085, -54, -1038, -812, ...",2586976,OG,726228,726228,VG,53052,0,0.0,3,radiant,Europe,China


In [3]:
# Check for nan values
# Check for NaN values: compare the shape after dropping every row that contains a NaN with the original shape
ti.dropna().shape, ti.shape

((1940, 17), (1950, 17))

It seems that there are some NaN values. Exactly 10 rows are affected, which is the number of rows a single match contributes (one per player), so I guess they all belong to one match. I will drop them.

In [4]:
# Keep only the rows without any NaN value (this rebinds `ti`, so the unfiltered frame is no longer available)
ti = ti.dropna()

Total matches now:

In [5]:
# Distinct match ids that remain in `ti`
match_ids = ti.match_id.unique()
f"There are {len(match_ids)} matches in total."

'There are 194 matches in total.'

## Define Performance Score

Based on Rongzhi's slides.

I have put each calculation in its own function to make the code more readable.

In [6]:
def _score_gap(match_id, df):
    # NOTE: This is not normalized (we will probably normalize the whole dataset later so it is okay).
    score = df[df.match_id==match_id][['radiant_score', 'dire_score']].values[0]
    return score[0] - score[1]  # will be negative if radiant lost. if they won, it will be positive

In [7]:
def _xp_gap(match_id, df, use_weights=True):
    # Take the sum of the xp advantages (over all minutes)
    # NOTE: This is probably not what Rongzhi mentioned in his slides but I couldn't understand what "10minutes xp advantage" meant
    xp_list = np.array(ast.literal_eval(pd.unique(df[df.match_id==match_id].radiant_xp_adv)[0]))
    # A negative sum would mean that the radiant team was on a disantage most of the times
    if use_weights:
        # IDEA: the last few minutes are more important and so we should give a greater weight to these
        weights = np.linspace(0.1, 1, num=len(xp_list))  # start from 0.1 in order to avoid zeros
        xp_list_weighted = weights * xp_list
        return np.sum(xp_list_weighted)
    return np.sum(xp_list)

In [8]:
def _gold_advantage(match_id, df, use_weights=True):  # Maybe we could use only the final gold advantage as Rongzhi said
    # Probably gold is more important than xp (especially in lengthy matches)
    gold_list = np.array(ast.literal_eval(pd.unique(df[df.match_id==match_id].radiant_gold_adv)[0]))
    # A negative sum would mean that the radiant team had less gold for most of the game
    if use_weights:
        weights = np.linspace(0.1, 1, num=len(gold_list))  # start from 0.1 in order to avoid zeros
        gold_list_weighted = weights * gold_list
        return np.sum(gold_list_weighted)
    return np.sum(xp_list)

In [9]:
# Idea: Hero damage and healing are specific to the type of player. For example, if a player is a support, they are
#       more likely to heal than to deal high damage. Maybe there is a way to take advantage of that?
def _hero_specific_scores(match_id, df):  # hero damage and hero healing
    match = df[df.match_id==match_id]
    radiant_stats = match[match.player_slot=='radiant']  # for each player (length of 5)
    dire_stats = match[match.player_slot=='dire']  # for each player (length of 5)
    assert radiant_stats.shape == dire_stats.shape, f"Match {match_id} consisted of uneven teams."  # in case of bad data
    radiant = {
        "damage": radiant_stats['hero_damage'].values, 
        "healing": radiant_stats['hero_healing'].values,
        "kda": radiant_stats['kda'].values,
        "wards": radiant_stats['obs_placed'].values,
    }  # will contain the damage and healing scores for the radiant players
    dire = {
        "damage": dire_stats['hero_damage'].values, 
        "healing": dire_stats['hero_healing'].values,
        "kda": dire_stats['kda'].values,
        "wards": dire_stats['obs_placed'].values,
    }
#     # Case 1: get the max hero damage and max hero healing
#     radiant_damage = np.max(radiant['damage'])
#     dire_damage = np.max(dire['damage'])
#     radiant_healing = np.max(radiant['healing'])
#     dire_healing = np.max(dire['healing'])
#     radiant_kda = np.max(radiant['kda'])
#     dire_kda = np.max(dire['kda'])
#     radiant_wards = np.max(radiant['wards'])
#     dire_wards = np.max(dire['wards'])
#     return (radiant_damage, radiant_healing, radiant_kda, radiant_wards,
#             dire_damage, dire_healing, dire_kda, dire_wards)
#     # Case 2: get the sum of hero damages and max hero healings (probably better than the above)
#     radiant_damage_score = np.sum(radiant['damage'])
#     dire_damage_score = np.sum(dire['damage'])
#     radiant_healing_score = np.sum(radiant['healing'])
#     dire_healing_score = np.sum(dire['healing'])
#     radiant_kda_score = np.sum(radiant['kda'])
#     dire_kda_score = np.sum(dire['kda'])
#     radiant_wards = np.sum(radiant['wards'])
#     dire_wards = np.sum(dire['wards'])
#     return (radiant_damage, radiant_healing, radiant_kda, radiant_wards,
#             dire_damage, dire_healing, dire_kda, dire_wards)
    # Case 3: Use the diffence of the sums for each team (maybe that's the best one)
    dmg_diff = np.sum(radiant['damage']) - np.sum(dire['damage'])
    heal_diff = np.sum(radiant['healing']) - np.sum(dire['healing'])
    kda_diff = np.sum(radiant['kda']) - np.sum(dire['kda'])
    wards_diff = np.sum(radiant['wards']) - np.sum(dire['wards'])
    return dmg_diff, heal_diff, kda_diff, wards_diff

In [10]:
def performance(match_id, df):
    """Collect all performance values of one match into a single flat tuple.

    Returns
    -------
    tuple of (score_gap, xp_gap, gold_adv, dmg_diff, heal_diff, kda_diff, wards_diff);
    the xp and gold values are computed with `use_weights=True`.
    """
    team_level = (
        _score_gap(match_id, df),
        _xp_gap(match_id, df, use_weights=True),
        _gold_advantage(match_id, df, use_weights=True),
    )
    # The helper returns the four "difference of sums" values (its case 3); the per-team
    # variants (cases 1 and 2) would yield eight values instead.
    player_level = _hero_specific_scores(match_id, df)
    return team_level + tuple(player_level)

In [11]:
# Example
# Example: the seven performance values of a single match
performance(4967996576, ti)

(-12, 95320.21666666667, 215900.69999999998, 2761, 5602, -13, 1.0)

### Create a DF for each match with their performance scores

I use case 3 from the `_hero_specific_scores` function.

In [12]:
from collections import defaultdict

In [13]:
# Dictionary that will contain the values
# Output column names, and the dictionary that will map each column name to the list of its values
# (defaultdict(list) creates the empty list the first time a key is accessed)
cols = ["match_id", "score_gap", "xp_gap", "gold_adv", "dmg_diff", "heal_diff", "kda_diff", "wards_diff"]
matches = defaultdict(list)
matches

defaultdict(list, {})

In [14]:
# Every column except 'match_id' is filled from the performance() tuple, in the same order
score_cols = [col for col in cols if col != "match_id"]
for match_id in match_ids:
    # A tuple of length len(cols)-1 (it does not contain the 'match_id')
    perf = performance(match_id, ti)
    # zip() would silently truncate on a length mismatch, so check it explicitly
    assert len(perf) == len(score_cols), "performance() output and cols are out of sync"
    matches['match_id'].append(match_id)
    for col, value in zip(score_cols, perf):
        matches[col].append(value)

In [15]:
# Turn the column -> values mapping into a DataFrame (this rebinds `matches` from a dict to a DataFrame)
matches = pd.DataFrame(matches)
matches.head()

Unnamed: 0,match_id,score_gap,xp_gap,gold_adv,dmg_diff,heal_diff,kda_diff,wards_diff
0,4978701632,-12,86164.553659,214749.407317,-6485,-404,-18,-1.0
1,4969568760,-6,-111669.4,-83131.978261,13043,-7342,-15,-1.0
2,4968466397,-18,-298842.237143,-216912.745714,-29772,985,-43,0.0
3,4967721543,-27,-195171.069231,-256060.138462,-47182,516,-22,1.0
4,4977018128,-4,-206268.45082,-111569.085246,-9859,-1526,-8,-1.0


In [16]:
# Summary statistics of the scores before scaling
matches.describe()

Unnamed: 0,match_id,score_gap,xp_gap,gold_adv,dmg_diff,heal_diff,kda_diff,wards_diff
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,4972879000.0,-1.036082,-24099.196501,-6378.724897,-679.252577,200.912371,-1.118557,0.185567
std,5204528.0,15.894853,191353.141124,187021.031575,26164.30435,8215.778786,30.001405,1.16361
min,4967601000.0,-32.0,-669004.773684,-809297.026316,-69312.0,-48491.0,-66.0,-3.0
25%,4969255000.0,-14.75,-155498.507659,-112926.633811,-20182.75,-3495.5,-23.0,-1.0
50%,4971152000.0,-3.0,-30954.473913,-6982.11375,-3049.0,157.0,-3.5,0.0
75%,4975930000.0,12.0,95263.29125,102489.604254,18507.5,4006.0,20.0,1.0
max,4986462000.0,33.0,466583.426415,556926.667308,67926.0,26370.0,71.0,3.0


## Scale scores

In [17]:
from sklearn.preprocessing import normalize

In [18]:
# Scale each score column to unit L2 norm (axis=0 normalizes per column, i.e. per feature).
# The columns are selected by name so the result does not depend on 'match_id' being the first column.
score_columns = matches.columns.drop('match_id')
matches_norm = pd.DataFrame(
    normalize(matches[score_columns], axis=0, norm='l2'),
    columns=score_columns,
    index=matches.index,  # same index as `matches`, so the match_id assignment below aligns row by row
)
# Total score of a match = sum of its normalized scores (vectorized row-wise sum)
matches_norm['total_score'] = matches_norm.sum(axis=1)
matches_norm['match_id'] = matches['match_id']
# Fixed seed so the preview is identical on every Restart & Run All
matches_norm.sample(10, random_state=0)

Unnamed: 0,score_gap,xp_gap,gold_adv,dmg_diff,heal_diff,kda_diff,wards_diff,total_score,match_id
173,0.090379,0.068225,0.031247,0.077033,0.001305,0.112687,0.122169,0.503047,4969495247
67,-0.004519,-0.018216,0.014967,-0.001474,0.079994,-0.007193,-0.061085,0.002474,4973234250
186,-0.072304,-0.051172,-0.018723,-0.049176,0.008268,-0.086314,0.0,-0.26942,4968280282
182,-0.090379,-0.070124,-0.069307,-0.136322,0.010966,-0.071928,0.061085,-0.36601,4973059153
149,0.090379,0.024016,0.06892,0.120396,-2.6e-05,0.05994,0.0,0.363624,4971714902
193,-0.018076,-0.033345,-0.041736,-0.013283,-0.113654,-0.033566,0.0,-0.25366,4967956396
151,0.0,0.078729,0.027921,-0.000608,-0.038258,-0.002398,0.061085,0.126471,4969402562
76,0.04519,0.029999,0.111364,0.044476,-0.141734,0.014386,-0.061085,0.042596,4976922218
34,-0.08586,-0.0871,-0.056116,-0.010522,-0.130251,-0.095904,-0.061085,-0.526838,4978435281
88,-0.063266,-0.017557,0.029785,-0.05521,-0.030025,-0.035964,0.061085,-0.111153,4971343154


In [19]:
# Summary statistics of the normalized scores
matches_norm.describe()

Unnamed: 0,score_gap,xp_gap,gold_adv,dmg_diff,heal_diff,kda_diff,wards_diff,total_score,match_id
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,-0.004682,-0.008994,-0.002454,-0.001868,0.00176,-0.002682,0.011335,-0.007585,4972879000.0
std,0.071828,0.071415,0.07194,0.071957,0.07196,0.071931,0.071079,0.371526,5204528.0
min,-0.144607,-0.249678,-0.311304,-0.190622,-0.424721,-0.158242,-0.183254,-0.810802,4967601000.0
25%,-0.066655,-0.058033,-0.043438,-0.055507,-0.030616,-0.055145,-0.061085,-0.322307,4969255000.0
50%,-0.013557,-0.011552,-0.002686,-0.008385,0.001375,-0.008392,0.0,-0.036887,4971152000.0
75%,0.054228,0.035553,0.039424,0.050899,0.035088,0.047952,0.061085,0.306414,4975930000.0
max,0.149126,0.174133,0.214227,0.18681,0.230968,0.17023,0.183254,0.7621,4986462000.0


## Save Data

In [20]:
# Write both tables to CSV files in the current directory, without the index column
matches.to_csv("./matches.csv", index=False)
matches_norm.to_csv("./matches_normalized.csv", index=False)