# Data Transformation

## Basic imports

In [24]:
import numpy as np
import pandas as pd

import ast

## Load Data

In [14]:
# Location of the raw TI9 export, relative to the notebook's working directory.
ti9_path = "../ti9_full.csv"
ti = pd.read_csv(ti9_path)
print(f"Shape of dataframe: {ti.shape}")
# Show 5 random entries. The sample is seeded so that "Restart & Run All"
# displays the same rows every time instead of a different draw per run.
ti.sample(5, random_state=0)

Shape of dataframe: (1950, 17)


Unnamed: 0,match_id,radiant_score,dire_score,radiant_xp_adv,radiant_gold_adv,rad_team_id,rad_tag,dire_team_id,dire_team_id.1,dire_tag,hero_damage,hero_healing,obs_placed,kda,player_slot,region_x,region_y
1466,4983800597,22,29,"[0, 191, 43, 279, -68, -191, 358, -222, 624, 1...","[0, -269, -436, 313, 26, 115, -223, -451, 172,...",2586976,OG,15,15,PSG.LGD,42573,0,0.0,5,dire,Europe,China
996,4967996576,23,35,"[0, 172, -85, -525, -836, -517, -915, -384, -9...","[0, 16, -436, -872, -1553, -1426, -1637, -1380...",6214538,Newbee,1838315,1838315,Secret,29819,5930,0.0,11,dire,North America,Europe
584,4973241356,52,27,"[0, 122, 191, 67, -394, 314, 792, 1544, 738, 1...","[0, 183, -67, 172, -94, 681, 1266, 2192, 1487,...",726228,VG,2586976,2586976,OG,7611,9527,13.0,3,radiant,China,Europe
84,4971031352,14,28,"[0, 19, 373, -248, -547, -817, -996, -1275, -1...","[0, -450, -159, -349, -591, -911, -1221, -1941...",2672298,Infamous,39,39,EG,23224,0,1.0,1,radiant,South America,North America
756,4971219223,24,12,"[0, -277, -256, -175, -642, -410, 277, 107, -3...","[0, -41, 24, -41, -335, -378, -560, -357, -672...",2163,Liquid,111474,111474,Alliance,5803,0,0.0,1,dire,Europe,Europe


In [13]:
# Distinct match identifiers found in the `match_id` column of the loaded frame.
match_ids = ti["match_id"].unique()
f"There are {len(match_ids)} matches in total."

'There are 195 matches in total.'

## Define Performance Score

Based on Rongzhi's slides.

I have split each calculation into its own function to make the code more readable.

In [17]:
def _score_gap(match_id, df):
    # NOTE: This is not normalized (we will probably normalize the whole dataset later so it is okay).
    score = df[df.match_id==match_id][['radiant_score', 'dire_score']].values
    return score[0] - score[1]  # will be negative if radiant lost. if they won, it will be positive

In [26]:
def _xp_gap(match_id, df, use_weights=True):
    # Take the sum of the xp advantages (over all minutes)
    # NOTE: This is probably not what Rongzhi mentioned in his slides but I couldn't understand what "10minutes xp advantage" meant
    xp_list = np.array(ast.literal_eval(pd.unique(df[df.match_id==match_id].radiant_xp_adv)[0]))
    # A negative sum would mean that the radiant team was on a disantage most of the times
    if use_weights:
        # IDEA: the last few minutes are more important and so we should give a greater weight to these
        weights = np.linspace(0.1, 1, num=len(xp_list))  # start from 0.1 in order to avoid zeros
        xp_list_weighted = weights * xp_list
        return np.sum(xp_list_weighted)
    return np.sum(xp_list)

In [28]:
def _gold_advantage(match_id, df, use_weights=True):  # Maybe we could use only the final gold advantage as Rongzhi said
    # Probably gold is more important than xp (especially in lengthy matches)
    gold_list = np.array(ast.literal_eval(pd.unique(df[df.match_id==match_id].radiant_gold_adv)[0]))
    # A negative sum would mean that the radiant team had less gold for most of the game
    if use_weights:
        weights = np.linspace(0.1, 1, num=len(gold_list))  # start from 0.1 in order to avoid zeros
        gold_list_weighted = weights * gold_list
        return np.sum(gold_list_weighted)
    return np.sum(xp_list)

In [None]:
# Idea: Hero damage and healing is specific to the type of player. For example if a player is a support then it is more
#       probable for them to heal than to have high damage. Maybe there is a way to take advantage of that?
def _hero_specific_scores(match_id, df):  # hero damage and hero healing
    match = df[df.match_id==match_id]
    radiant_stats = match[match.player_slot=='radiant']  # for each player (length of 5)
    dire_stats = match[match.player_slot=='dire']  # for each player (length of 5)
    assert radiant_stats.shape == dire_stats.shape, f"Match {match_id} consisted of uneven teams."  # in case of bad data
    radiant = {
        "damage": radiant_stats['hero_damage'].values, 
        "healing": radiant_stats['hero_healing'].values,
        "kda": radiant_stats['kda'].values,
        "wards": radiant_stats['obs_placed'].values,
    }  # will contain the damage and healing scores for the radiant players
    dire = {
        "damage": dire_stats['hero_damage'].values, 
        "healing": dire_stats['hero_healing'].values,
        "kda": dire_stats['kda'].values,
        "wards": dire_stats['obs_placed'].values,
    }
#     # Case 1: get the max hero damage and max hero healing
#     radiant_damage = np.max(radiant['damage'])
#     dire_damage = np.max(dire['damage'])
#     radiant_healing = np.max(radiant['healing'])
#     dire_healing = np.max(dire['healing'])
#     radiant_kda = np.max(radiant['kda'])
#     dire_kda = np.max(dire['kda'])
#     radiant_wards = np.max(radiant['wards'])
#     dire_wards = np.max(dire['wards'])
#     return (radiant_damage, radiant_healing, radiant_kda, radiant_wards,
#             dire_damage, dire_healing, dire_kda, dire_wards)
#     # Case 2: get the sum of hero damages and max hero healings (probably better than the above)
#     radiant_damage_score = np.sum(radiant['damage'])
#     dire_damage_score = np.sum(dire['damage'])
#     radiant_healing_score = np.sum(radiant['healing'])
#     dire_healing_score = np.sum(dire['healing'])
#     radiant_kda_score = np.sum(radiant['kda'])
#     dire_kda_score = np.sum(dire['kda'])
#     radiant_wards = np.sum(radiant['wards'])
#     dire_wards = np.sum(dire['wards'])
#     return (radiant_damage, radiant_healing, radiant_kda, radiant_wards,
#             dire_damage, dire_healing, dire_kda, dire_wards)
    # Case 3: Use the diffence of the sums for each team (maybe that's the best one)
    dmg_diff = np.sum(radiant['damage']) - np.sum(dire['damage'])
    heal_diff = np.sum(radiant['healing']) - np.sum(dire['healing'])
    kda_diff = np.sum(radiant['kda']) - np.sum(dire['kda'])
    wards_diff = np.sum(radiant['wards']) - np.sum(dire['wards'])
    return dmg_diff, heal_diff, kda_diff, wards_diff

In [None]:
def performance(match_id, df):
    """Collect every performance component of one match into a single tuple.

    Parameters
    ----------
    match_id : value compared against ``df.match_id``
    df : pandas.DataFrame
        Frame passed unchanged to each component function.

    Returns
    -------
    tuple
        ``(score_gap, xp_gap, gold_adv, dmg_diff, heal_diff, kda_diff,
        wards_diff)``. The XP and gold components use the weighted variants.
    """
    score_gap = _score_gap(match_id, df)
    xp_gap = _xp_gap(match_id, df, use_weights=True)
    gold_adv = _gold_advantage(match_id, df, use_weights=True)
    # Case 3 of the hero statistics: radiant-minus-dire differences of the team sums.
    # The helper is named `_hero_specific_scores` (plural); calling the singular
    # spelling raised NameError.
    dmg_diff, heal_diff, kda_diff, wards_diff = _hero_specific_scores(match_id, df)
    return score_gap, xp_gap, gold_adv, dmg_diff, heal_diff, kda_diff, wards_diff

### Create a DF for each match with their performance scores

I use case 3 from the `_hero_specific_scores` function.

In [32]:
# Dictionary that will contain the values
cols = ["match_id", "score_gap", "xp_gap", "gold_adv", "dmg_diff", "heal_diff", "kda_diff", "wards_diff"]
# Give every column its OWN list. `dict.fromkeys(cols, [])` would bind all keys
# to one shared list object, so an append to any column would show up in all of them.
matches = {col: [] for col in cols}
matches

{'match_id': [],
 'score_gap': [],
 'xp_gap': [],
 'gold_adv': [],
 'dmg_diff': [],
 'heal_diff': [],
 'kda_diff': [],
 'wards_diff': []}

In [None]:
# Start from empty, independent lists so that re-running this cell does not
# append every match a second time (and so that no two columns share a list).
matches = {col: [] for col in cols}
# Every column except 'match_id', in the order performance() returns its values.
stat_cols = [col for col in cols if col != "match_id"]
for match_id in match_ids:
    # A tuple of length len(cols)-1  (does not have the 'match_id')
    perf = performance(match_id, ti)
    matches["match_id"].append(match_id)
    for col, value in zip(stat_cols, perf):
        matches[col].append(value)