In [2]:
import ujson as json
from tqdm import tqdm_notebook
import os #to access files
from scipy.sparse import hstack
import time
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [4]:
PATH_TO_DATA = '../data/'

In [5]:
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

In [6]:
y = df_train_targets['radiant_win'].values

In [7]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [8]:
json_list = [] #store data that are read
number_of_rows = 50 #how many lines to read 

#reading data from .jsonl file
with open(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')) as fin:
    for i in range(number_of_rows):
        line = fin.readline()
        json_list.append(json.loads(line))
        
#how many matches to read. For example I took 1
for i in range(1, 2):
  for j in range(1, 2):#there is 5 players in each team. But I want to look on only one player.
    print(json.dumps(json_list[i]['players'][j], indent=4, sort_keys=True))

{
    "ability_upgrades":[
        {
            "ability":5514,
            "level":1,
            "time":232
        },
        {
            "ability":5515,
            "level":2,
            "time":304
        },
        {
            "ability":5515,
            "level":3,
            "time":322
        },
        {
            "ability":5514,
            "level":4,
            "time":391
        },
        {
            "ability":5515,
            "level":5,
            "time":499
        },
        {
            "ability":5517,
            "level":6,
            "time":581
        }
    ],
    "ability_uses":{
        "centaur_double_edge":11,
        "centaur_hoof_stomp":8,
        "centaur_stampede":2
    },
    "account_id_hash":"1c2500195e6ab646fc9371344724aa77",
    "actions":{
        "1":3679,
        "10":183,
        "11":8,
        "15":19,
        "16":18,
        "19":14,
        "2":2,
        "33":183,
        "4":810,
        "5":4,
        "6":42,
        "7":12,


}


In [9]:
for i in range(1, 5): #now we will look at 4 matches
  for j in range(1, 5):#and now will take 5 players
    print(json.dumps(list(map(lambda x: [x][0], json_list[i]['players'][j]['damage'])), indent=4, sort_keys=True))

[
    "npc_dota_creep_goodguys_ranged",
    "npc_dota_hero_antimage",
    "npc_dota_creep_goodguys_melee",
    "npc_dota_creep_badguys_melee",
    "npc_dota_creep_badguys_ranged",
    "npc_dota_hero_centaur",
    "npc_dota_neutral_wildkin",
    "npc_dota_neutral_satyr_hellcaller",
    "npc_dota_neutral_enraged_wildkin",
    "npc_dota_neutral_centaur_outrunner",
    "npc_dota_neutral_centaur_khan",
    "npc_dota_hero_enchantress",
    "npc_dota_badguys_siege",
    "npc_dota_hero_clinkz"
]
[
    "npc_dota_hero_clinkz",
    "npc_dota_hero_pudge",
    "npc_dota_creep_badguys_melee",
    "npc_dota_neutral_gnoll_assassin",
    "npc_dota_neutral_enraged_wildkin",
    "npc_dota_creep_goodguys_melee",
    "npc_dota_creep_goodguys_ranged",
    "npc_dota_hero_antimage",
    "npc_dota_hero_enchantress",
    "npc_dota_neutral_centaur_khan",
    "npc_dota_creep_badguys_ranged",
    "npc_dota_badguys_tower1_mid",
    "npc_dota_badguys_tower2_mid",
    "npc_dota_badguys_siege"
]
[
    "npc_dota_creep_

In [10]:
for i in range(1, 5): #now we will look at 4 matches
  for j in range(1, 5):#and now will take 5 players
    print(json.dumps(list(map(lambda x: x, json_list[i]['players'][j]['damage'])), indent=4, sort_keys=True))

TypeError: string indices must be integers

In [9]:
import collections


def extract_features_csv(match):
    
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]

    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player['hero_inventory'])) ) )
       # row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player[''])) ) )
        #here u can extract other data

    return collections.OrderedDict(row)

    
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [10]:
def create_features_from_jsonl(matches_file):
  
    df_new_features = []

    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']
        features = extract_features_csv(match)

        df_new_features.append(features)

    df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
    return df_new_features

In [11]:
train_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')).fillna(0)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [12]:
test_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')).fillna(0)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [13]:
import pickle as pkl

#Better to save extracted data in files, because extracting takes time...
train_df.to_pickle('df_train.pkl')
test_df.to_pickle('df_test.pkl')

In [14]:
def add_items_dummies(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        players = [f'{team}{i}' for i in range(1, 6)]
        item_columns = [f'{player}_items' for player in players]

        d = pd.get_dummies(full_df[item_columns[0]].apply(pd.Series).stack()).sum(level=0, axis=0)
        dindexes = d.index.values

        for c in item_columns[1:]:
            d = d.add(pd.get_dummies(full_df[c].apply(pd.Series).stack()).sum(level=0, axis=0), fill_value=0)
            d = d.ix[dindexes]

        full_df = pd.concat([full_df, d.add_prefix(f'{team}_item_')], axis=1, sort=False)
        full_df.drop(columns=item_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [15]:
def drop_consumble_items(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        consumble_columns = ['tango', 'tpscroll', 
                             'bottle', 'flask',
                            'enchanted_mango', 'clarity',
                            'faerie_fire', 'ward_observer',
                            'ward_sentry']
        
        starts_with = f'{team}_item_'
        consumble_columns = [starts_with + column for column in consumble_columns]
        full_df.drop(columns=consumble_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [16]:
%%time
new_train = pd.read_pickle('df_train.pkl')
new_test = pd.read_pickle('df_test.pkl')

new_train, new_test = add_items_dummies(new_train, new_test)
new_train, new_test = drop_consumble_items(new_train, new_test)

target = pd.DataFrame(y)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


Wall time: 1min 31s


In [19]:
new_test.head()

Unnamed: 0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_urn_of_shadows,d_item_vanguard,d_item_veil_of_discord,d_item_vitality_booster,d_item_vladmir,d_item_void_stone,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha
30cc2d778dca82f2edb568ce9b585caa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
70e5ba30f367cea48793b9003fab9d38,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4d9ef74d3a2025d79e9423105fd73d41,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2bb79e0c1eaac1608e5a09c8e0c6a555,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bec17f099b01d67edc82dfb5ce735a43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0


In [20]:
new_test['match_id_hash']=new_test.index

In [21]:
new_test.head()

Unnamed: 0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_vanguard,d_item_veil_of_discord,d_item_vitality_booster,d_item_vladmir,d_item_void_stone,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha,match_id_hash
30cc2d778dca82f2edb568ce9b585caa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,30cc2d778dca82f2edb568ce9b585caa
70e5ba30f367cea48793b9003fab9d38,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,70e5ba30f367cea48793b9003fab9d38
4d9ef74d3a2025d79e9423105fd73d41,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4d9ef74d3a2025d79e9423105fd73d41
2bb79e0c1eaac1608e5a09c8e0c6a555,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2bb79e0c1eaac1608e5a09c8e0c6a555
bec17f099b01d67edc82dfb5ce735a43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,bec17f099b01d67edc82dfb5ce735a43


In [22]:
new_test = new_test.reset_index(drop=True)

In [23]:
new_test = new_test.set_index('match_id_hash')

In [24]:
#new_test.to_csv('test_items.csv')

In [25]:
new_train.head()

Unnamed: 0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_urn_of_shadows,d_item_vanguard,d_item_veil_of_discord,d_item_vitality_booster,d_item_vladmir,d_item_void_stone,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha
a400b8f29dece5f4d266f49f1ae2e98a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b9c57c450ce74a2af79c9ce96fac144d,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6db558535151ea18ca70a6892197db41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
46a0ddce8f7ed2a8d9bd5edcbb925682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
b1b35ff97723d9b7ade1c9c3cf48f770,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
new_train['match_id_hash']=new_train.index

In [27]:
new_train = new_train.set_index('match_id_hash')

In [28]:
new_train.head()

Unnamed: 0_level_0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_urn_of_shadows,d_item_vanguard,d_item_veil_of_discord,d_item_vitality_booster,d_item_vladmir,d_item_void_stone,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b9c57c450ce74a2af79c9ce96fac144d,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6db558535151ea18ca70a6892197db41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
46a0ddce8f7ed2a8d9bd5edcbb925682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
b1b35ff97723d9b7ade1c9c3cf48f770,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#new_train.to_csv('train_items.csv')

In [30]:
def add_new_features(df_features, matches_file):
    
    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']

        # Counting ruined towers for both teams
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match['objectives']:
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
                if objective['team'] == 2:
                    radiant_tower_kills += 1
                if objective['team'] == 3:
                    dire_tower_kills += 1

        # Write new features
        df_features.loc[match_id_hash, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id_hash, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id_hash, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills
        
        # Total damage
        total_damage = 0
        for i in range(1, 6):
            for j in match['players'][i-1]['damage']:
                # Take damage only to hero(not for creeps)
                if j.startswith('npc_dota_hero'):
                    total_damage += match['players'][i-1]['damage'][j]
        df_features.loc[match_id_hash, 'r_damage'] = total_damage
        total_damage = 0
        for i in range(6, 11):
            for j in match['players'][i-1]['damage']:
                if j.startswith('npc_dota_hero'):
                    total_damage += match['players'][i-1]['damage'][j]
        df_features.loc[match_id_hash, 'd_damage'] = total_damage

        df_features.loc[match_id_hash, 'diff_damage'] = df_features.loc[match_id_hash, 'r_damage'] - df_features.loc[match_id_hash, 'd_damage'] 

In [35]:
%%time
# copy the dataframe with features
df_train_features_extended = new_train.copy()

# add new features
add_new_features(df_train_features_extended, os.path.join(PATH_TO_DATA, 'train_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))

Wall time: 1min 13s


In [36]:
df_train_features_extended.head()

Unnamed: 0_level_0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha,radiant_tower_kills,dire_tower_kills,diff_tower_kills,r_damage,d_damage,diff_damage
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,948.0,2949.0,-2001.0
b9c57c450ce74a2af79c9ce96fac144d,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,2.0,0.0,2.0,17463.0,13063.0,4400.0
6db558535151ea18ca70a6892197db41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,70.0,68.0,2.0
46a0ddce8f7ed2a8d9bd5edcbb925682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15542.0,8337.0,7205.0
b1b35ff97723d9b7ade1c9c3cf48f770,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5940.0,11951.0,-6011.0


In [37]:
df_test_features_extended = new_test.copy()

# add new features
add_new_features(df_test_features_extended, os.path.join(PATH_TO_DATA, 'test_matches.jsonl'))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [38]:
df_test_features_extended.head()

Unnamed: 0_level_0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha,radiant_tower_kills,dire_tower_kills,diff_tower_kills,r_damage,d_damage,diff_damage
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30cc2d778dca82f2edb568ce9b585caa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,719.0,281.0,438.0
70e5ba30f367cea48793b9003fab9d38,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,7.0,2.0,5.0,39019.0,34237.0,4782.0
4d9ef74d3a2025d79e9423105fd73d41,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,1.0,3.0,36831.0,27318.0,9513.0
2bb79e0c1eaac1608e5a09c8e0c6a555,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15281.0,16531.0,-1250.0
bec17f099b01d67edc82dfb5ce735a43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,2.0,4.0,-2.0,46854.0,57564.0,-10710.0


In [39]:
df_train_features_extended.shape, df_test_features_extended.shape

((39675, 390), (10000, 390))

In [41]:
new_features = pd.concat([df_train_features_extended, df_test_features_extended])

In [42]:
new_features.shape

(49675, 390)

In [44]:
new_features.to_csv('new_features.csv')