In [18]:
import ujson as json
from tqdm import tqdm_notebook
import os #to access files
from scipy.sparse import hstack
import time
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [19]:
PATH_TO_DATA = '../data/'

In [20]:
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

In [21]:
y = df_train_targets['radiant_win'].values

In [22]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [23]:
json_list = [] #store data that are read
number_of_rows = 50 #how many lines to read 

#reading data from .jsonl file
with open(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')) as fin:
    for i in range(number_of_rows):
        line = fin.readline()
        json_list.append(json.loads(line))
        
#how many matches to read. For example I took 1
for i in range(1, 2):
  for j in range(1, 2):#there is 5 players in each team. But I want to look on only one player.
    print(json.dumps(json_list[i]['players'][j], indent=4, sort_keys=True))

{
    "ability_upgrades":[
        {
            "ability":5514,
            "level":1,
            "time":232
        },
        {
            "ability":5515,
            "level":2,
            "time":304
        },
        {
            "ability":5515,
            "level":3,
            "time":322
        },
        {
            "ability":5514,
            "level":4,
            "time":391
        },
        {
            "ability":5515,
            "level":5,
            "time":499
        },
        {
            "ability":5517,
            "level":6,
            "time":581
        }
    ],
    "ability_uses":{
        "centaur_double_edge":11,
        "centaur_hoof_stomp":8,
        "centaur_stampede":2
    },
    "account_id_hash":"1c2500195e6ab646fc9371344724aa77",
    "actions":{
        "1":3679,
        "10":183,
        "11":8,
        "15":19,
        "16":18,
        "19":14,
        "2":2,
        "33":183,
        "4":810,
        "5":4,
        "6":42,
        "7":12,


}


In [24]:
for i in range(1, 5): #now we will look at 4 matches
  for j in range(1, 5):#and now will take 5 players
    print(json.dumps(list(map(lambda x: x, json_list[i]['players'][j])), indent=4, sort_keys=True))

[
    "player_slot",
    "hero_id",
    "hero_name",
    "account_id_hash",
    "ability_upgrades",
    "obs_placed",
    "sen_placed",
    "creeps_stacked",
    "camps_stacked",
    "rune_pickups",
    "firstblood_claimed",
    "teamfight_participation",
    "towers_killed",
    "roshans_killed",
    "observers_placed",
    "stuns",
    "max_hero_hit",
    "times",
    "gold_t",
    "lh_t",
    "dn_t",
    "xp_t",
    "obs_log",
    "sen_log",
    "obs_left_log",
    "sen_left_log",
    "purchase_log",
    "kills_log",
    "buyback_log",
    "runes_log",
    "obs",
    "sen",
    "actions",
    "pings",
    "purchase",
    "gold_reasons",
    "xp_reasons",
    "killed",
    "item_uses",
    "ability_uses",
    "hero_hits",
    "damage",
    "damage_taken",
    "damage_inflictor",
    "runes",
    "killed_by",
    "kill_streaks",
    "multi_kills",
    "life_state",
    "healing",
    "damage_inflictor_received",
    "randomed",
    "pred_vict",
    "gold",
    "lh",
    "xp",
    "x",

]
[
    "player_slot",
    "hero_id",
    "hero_name",
    "account_id_hash",
    "ability_upgrades",
    "obs_placed",
    "sen_placed",
    "creeps_stacked",
    "camps_stacked",
    "rune_pickups",
    "firstblood_claimed",
    "teamfight_participation",
    "towers_killed",
    "roshans_killed",
    "observers_placed",
    "stuns",
    "max_hero_hit",
    "times",
    "gold_t",
    "lh_t",
    "dn_t",
    "xp_t",
    "obs_log",
    "sen_log",
    "obs_left_log",
    "sen_left_log",
    "purchase_log",
    "kills_log",
    "buyback_log",
    "runes_log",
    "obs",
    "sen",
    "actions",
    "pings",
    "purchase",
    "gold_reasons",
    "xp_reasons",
    "killed",
    "item_uses",
    "ability_uses",
    "hero_hits",
    "damage",
    "damage_taken",
    "damage_inflictor",
    "runes",
    "killed_by",
    "kill_streaks",
    "multi_kills",
    "life_state",
    "healing",
    "damage_inflictor_received",
    "randomed",
    "pred_vict",
    "gold",
    "lh",
    "xp",
    "x

]
[
    "player_slot",
    "hero_id",
    "hero_name",
    "account_id_hash",
    "ability_upgrades",
    "obs_placed",
    "sen_placed",
    "creeps_stacked",
    "camps_stacked",
    "rune_pickups",
    "firstblood_claimed",
    "teamfight_participation",
    "towers_killed",
    "roshans_killed",
    "observers_placed",
    "stuns",
    "max_hero_hit",
    "times",
    "gold_t",
    "lh_t",
    "dn_t",
    "xp_t",
    "obs_log",
    "sen_log",
    "obs_left_log",
    "sen_left_log",
    "purchase_log",
    "kills_log",
    "buyback_log",
    "runes_log",
    "obs",
    "sen",
    "actions",
    "pings",
    "purchase",
    "gold_reasons",
    "xp_reasons",
    "killed",
    "item_uses",
    "ability_uses",
    "hero_hits",
    "damage",
    "damage_taken",
    "damage_inflictor",
    "runes",
    "killed_by",
    "kill_streaks",
    "multi_kills",
    "life_state",
    "healing",
    "damage_inflictor_received",
    "randomed",
    "pred_vict",
    "gold",
    "lh",
    "xp",
    "x

In [25]:
for i in range(1, 5): #now we will look at 4 matches
  for j in range(1, 5):#and now will take 5 players
    print(json.dumps(list(map(lambda x: x, json_list[i]['players'][j]['damage'])), indent=4, sort_keys=True))

[
    "npc_dota_creep_goodguys_ranged",
    "npc_dota_hero_antimage",
    "npc_dota_creep_goodguys_melee",
    "npc_dota_creep_badguys_melee",
    "npc_dota_creep_badguys_ranged",
    "npc_dota_hero_centaur",
    "npc_dota_neutral_wildkin",
    "npc_dota_neutral_satyr_hellcaller",
    "npc_dota_neutral_enraged_wildkin",
    "npc_dota_neutral_centaur_outrunner",
    "npc_dota_neutral_centaur_khan",
    "npc_dota_hero_enchantress",
    "npc_dota_badguys_siege",
    "npc_dota_hero_clinkz"
]
[
    "npc_dota_hero_clinkz",
    "npc_dota_hero_pudge",
    "npc_dota_creep_badguys_melee",
    "npc_dota_neutral_gnoll_assassin",
    "npc_dota_neutral_enraged_wildkin",
    "npc_dota_creep_goodguys_melee",
    "npc_dota_creep_goodguys_ranged",
    "npc_dota_hero_antimage",
    "npc_dota_hero_enchantress",
    "npc_dota_neutral_centaur_khan",
    "npc_dota_creep_badguys_ranged",
    "npc_dota_badguys_tower1_mid",
    "npc_dota_badguys_tower2_mid",
    "npc_dota_badguys_siege"
]
[
    "npc_dota_creep_

In [26]:
import collections


def extract_features_csv(match):
    
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]

    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player['hero_inventory'])) ) )
       # row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player[''])) ) )
        #here u can extract other data

    return collections.OrderedDict(row)

    
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [27]:
def create_features_from_jsonl(matches_file):
  
    df_new_features = []

    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']
        features = extract_features_csv(match)

        df_new_features.append(features)

    df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
    return df_new_features

In [28]:
train_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')).fillna(0)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [29]:
test_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')).fillna(0)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [30]:
import pickle as pkl

#Better to save extracted data in files, because extracting takes time...
train_df.to_pickle('df_train.pkl')
test_df.to_pickle('df_test.pkl')

In [31]:
def add_items_dummies(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        players = [f'{team}{i}' for i in range(1, 6)]
        item_columns = [f'{player}_items' for player in players]

        d = pd.get_dummies(full_df[item_columns[0]].apply(pd.Series).stack()).sum(level=0, axis=0)
        dindexes = d.index.values

        for c in item_columns[1:]:
            d = d.add(pd.get_dummies(full_df[c].apply(pd.Series).stack()).sum(level=0, axis=0), fill_value=0)
            d = d.ix[dindexes]

        full_df = pd.concat([full_df, d.add_prefix(f'{team}_item_')], axis=1, sort=False)
        full_df.drop(columns=item_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [32]:
def drop_consumble_items(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        consumble_columns = ['tango', 'tpscroll', 
                             'bottle', 'flask',
                            'enchanted_mango', 'clarity',
                            'faerie_fire', 'ward_observer',
                            'ward_sentry']
        
        starts_with = f'{team}_item_'
        consumble_columns = [starts_with + column for column in consumble_columns]
        full_df.drop(columns=consumble_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [None]:
%%time
new_train = pd.read_pickle('df_train.pkl')
new_test = pd.read_pickle('df_test.pkl')

new_train, new_test = add_items_dummies(new_train, new_test)
new_train, new_test = drop_consumble_items(new_train, new_test)

target = pd.DataFrame(y)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [None]:
new_test.head()

In [None]:
new_test['match_id_hash']=new_test.index

In [None]:
new_test.head()

In [None]:
new_test = new_test.reset_index(drop=True)

In [None]:
new_test = new_test.set_index('match_id_hash')

In [None]:
#new_test.to_csv('test_items.csv')

In [None]:
new_train.head()

In [None]:
new_train['match_id_hash']=new_train.index

In [None]:
new_train = new_train.set_index('match_id_hash')

In [None]:
new_train.head()

In [None]:
#new_train.to_csv('train_items.csv')

In [None]:
def add_new_features(df_features, matches_file):
    
    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']

        # Counting ruined towers for both teams
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match['objectives']:
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL':
                if objective['team'] == 2:
                    radiant_tower_kills += 1
                if objective['team'] == 3:
                    dire_tower_kills += 1

        # Write new features
        df_features.loc[match_id_hash, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[match_id_hash, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[match_id_hash, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills
        
        # Total damage
        total_damage = 0
        for i in range(1, 6):
            for j in match['players'][i-1]['damage_inflictor']:
                # Take damage only to hero(not for creeps)
                if j.startswith('npc_dota_hero'):
                    total_damage += match['players'][i-1]['damage_inflictor'][j]
        df_features.loc[match_id_hash, 'r_damage_inflictor'] = total_damage
        total_damage = 0
        for i in range(6, 11):
            for j in match['players'][i-1]['damage_inflictor']:
                if j.startswith('npc_dota_hero'):
                    total_damage += match['players'][i-1]['damage_inflictor'][j]
        df_features.loc[match_id_hash, 'd_damage_inflictor'] = total_damage

        df_features.loc[match_id_hash, 'diff_damage_inflictor'] = df_features.loc[match_id_hash, 'r_damage_inflictor'] - df_features.loc[match_id_hash, 'd_damage_inflictor'] 

In [None]:
%%time
# copy the dataframe with features
df_train_features_extended = new_train.copy()

# add new features
add_new_features(df_train_features_extended, os.path.join(PATH_TO_DATA, 'train_matches.jsonl'))

In [None]:
df_train_features_extended.head()

In [None]:
df_test_features_extended = new_test.copy()

# add new features
add_new_features(df_test_features_extended, os.path.join(PATH_TO_DATA, 'test_matches.jsonl'))

In [None]:
df_test_features_extended.head()

In [None]:
df_train_features_extended.shape, df_test_features_extended.shape

In [None]:
new_features = pd.concat([df_train_features_extended, df_test_features_extended])

In [None]:
new_features.shape

In [44]:
#new_features.to_csv('new_features.csv')