In [1]:
import ujson as json
from tqdm import tqdm_notebook
import os #to access files
from scipy.sparse import hstack
import time
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
PATH_TO_DATA = 'data/'

In [3]:
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

In [4]:
y = df_train_targets['radiant_win'].values

In [5]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [6]:
json_list = [] #store data that are read
number_of_rows = 50 #how many lines to read 

#reading data from .jsonl file
with open(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')) as fin:
    for i in range(number_of_rows):
        line = fin.readline()
        json_list.append(json.loads(line))
        
#how many matches to read. For example I took 1
for i in range(1, 2):
  for j in range(1, 2):#there is 5 players in each team. But I want to look on only one player.
    print(json.dumps(json_list[i]['players'][j], indent=4, sort_keys=True))

{
    "ability_upgrades":[
        {
            "ability":5514,
            "level":1,
            "time":232
        },
        {
            "ability":5515,
            "level":2,
            "time":304
        },
        {
            "ability":5515,
            "level":3,
            "time":322
        },
        {
            "ability":5514,
            "level":4,
            "time":391
        },
        {
            "ability":5515,
            "level":5,
            "time":499
        },
        {
            "ability":5517,
            "level":6,
            "time":581
        }
    ],
    "ability_uses":{
        "centaur_double_edge":11,
        "centaur_hoof_stomp":8,
        "centaur_stampede":2
    },
    "account_id_hash":"1c2500195e6ab646fc9371344724aa77",
    "actions":{
        "1":3679,
        "10":183,
        "11":8,
        "15":19,
        "16":18,
        "19":14,
        "2":2,
        "33":183,
        "4":810,
        "5":4,
        "6":42,
        "7":12,


}


In [7]:
for i in range(1, 5): #now we will look at 4 matches
  for j in range(1, 5):#and now will take 5 players
    print(json.dumps(list(map(lambda x: x['id'][5:], json_list[i]['players'][j]['hero_inventory'])), indent=4, sort_keys=True))

[
    "stout_shield",
    "soul_ring",
    "tranquil_boots",
    "chainmail"
]
[
    "magic_wand",
    "dust",
    "boots",
    "ward_dispenser",
    "tpscroll"
]
[
    "magic_wand",
    "pers",
    "blight_stone",
    "tpscroll",
    "ring_of_aquila"
]
[
    "boots",
    "stout_shield",
    "tpscroll"
]
[
    "tango",
    "flask",
    "stout_shield",
    "clarity",
    "enchanted_mango",
    "tpscroll"
]
[
    "stout_shield",
    "blight_stone",
    "tpscroll"
]
[
    "stout_shield",
    "quelling_blade",
    "tango",
    "tpscroll"
]
[
    "tango",
    "faerie_fire",
    "tpscroll"
]
[
    "soul_ring",
    "stout_shield",
    "magic_wand",
    "boots",
    "faerie_fire",
    "tpscroll"
]
[
    "boots",
    "bottle",
    "null_talisman",
    "helm_of_iron_will",
    "null_talisman"
]
[
    "tranquil_boots",
    "magic_stick",
    "orb_of_venom",
    "ward_observer"
]
[
    "power_treads",
    "quelling_blade",
    "clarity",
    "ring_of_health",
    "tpscroll"
]
[
    "ring_of_basili

In [8]:
import collections


def extract_features_csv(match):
    
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]

    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player['hero_inventory'])) ) )
        #here u can extract other data

    return collections.OrderedDict(row)

    
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [9]:
def create_features_from_jsonl(matches_file):
  
    df_new_features = []

    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']
        features = extract_features_csv(match)

        df_new_features.append(features)

    df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
    return df_new_features

In [10]:
train_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')).fillna(0)

HBox(children=(IntProgress(value=0, max=39675), HTML(value='')))




In [11]:
test_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')).fillna(0)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [12]:
import pickle as pkl

#Better to save extracted data in files, because extracting takes time...
train_df.to_pickle('df_train.pkl')
test_df.to_pickle('df_test.pkl')

In [13]:
def add_items_dummies(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        players = [f'{team}{i}' for i in range(1, 6)]
        item_columns = [f'{player}_items' for player in players]

        d = pd.get_dummies(full_df[item_columns[0]].apply(pd.Series).stack()).sum(level=0, axis=0)
        dindexes = d.index.values

        for c in item_columns[1:]:
            d = d.add(pd.get_dummies(full_df[c].apply(pd.Series).stack()).sum(level=0, axis=0), fill_value=0)
            d = d.ix[dindexes]

        full_df = pd.concat([full_df, d.add_prefix(f'{team}_item_')], axis=1, sort=False)
        full_df.drop(columns=item_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [14]:
def drop_consumble_items(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        consumble_columns = ['tango', 'tpscroll', 
                             'bottle', 'flask',
                            'enchanted_mango', 'clarity',
                            'faerie_fire', 'ward_observer',
                            'ward_sentry']
        
        starts_with = f'{team}_item_'
        consumble_columns = [starts_with + column for column in consumble_columns]
        full_df.drop(columns=consumble_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [15]:
%%time
new_train = pd.read_pickle('df_train.pkl')
new_test = pd.read_pickle('df_test.pkl')

new_train, new_test = add_items_dummies(new_train, new_test)
new_train, new_test = drop_consumble_items(new_train, new_test)

target = pd.DataFrame(y)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


Wall time: 1min 31s


In [16]:
new_test

Unnamed: 0,r_item_abyssal_blade,r_item_aegis,r_item_aeon_disk,r_item_aether_lens,r_item_ancient_janggo,r_item_arcane_boots,r_item_armlet,r_item_assault,r_item_basher,r_item_belt_of_strength,...,d_item_urn_of_shadows,d_item_vanguard,d_item_veil_of_discord,d_item_vitality_booster,d_item_vladmir,d_item_void_stone,d_item_ward_dispenser,d_item_wind_lace,d_item_wraith_band,d_item_yasha
30cc2d778dca82f2edb568ce9b585caa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
70e5ba30f367cea48793b9003fab9d38,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4d9ef74d3a2025d79e9423105fd73d41,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2bb79e0c1eaac1608e5a09c8e0c6a555,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bec17f099b01d67edc82dfb5ce735a43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
038acbb47d9eb54c11962d07cce8d829,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b891f03bb2a86d78b84043437fc95e04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
ff898afdb5bb5c7163bde45a009503f7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
72e6b1c31c718c0806f9aaeb8b7290a9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
abd97ab60ceca3ae6f00a28baea8a43c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
new_test.to_csv('test_items.csv', index=new_test.index)

ValueError: The truth value of a Index is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
new_test.head()

In [None]:
new_test['match_id_hash']=new_test.index

In [None]:
new_test.head()

In [None]:
new_test = new_test.reset_index(drop=True)

In [None]:
new_test = new_test.set_index('match_id_hash')

In [None]:
new_test.to_csv('test_items.csv')

In [None]:
new_train.head()

In [None]:
new_train['match_id_hash']=new_train.index

In [None]:
new_train = new_train.set_index('match_id_hash')

In [None]:
new_train.head()

In [None]:
new_train.to_csv('train_items.csv')