In [1]:
import ujson as json
from tqdm import tqdm_notebook
import os #to access files
from scipy.sparse import hstack
import time
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

In [4]:
PATH_TO_DATA = '../data/'

In [5]:
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            'train_targets.csv'), 
                                   index_col='match_id_hash')

In [6]:
y = df_train_targets['radiant_win'].values

In [7]:
def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)

In [8]:
json_list = [] #store data that are read
number_of_rows = 50 #how many lines to read 

#reading data from .jsonl file
with open(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')) as fin:
    for i in range(number_of_rows):
        line = fin.readline()
        json_list.append(json.loads(line))
        
#how many matches to read. For example I took 1
for i in range(1, 2):
  for j in range(1, 2):#there is 5 players in each team. But I want to look on only one player.
    print(json.dumps(json_list[i]['players'][j]['damage'], indent=4, sort_keys=True))

{
    "npc_dota_badguys_siege":381,
    "npc_dota_creep_badguys_melee":5062,
    "npc_dota_creep_badguys_ranged":2595,
    "npc_dota_creep_goodguys_melee":110,
    "npc_dota_creep_goodguys_ranged":194,
    "npc_dota_hero_antimage":1414,
    "npc_dota_hero_centaur":2496,
    "npc_dota_hero_clinkz":155,
    "npc_dota_hero_enchantress":1031,
    "npc_dota_neutral_centaur_khan":338,
    "npc_dota_neutral_centaur_outrunner":195,
    "npc_dota_neutral_enraged_wildkin":253,
    "npc_dota_neutral_satyr_hellcaller":253,
    "npc_dota_neutral_wildkin":506
}


In [10]:
for i in range(1, 5): #now we will look at 4 matches
  for j in range(1, 5):#and now will take 5 players
    print(json.dumps(list(map(lambda x: x['id'][5:], json_list[i]['players'][j]['damage'])), indent=4, sort_keys=True))

TypeError: string indices must be integers

In [None]:
import collections


def extract_features_csv(match):
    
    row = [
        ('match_id_hash', match['match_id_hash']),
    ]

    for slot, player in enumerate(match['players']):
        if slot < 5:
            player_name = 'r%d' % (slot + 1)
        else:
            player_name = 'd%d' % (slot - 4)

        row.append( (f'{player_name}_items', list(map(lambda x: x['id'][5:], player['hero_inventory'])) ) )
        #here u can extract other data

    return collections.OrderedDict(row)

    
def extract_targets_csv(match, targets):
    return collections.OrderedDict([('match_id_hash', match['match_id_hash'])] + [
        (field, targets[field])
        for field in ['game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team']
    ])

In [None]:
def create_features_from_jsonl(matches_file):
  
    df_new_features = []

    # Process raw data and add new features
    for match in read_matches(matches_file):
        match_id_hash = match['match_id_hash']
        features = extract_features_csv(match)

        df_new_features.append(features)

    df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
    return df_new_features

In [None]:
train_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'train_matches.jsonl')).fillna(0)

In [None]:
test_df = create_features_from_jsonl(os.path.join(PATH_TO_DATA, 'test_matches.jsonl')).fillna(0)

In [None]:
import pickle as pkl

#Better to save extracted data in files, because extracting takes time...
train_df.to_pickle('df_train.pkl')
test_df.to_pickle('df_test.pkl')

In [None]:
def add_items_dummies(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        players = [f'{team}{i}' for i in range(1, 6)]
        item_columns = [f'{player}_items' for player in players]

        d = pd.get_dummies(full_df[item_columns[0]].apply(pd.Series).stack()).sum(level=0, axis=0)
        dindexes = d.index.values

        for c in item_columns[1:]:
            d = d.add(pd.get_dummies(full_df[c].apply(pd.Series).stack()).sum(level=0, axis=0), fill_value=0)
            d = d.ix[dindexes]

        full_df = pd.concat([full_df, d.add_prefix(f'{team}_item_')], axis=1, sort=False)
        full_df.drop(columns=item_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [None]:
def drop_consumble_items(train_df, test_df):
    
    full_df = pd.concat([train_df, test_df], sort=False)
    train_size = train_df.shape[0]

    for team in 'r', 'd':
        consumble_columns = ['tango', 'tpscroll', 
                             'bottle', 'flask',
                            'enchanted_mango', 'clarity',
                            'faerie_fire', 'ward_observer',
                            'ward_sentry']
        
        starts_with = f'{team}_item_'
        consumble_columns = [starts_with + column for column in consumble_columns]
        full_df.drop(columns=consumble_columns, inplace=True)

    train_df = full_df.iloc[:train_size, :]
    test_df = full_df.iloc[train_size:, :]

    return train_df, test_df

In [None]:
%%time
new_train = pd.read_pickle('df_train.pkl')
new_test = pd.read_pickle('df_test.pkl')

new_train, new_test = add_items_dummies(new_train, new_test)
new_train, new_test = drop_consumble_items(new_train, new_test)

target = pd.DataFrame(y)

In [None]:
new_test

In [None]:
new_test.to_csv('test_items.csv', index=new_test.index)

In [None]:
new_test.head()

In [None]:
new_test['match_id_hash']=new_test.index

In [None]:
new_test.head()

In [None]:
new_test = new_test.reset_index(drop=True)

In [None]:
new_test = new_test.set_index('match_id_hash')

In [None]:
new_test.to_csv('test_items.csv')

In [None]:
new_train.head()

In [None]:
new_train['match_id_hash']=new_train.index

In [None]:
new_train = new_train.set_index('match_id_hash')

In [None]:
new_train.head()

In [None]:
new_train.to_csv('train_items.csv')