In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import scipy
import sys
sys.path.append("..")
import os
# Show (effectively) every column when displaying wide frames. The fully
# qualified option name is used because the short form "max_columns" is
# ambiguous on pandas versions that register more than one "*.max_columns"
# option, where it raises an OptionError.
pd.set_option("display.max_columns", 10000)

%pylab inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import json
from IPython import display

from tqdm import tqdm, tqdm_notebook, tqdm_pandas
tqdm.pandas()

import copy

from datetime import datetime

def submit(pred, base_name="", pred_path="/home/furfa/work/ai-academy2019/predictions"):
    """Write a prediction table to a timestamped CSV file.

    Parameters
    ----------
    pred : object exposing ``to_csv`` (e.g. a pandas DataFrame)
        Predictions to store.
    base_name : str
        Prefix of the generated file name.
    pred_path : str
        Directory that receives the file; it is created when missing.

    The file is named ``{base_name}[{timestamp}].csv`` where the timestamp is
    ``str(datetime.now())``, and it is written without the index column.
    The final path is printed; nothing is returned.
    """
    date = str(datetime.now())
    name = f"{base_name}[{date}].csv"
    # Make sure the target directory exists so that saving does not fail on a
    # machine where the predictions folder has not been created yet.
    os.makedirs(pred_path, exist_ok=True)
    path = os.path.join(pred_path, name)
    pred.to_csv(path, index = None) # 40 points
    print("File saved in :",path)

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import json

import featuretools as ft

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

Populating the interactive namespace from numpy and matplotlib


ModuleNotFoundError: No module named 'featuretools'

In [None]:
from sklearn.preprocessing import LabelEncoder
def encode_columns(df, columns):
    """Label-encode the listed columns of ``df`` in place.

    Every column named in ``columns`` is overwritten with the codes produced
    by a fresh ``LabelEncoder`` fitted on that column's values. The fitted
    encoders are discarded and nothing is returned.
    """
    for name in columns:
        raw_values = df[name].values
        df[name] = LabelEncoder().fit_transform(raw_values)

# Every role name seen by onehot_lists so far. It is module-level on purpose:
# it accumulates across calls, so a later call also gets columns for roles
# that only appeared in an earlier call.
unique_roles = set()
def onehot_lists(series_lists):
    """One-hot encode a Series of stringified lists such as ``"['a', 'b']"``.

    Parameters
    ----------
    series_lists : pandas.Series of str
        Each value is the text form of a list of single-quoted strings.

    Returns
    -------
    pandas.DataFrame
        One boolean column per role in ``unique_roles`` (sorted by name for a
        reproducible layout) and one row per input element, in input order.
        The result carries a fresh 0..n-1 index, not the index of the input.
    """

    def str_to_list(s):
        """Parse one stringified list into a set and record its roles."""
        global unique_roles
        inner = s[1:-1]
        # "[]" has no items; without this guard it would be parsed as a
        # single empty-string role and create a spurious "" column.
        if not inner:
            return set()
        parts = inner.split("', '")
        parts[0] = parts[0][1:]     # strip the opening quote of the first item
        parts[-1] = parts[-1][:-1]  # strip the closing quote of the last item
        roles = set(parts)
        unique_roles = unique_roles | roles
        return roles

    parsed = series_lists.apply(str_to_list)

    # Sorting makes the column order independent of set iteration order.
    columns = sorted(unique_roles)
    new_data = {role: [role in roles for roles in parsed] for role in columns}

    return pd.DataFrame(new_data, index=pd.RangeIndex(len(parsed)), columns=columns)

def make_diff_shifts(row, n=1):
    """Return the lag-``n`` differences of ``row`` (``row[i + n] - row[i]``).

    row - np-array
    n   - lag; with ``n == 0`` the input itself is returned unchanged.
    """
    if n != 0:
        later, earlier = row[n:], row[:-n]
        return later - earlier

    return row

def linreg_trend(Y):
    """Slope of the least-squares line fitted to ``Y`` against its positions.

    The points are ``(0, Y[0]), (1, Y[1]), ...``; the function returns only
    the slope ``a`` of the best fit ``y = a*x + b`` (the intercept is not
    returned).

    With fewer than two points the slope is undefined and ``nan`` is
    returned instead of dividing by a zero determinant.
    """
    N = len(Y)
    if N < 2:
        return float("nan")

    Sx = Sy = Sxx = Sxy = 0.0
    for x, y in enumerate(Y):
        Sx = Sx + x
        Sy = Sy + y
        Sxx = Sxx + x*x
        Sxy = Sxy + x*y
    # Determinant of the normal equations; non-zero once there are >= 2 points.
    det = Sxx * N - Sx * Sx

    return (Sxy * N - Sy * Sx)/det
  
  
  
def generate_features(data, var_types, 
                      trans_primitives=["multiply",'divide', "diff"], N_FEATURES=1000, 
                      index_col_name="id"):
    """Generate transform features for ``data`` with featuretools DFS.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; it is copied, so the caller's frame is not modified.
        After ``reset_index`` it must contain a column ``index_col_name``.
    var_types : passed through as ``variable_types`` of the entity.
    trans_primitives : list of transform primitive names handed to ``ft.dfs``.
    N_FEATURES : int
        Number of features allowed on top of the existing column count.
    index_col_name : str
        Column used as the unique entity index; it is cast to int64.

    Returns
    -------
    The DFS feature matrix with the original columns of ``data`` dropped.
    """
    print("-"*15)

    original_columns = data.columns

    frame = data.copy().reset_index()
    frame[index_col_name] = frame[index_col_name].astype(np.int64)

    # The cap counts the columns that are already present in the frame.
    feature_cap = N_FEATURES + frame.shape[1]

    target_entity = 'test_players'

    # Single entity with a unique index.
    entity_set = ft.EntitySet(id='players').entity_from_dataframe(
        entity_id=target_entity,
        dataframe=frame,
        index=index_col_name,
        variable_types=var_types,
    )

    print(entity_set)

    # DFS restricted to the requested transform primitives (no aggregations).
    print("Start dfs")

    dfs_options = dict(
        entityset=entity_set,
        target_entity=target_entity,
        trans_primitives=trans_primitives,
        agg_primitives=[],
        max_depth=1,
        features_only=False,
        verbose=True,
        chunk_size=0.5,
        max_features=feature_cap,
        n_jobs=-1,
    )
    feature_matrix, _feature_defs = ft.dfs(**dfs_options)
    return feature_matrix.drop(original_columns, axis=1)

## Reading

In [0]:
!mkdir data
!mkdir predictions
!bash -c "cd data; wget https://s3.eu-central-1.amazonaws.com/ai-academy-2019/public/final/academy2019_final_train.jsonlines"
!bash -c "cd data; wget https://s3.eu-central-1.amazonaws.com/ai-academy-2019/public/final/academy2019_final_test.jsonlines"

--2019-04-20 09:53:12--  https://s3.eu-central-1.amazonaws.com/ai-academy-2019/public/final/academy2019_final_train.jsonlines
Resolving s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)... 52.219.73.4
Connecting to s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)|52.219.73.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4518189834 (4.2G) [binary/octet-stream]
Saving to: ‘academy2019_final_train.jsonlines’


2019-04-20 09:57:56 (15.2 MB/s) - ‘academy2019_final_train.jsonlines’ saved [4518189834/4518189834]

--2019-04-20 09:57:59--  https://s3.eu-central-1.amazonaws.com/ai-academy-2019/public/final/academy2019_final_test.jsonlines
Resolving s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)... 52.219.73.147
Connecting to s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)|52.219.73.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1769850921 (1.6G) [binary/octet-stream]
Saving to: ‘acade

In [17]:
# Directory layout used by the rest of the notebook.
base_dir = "."
input_dir = "data"                                      # raw downloads
processed_dir = os.path.join(input_dir, "processed")    # derived CSV files
pred_dir = os.path.join(base_dir, "predictions")        # model outputs

mkdir: cannot create directory ‘predictions’: File exists


In [0]:
def get_data(data_json_series, shifts, func_to_aggregate):
    """Aggregate per-record time series into one row of scalar features.

    Parameters
    ----------
    data_json_series : pandas.Series of str
        Each value is a JSON object with the keys ``radiant_gold``,
        ``dire_gold``, ``time`` and ``player_gold``.
    shifts : iterable of int
        Lags passed to ``make_diff_shifts`` (0 keeps the raw series).
    func_to_aggregate : iterable of callables
        Each is applied to every shifted series; its ``__name__`` becomes
        part of the column name.

    Returns
    -------
    pandas.DataFrame indexed by ``id`` (the index labels of the input) with
    one column ``"{source}->{shift}_{func name}"`` per combination.
    """
    # Output name -> key of the JSON payload it is read from.
    json_keys = {
        "radiant": "radiant_gold",
        "dire": "dire_gold",
        "time": "time",
        "player": "player_gold",
    }
    sources = ("radiant", "dire", "time", "player")

    # Pre-create all columns; insertion order fixes the final column order.
    data_series = {}
    for func in func_to_aggregate:
        for source in sources:
            for shift in shifts:
                data_series[f"{source}->{shift}_{func.__name__}"] = list()
    data_series["id"] = list()

    print(f"making df with shape : {len(data_series.keys())}")

    for ind in tqdm(data_json_series.index):
        payload = json.loads(data_json_series[ind])

        data_series["id"].append(ind)

        arrays = {source: np.array(payload[json_keys[source]]) for source in sources}

        for source in sources:
            values = arrays[source]
            for func in func_to_aggregate:
                for shift in shifts:
                    column = f"{source}->{shift}_{func.__name__}"
                    shifted = make_diff_shifts(values, n=shift)
                    data_series[column].append(func(shifted))

    frame = pd.DataFrame(data_series)
    return frame.set_index("id")


if __name__ == "__main__":
    # `import scipy` alone does not guarantee that the `stats` submodule is
    # loaded, so import it explicitly before scipy.stats is referenced below.
    import scipy.stats

    # Aggregations applied to every (series, shift) combination.
    func_to_aggregate = [
        np.max,
        scipy.stats.mode,
        np.mean,
        np.var,
        np.std,
        np.sum,
        scipy.stats.skew,
        linreg_trend,
    ]

    # Lags for make_diff_shifts; 0 means the raw, undifferenced series.
    shifts = [
        0,1,3
    ]

    print("Reading")

    # Only the JSON-encoded "series" column of the flattened dumps is needed.
    train_ser = pd.read_csv( os.path.join(input_dir,"processed", 'train_all_JSON.csv'), index_col=0 )["series"]
    test_ser = pd.read_csv( os.path.join(input_dir, "processed", 'test_all_JSON.csv'), index_col=0 )["series"]

    print("making train")
    new_data_train = get_data(train_ser, shifts, func_to_aggregate)

    print("making test")
    new_data_test = get_data(test_ser, shifts, func_to_aggregate)

    print("saving")

    new_data_train.to_csv( os.path.join(input_dir,"processed", 'train_timeseries.csv') )

    new_data_test.to_csv( os.path.join(input_dir,"processed", 'test_timeseries.csv') )

print("Ezzz")

In [18]:
# List the contents of the data directory.
!ls data

academy2019_final_test.jsonlines  academy2019_final_train.jsonlines


In [0]:
# Paths of the raw JSON-lines dumps read by process_file.
train_json = "data/academy2019_final_train.jsonlines"
test_json = "data/academy2019_final_test.jsonlines"

In [33]:
# True until the first record has been processed; only that first record
# contributes names to json_in_df_names.
json_in_df_name_flag = True
# Names of the fields that were re-serialised to JSON text.
json_in_df_names = set()

def process_line(record):
    """Flatten one parsed record by turning nested values into JSON strings.

    Every value whose type is exactly ``dict`` or ``list`` is replaced, in
    place, by its ``json.dumps`` text. While the module-level flag is still
    set, the affected keys are added to ``json_in_df_names``; the flag is
    cleared after each call. The same (mutated) record is returned.
    """
    global json_in_df_name_flag
    for key in list(record):
        value = record[key]
        if type(value) in (dict, list):
            record[key] = json.dumps(value)

            if json_in_df_name_flag:
                json_in_df_names.add(key)
    json_in_df_name_flag = False
    return record

def process_file(f_name):
    """Read a JSON-lines file into a DataFrame indexed by ``id``.

    Each line is parsed with ``json.loads`` and passed through
    ``process_line`` (process the record), then all records are assembled
    into one frame whose index is the ``id`` field.
    """
    with open(f_name) as file:
        records = [process_line(json.loads(raw_line)) for raw_line in tqdm(file)]
    frame = pd.DataFrame(records)
    return frame.set_index("id")

def parse_all_json():
    """Convert the raw train/test JSON-lines dumps into flat CSV files.

    Both files are read with ``process_file`` and written to
    ``<input_dir>/processed`` as ``train_all_JSON.csv`` and
    ``test_all_JSON.csv``. The output directory is created when missing.
    """
    print("input dir is ", input_dir)

    # Processing
    print("Processing train...")
    train = process_file(train_json)
    print("Processing test...")
    test = process_file(test_json)
    print("This cols are json: ",json_in_df_names)

    print("Saving")

    processed_dir = os.path.join(input_dir, "processed")
    # exist_ok tolerates a pre-existing directory while still surfacing real
    # failures (e.g. permissions) that a bare `except: pass` would hide.
    os.makedirs(processed_dir, exist_ok=True)

    train.to_csv(
        os.path.join(processed_dir, "train_all_JSON.csv")
    )
    test.to_csv(
        os.path.join(processed_dir, "test_all_JSON.csv")
    )
parse_all_json()

15it [00:00, 143.74it/s]

input dir is  data
Processing train...


40403it [04:59, 135.03it/s]
13it [00:00, 124.21it/s]

Processing test...


15836it [02:12, 119.34it/s]


This cols are json:  {'ability_upgrades', 'series', 'kills_log', 'radiant_heroes', 'level_up_times', 'actions', 'pings', 'item_purchase_log', 'damage_targets', 'deaths_log', 'gold_by_reason', 'final_items', 'dire_heroes'}
Saving


In [41]:
# List the derived CSV files produced so far.
!ls data/processed

test_all_JSON.csv	      train_all_JSON.csv
test_levelup.csv	      train_levelup.csv
test_timeseries.csv	      train_timeseries.csv
test_timeseries_no_zeros.csv  train_timeseries_no_zeros.csv


{
      'ability_upgrades', 
      'series', 'kills_log', 
      'radiant_heroes',
      'level_up_times', 
      'actions', 'pings',
      'item_purchase_log', 
      'damage_targets',
      'deaths_log',
      'gold_by_reason',
      'final_items',
      'dire_heroes'
}

In [5]:
# List the derived CSV files again.
!ls data/processed

test_all_JSON.csv	      train_all_JSON.csv
test_damage_targets.csv       train_damage_targets.csv
test_item_purchase_log.csv    train_item_purchase_log.csv
test_levelup.csv	      train_levelup.csv
test_timeseries.csv	      train_timeseries.csv
test_timeseries_no_zeros.csv  train_timeseries_no_zeros.csv


In [26]:
# Load the base train/test tables from the current working directory, using
# the first CSV column as the index.
# NOTE(review): these file names differ from the train_all_JSON.csv /
# test_all_JSON.csv files written by parse_all_json - confirm their origin.
#drp_json = [ 'ability_upgrades', 'series', 'kills_log', 'radiant_heroes', 'level_up_times', 'actions', 'pings', 'item_purchase_log', 'damage_targets', 'deaths_log', 'gold_by_reason', 'final_items', 'dire_heroes']
train = pd.read_csv("academy2019_final_train.csv", index_col=0)#.drop(drp_json, axis=1)
test = pd.read_csv("academy2019_final_test.csv", index_col=0)#.drop(drp_json, axis=1)

In [27]:
# Load the precomputed train feature tables (first CSV column is the index).
# NOTE(review): paths are relative to the working directory, not to
# data/processed where the `!ls` output lists these files - confirm.
train_damage_targets = pd.read_csv("train_damage_targets.csv", index_col=0)
train_level = pd.read_csv("train_levelup.csv", index_col=0)
# The "nozer" prefix keeps these columns distinct from train_timeseries.
train_timeseries_no_zeros = pd.read_csv("train_timeseries_no_zeros.csv", index_col=0).add_prefix("nozer")
train_timeseries = pd.read_csv("train_timeseries.csv", index_col=0)
train_item_purchase_log = pd.read_csv("train_item_purchase_log.csv", index_col=0)


In [28]:
# Join all train tables column-wise; rows are aligned on the index.
train_final = pd.concat([
    train,
    train_damage_targets,
    train_level,
    train_timeseries,
    train_timeseries_no_zeros,
    train_item_purchase_log,
], axis=1)

In [9]:
# Install the pinned CatBoost release via a shell escape.
!pip install catboost==0.13.1



In [29]:
# Load the precomputed test feature tables (first CSV column is the index),
# mirroring the train feature tables.
test_damage_targets = pd.read_csv("test_damage_targets.csv", index_col=0)
test_level = pd.read_csv("test_levelup.csv", index_col=0)
# The "nozer" prefix keeps these columns distinct from test_timeseries.
test_timeseries_no_zeros = pd.read_csv("test_timeseries_no_zeros.csv", index_col=0).add_prefix("nozer")
test_timeseries = pd.read_csv("test_timeseries.csv", index_col=0)
test_item_purchase_log = pd.read_csv("test_item_purchase_log.csv", index_col=0)

In [30]:
# Join all test tables column-wise, in the same order as for train.
test_final = pd.concat([
    test,
    test_damage_targets,
    test_level,
    test_timeseries,
    test_timeseries_no_zeros,
    test_item_purchase_log,
], axis=1)

In [31]:
# Compare the shapes of the assembled train and test frames.
train_final.shape, test_final.shape

((40403, 944), (15836, 943))

In [32]:
# Split off the target: remove the "skilled" column from the feature frame
# and keep it as the label vector.
y_train = train_final.pop('skilled')

In [33]:
# Shape of the train features after removing the target column.
train_final.shape

(40403, 943)

In [34]:
# Shape of the test features, for comparison with train.
test_final.shape

(15836, 943)

In [35]:
# Column names present in both frames. Unlike a set intersection, this keeps
# the order of train_final, so the feature order (and therefore the fitted
# model) is reproducible between runs. dict.fromkeys drops repeated names
# while preserving first-seen order.
_test_columns = set(test_final.columns)
cols = [name for name in dict.fromkeys(train_final.columns) if name in _test_columns]

In [36]:
# Restrict both frames to the shared columns, in the same order.
train_final = train_final[cols]
test_final = test_final[cols]

In [37]:
# Discard every column whose name contains "mode" from both frames.
to_delete = [name for name in train_final.columns if "mode" in name]
train_final = train_final.drop(columns=to_delete)
test_final = test_final.drop(columns=to_delete)

In [38]:
# Encode the team columns as 0/1 flags (1 means "radiant"). The comparison
# result itself is stored; indexing the frame with the mask would assign a
# filtered DataFrame to the column instead of a flag.
train_final["winner_team"] = (train_final["winner_team"] == "radiant").astype(int)
train_final["player_team"] = (train_final["player_team"] == "radiant").astype(int)

In [39]:
# Encode the team columns as 0/1 flags (1 means "radiant"). The comparison
# result itself is stored; indexing the frame with the mask would assign a
# filtered DataFrame to the column instead of a flag.
test_final["winner_team"] = (test_final["winner_team"] == "radiant").astype(int)
test_final["player_team"] = (test_final["player_team"] == "radiant").astype(int)

In [40]:
# Sanity check: train and test must have identical column labels in order.
all( train_final.columns == test_final.columns)

True

In [45]:
# Remove the two team columns from both feature frames.
to_drp = ["player_team", "winner_team"]
train_final = train_final.drop(columns=to_drp)
test_final = test_final.drop(columns=to_drp)

In [46]:
import catboost as cb
import lightgbm as lgb
#model = cb.CatBoostClassifier(logging_level="Info").fit(train_final, y_train)

In [65]:
# Fit a LightGBM binary classifier on the assembled train features with a
# fixed random_state.
# NOTE(review): 'num_iterations' (2500) and 'n_estimators' (170) look like
# aliases of the same LightGBM setting with conflicting values - confirm which
# one takes effect and drop the other.
model = lgb.LGBMClassifier(
    **{'subsample_freq': 1,
 'subsample_for_bin': 200,
 'subsample': 0.7,
 'scale_pos_weight': 1,
 'reg_lambda': 1.5,
 'reg_alpha': 1.4,
 'random_state': 501,
 'objective': 'binary',
 'num_leaves': 16,
 'num_iterations': 2500,
 'num_class': 1,
 'nthread': 5,
 'n_estimators': 170,
 'min_split_gain': 0.5,
 'min_child_weight': 1,
 'min_child_samples': 5,
 'metric': 'binary_error',
 'max_depth': 2,
 'max_bin': 512,
 'learning_rate': 0.2,
 'colsample_bytree': 0.64,
 'boosting_type': 'gbdt'}).fit(train_final, y_train)

In [None]:
# Alternative configuration: larger trees and stronger L2 regularisation.
# Running this cell replaces the previously fitted `model`.
model = lgb.LGBMClassifier(
    num_leaves=50,
    learning_rate=0.25,
    n_estimators=1000,
    reg_lambda=5,
)
model = model.fit(train_final, y_train)

In [None]:
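# Hyperparameter notes (kept as comments so that this cell is valid Python):
# lr 03   (presumably learning_rate = 0.3 - confirm)
# est 1000
# leaves 5
# lambda 5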

In [66]:
# Probability of the positive class (second column of predict_proba) for
# every test row.
skilled_prob = model.predict_proba(test_final)[:, 1]

# Pair each probability with its test id and store the table as CSV.
y_pred = pd.DataFrame({'id' : test_final.index, 'skilled_prob': skilled_prob})
y_pred.to_csv("lgb_new2.csv")

['assists',
 'avg_assists_x16',
 'avg_deaths_x16',
 'avg_gpm_x16',
 'avg_kills_x16',
 'avg_xpm_x16',
 'best_assists_x16',
 'best_gpm_x16',
 'best_kills_x16',
 'best_win_streak',
 'best_xpm_x16',
 'camps_stacked',
 'creeps_stacked',
 'deaths',
 'denies',
 'dire_barracks_status',
 'dire_tower_status',
 'duration',
 'farm_score',
 'fight_score',
 'first_blood_claimed',
 'first_blood_time',
 'gold',
 'gold_per_min',
 'gold_spent',
 'hero_damage',
 'hero_healing',
 'hero_id',
 'hero_pick_order',
 'kills',
 'last_hits',
 'leaver_status',
 'level',
 'nearby_creep_death_count',
 'net_worth',
 'observer_wards_placed',
 'party_players',
 'player_team',
 'pre_game_duration',
 'push_score',
 'radiant_barracks_status',
 'radiant_tower_status',
 'roshan_kills',
 'rune_pickups',
 'scaled_hero_damage',
 'scaled_hero_healing',
 'scaled_tower_damage',
 'sentry_wards_placed',
 'stuns',
 'support_score',
 'team_fight_participation',
 'tower_damage',
 'tower_kills',
 'wards',
 'win_streak',
 'winner_team',

In [97]:
# Shape of the final train feature frame.
train_final.shape

(40403, 936)

In [99]:
# Shape of the final test feature frame.
test_final.shape

(15836, 936)

In [103]:
# Compare train columns with the test columns selected by the train labels.
# NOTE(review): this raised ValueError when run; the selection in the next
# cell has more columns than train_final, which suggests duplicated column
# names - confirm.
all( train_final.columns == test_final[train_final.columns].columns )

ValueError: ignored

In [104]:
# Shape of the test frame when selected by the train column labels.
test_final[train_final.columns].shape

(15836, 1032)