In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive') # pass force_remount=True to reconnect

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import scipy
import sys
sys.path.append("..")
import os
# Show (practically) every column when a wide frame is displayed.
# The fully qualified option key is used on purpose: the short pattern
# "max_columns" matches more than one option in recent pandas releases and
# is rejected as ambiguous.
pd.set_option("display.max_columns", 10000)

%pylab inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import json
from IPython import display

from tqdm import tqdm, tqdm_notebook, tqdm_pandas
tqdm.pandas()

import copy

from datetime import datetime

def submit(pred, base_name="", pred_path="/home/furfa/work/ai-academy2019/predictions"):
    """Write predictions to a timestamped CSV file and report where it was saved.

    Parameters
    ----------
    pred : object with a ``to_csv(path, index=...)`` method
        Only ``to_csv`` is used here (presumably a pandas DataFrame).
    base_name : str
        Prefix of the file name; the current timestamp in square brackets
        and the ``.csv`` extension are appended to it.
    pred_path : str
        Directory that receives the file. It is created when missing.
        NOTE(review): the default is a machine-specific absolute path; pass
        an explicit directory (e.g. the notebook's ``pred_dir``) elsewhere.

    Returns
    -------
    str
        Full path of the file that was written.
    """
    # Without this, to_csv fails when the target directory does not exist yet.
    os.makedirs(pred_path, exist_ok=True)

    date = str(datetime.now())
    name = f"{base_name}[{date}].csv"
    path = os.path.join(pred_path, name)

    # The index is not written. (Original author's note on this line: "40 points".)
    pred.to_csv(path, index=False)
    print("File saved in :", path)
    return path

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import json

import featuretools as ft

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

Populating the interactive namespace from numpy and matplotlib


In [0]:
!ls

data  drive  predictions  sample_data


In [0]:
from sklearn.preprocessing import LabelEncoder
def encode_columns(df, columns):
    """Overwrite each listed column of ``df`` with its label-encoded values.

    A separate ``LabelEncoder`` is fitted for every column. ``df`` is
    modified in place and nothing is returned.
    """
    for column_name in columns:
        encoder = LabelEncoder()
        raw_values = df[column_name].values
        df[column_name] = encoder.fit_transform(raw_values)

# Every role seen by any onehot_lists() call so far. It is deliberately kept
# at module level: a later call also emits columns for roles that were only
# seen by earlier calls.
unique_roles = set()


def onehot_lists(series_lists):
    """One-hot encode a series of stringified lists such as ``"['a', 'b']"``.

    Parameters
    ----------
    series_lists : iterable of str (e.g. a pandas Series)
        Each item is the text form of a list of single-quoted strings.

    Returns
    -------
    pandas.DataFrame
        One boolean column per known role (sorted by name, so the column
        order is reproducible) and one row per input item, in input order.
        The frame has a fresh 0..n-1 index, not the index of the input.

    Notes
    -----
    The module-level ``unique_roles`` set is extended with the roles found
    in ``series_lists``. An empty list (``"[]"``) contributes no role.
    """
    global unique_roles

    def str_to_list(s):
        # Strip the surrounding brackets, then split on the "', '" separator.
        inner = s[1:-1]
        if not inner.strip():
            # "[]" used to be parsed as a single empty-string role.
            return set()
        parts = inner.split("', '")
        parts[0] = parts[0][1:]      # drop the opening quote of the first item
        parts[-1] = parts[-1][:-1]   # drop the closing quote of the last item
        return set(parts)

    parsed = [str_to_list(s) for s in series_lists]
    for roles in parsed:
        unique_roles |= roles

    new_data = {
        role: [role in roles for roles in parsed]
        for role in sorted(unique_roles)
    }

    # The explicit index keeps one row per input item even when no role is known.
    return pd.DataFrame(new_data, index=pd.RangeIndex(len(parsed)))

def make_diff_shifts(row, n=1):
    """Difference ``row`` against itself shifted by ``n`` positions.

    row - np-array
    n   - lag; with ``n == 0`` the input is handed back untouched,
          otherwise the result is ``row[n:] - row[:-n]``.
    """
    if n != 0:
        leading, lagging = row[n:], row[:-n]
        return leading - lagging

    return row

def linreg_trend(Y):
    """Return the slope of the least-squares line fitted through ``Y``.

    The points are ``(0, Y[0]), (1, Y[1]), ...`` and the line is
    ``y = a*x + b``. Only the slope ``a`` is returned; the intercept is not
    computed.

    Parameters
    ----------
    Y : sequence of numbers (list, 1-D numpy array, ...)

    Returns
    -------
    float
        The slope, or NaN when ``Y`` has fewer than two points, because the
        slope is undefined and the normal equations are singular.
    """
    N = len(Y)
    Sx = Sy = Sxx = Sxy = 0.0
    for x, y in enumerate(Y):
        Sx += x
        Sy += y
        Sxx += x * x
        Sxy += x * y

    # Determinant of the 2x2 normal-equation system; zero only for N < 2.
    det = Sxx * N - Sx * Sx
    if det == 0:
        return float("nan")

    return (Sxy * N - Sy * Sx) / det
  
  
  
def generate_features(data, var_types,
                      trans_primitives=["multiply",'divide', "diff"], N_FEATURES=1000,
                      index_col_name="id"):
    """Generate transform features for ``data`` with featuretools DFS.

    A single-entity EntitySet is built from a copy of ``data`` and
    ``ft.dfs`` is run with transform primitives only (no aggregation
    primitives, ``max_depth=1``). The columns that ``data`` had on entry are
    dropped from the result, so only the generated columns are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified. Its index is turned into
        a column with ``reset_index`` — presumably the index is named
        ``index_col_name``; verify against the caller.
    var_types : passed through as ``variable_types`` of the entity.
    trans_primitives : list
        Transform primitives handed to ``ft.dfs``.
    N_FEATURES : int
        Number of features requested on top of the existing columns; the
        column count after ``reset_index`` is added before it is passed as
        ``max_features``.
    index_col_name : str
        Column used as the entity index; it is cast to ``np.int64``.
    """
    frame = data.copy()

    print("-" * 15)

    # Remembered so the input columns can be removed from the DFS output.
    original_columns = frame.columns

    frame = frame.reset_index()
    frame[index_col_name] = frame[index_col_name].astype(np.int64)

    feature_cap = N_FEATURES + frame.shape[1]

    target_id = 'train_players'

    # One entity with a unique index.
    entity_set = ft.EntitySet(id='players').entity_from_dataframe(
        entity_id=target_id,
        dataframe=frame,
        index=index_col_name,
        variable_types=var_types,
    )

    print(entity_set)

    # DFS restricted to the requested transform primitives.
    print("Start dfs")

    feature_matrix, _feature_defs = ft.dfs(
        entityset=entity_set,
        target_entity=target_id,
        trans_primitives=trans_primitives,
        agg_primitives=[],
        max_depth=1,
        features_only=False,
        verbose=True,
        chunk_size=0.5,
        max_features=feature_cap,  # caps the amount of computation
        n_jobs=-1,
    )
    return feature_matrix.drop(original_columns, axis=1)

In [0]:
# Directory layout, relative to the notebook's working directory.
base_dir = "."
input_dir = "data"

# Derived locations: processed feature tables and saved predictions.
processed_dir = os.path.join(input_dir, "processed")
pred_dir = os.path.join(base_dir, "predictions")

In [0]:
!ls {processed_dir}

test_all_JSON.csv	      train_all_JSON.csv
test_timeseries.csv	      train_timeseries.csv
test_timeseries_no_zeros.csv  train_timeseries_no_zeros.csv


# Time-series features (zeros kept)

In [0]:
def get_data(data_json_series, shifts, func_to_aggregate):
    """Aggregate the per-row JSON time series into a flat feature table.

    Every item of ``data_json_series`` is a JSON string with the keys
    ``radiant_gold``, ``dire_gold``, ``time`` and ``player_gold``. For each
    of those four series, each shift in ``shifts`` and each function in
    ``func_to_aggregate``, the function is applied to
    ``make_diff_shifts(series, n=shift)``.

    Returns a DataFrame indexed by ``id`` (the labels of
    ``data_json_series``) with one column per combination, named
    ``"<series>-><shift>_<function __name__>"``.
    """
    sources = ("radiant", "dire", "time", "player")
    json_fields = {
        "radiant": "radiant_gold",
        "dire": "dire_gold",
        "time": "time",
        "player": "player_gold",
    }

    # Column order: function first, then series, then shift.
    columns = {}
    for func in func_to_aggregate:
        for source in sources:
            for shift in shifts:
                columns[f"{source}->{shift}_{func.__name__}"] = []
    columns["id"] = []

    print(f"making df with shape : {len(columns.keys())}")

    for row_id in tqdm(data_json_series.index):
        payload = json.loads(data_json_series[row_id])

        columns["id"].append(row_id)

        arrays = {
            source: np.array(payload[field])
            for source, field in json_fields.items()
        }

        for source in sources:
            values = arrays[source]
            for func in func_to_aggregate:
                for shift in shifts:
                    shifted = make_diff_shifts(values, n=shift)
                    columns[f"{source}->{shift}_{func.__name__}"].append(func(shifted))

    return pd.DataFrame(columns).set_index("id")




# Aggregations applied to every (series, shift) pair by get_data; each
# function's __name__ becomes part of the resulting column name.
func_to_aggregate = [
    np.max,
    # NOTE(review): scipy.stats.mode appears to return a (mode, count) result
    # object rather than a scalar - confirm the saved column is usable.
    scipy.stats.mode,
    np.mean,
    np.var,
    np.std,
    np.sum,
    scipy.stats.skew,
    linreg_trend,
]

# Passed as n to make_diff_shifts: 0 keeps the raw series, 1 takes the
# difference between consecutive elements.
shifts = [
    0,1
]

print("Reading")

# Only the "series" column is kept; get_data parses each entry with json.loads.
# train_ser / test_ser stay in the global namespace and are reused by a later cell.
train_ser = pd.read_csv( os.path.join(input_dir,"processed", 'train_all_JSON.csv'), index_col=0 )["series"]
test_ser = pd.read_csv( os.path.join(input_dir, "processed", 'test_all_JSON.csv'), index_col=0 )["series"]

print("making train")
new_data_train = get_data(train_ser, shifts, func_to_aggregate)

print("making test")
new_data_test = get_data(test_ser, shifts, func_to_aggregate)

print("saving")

# Feature tables are written next to the source CSV files, indexed by "id".
new_data_train.to_csv( os.path.join(input_dir,"processed", 'train_timeseries.csv') )

new_data_test.to_csv( os.path.join(input_dir,"processed", 'test_timeseries.csv') )

print("Ezzz")

Reading


  0%|          | 11/40403 [00:00<06:37, 101.52it/s]

making train
making df with shape : 65


100%|██████████| 40403/40403 [06:16<00:00, 107.45it/s]
  0%|          | 11/15836 [00:00<02:28, 106.33it/s]

making test
making df with shape : 65


100%|██████████| 15836/15836 [02:27<00:00, 107.13it/s]


saving
Ezzz


# Time-series features (leading zero-time entries removed)

In [0]:
def get_slice_no_zero(arr):
    """Return the position of the first element of ``arr`` that differs from 0.

    When ``arr`` is empty or contains only zeros, 0 is returned.
    """
    return next(
        (position for position, value in enumerate(arr) if value != 0),
        0,
    )

def get_data_nz(data_json_series, shifts, func_to_aggregate):
    """Build the aggregated feature table after trimming the leading zero-time part.

    Each item of ``data_json_series`` is a JSON string with the keys
    ``radiant_gold``, ``dire_gold``, ``time`` and ``player_gold``. The start
    offset is ``get_slice_no_zero`` of the ``time`` list, and all four series
    are cut to begin at that offset. Every function in ``func_to_aggregate``
    is then applied to ``make_diff_shifts(series, n=shift)`` for each shift.

    Returns a DataFrame indexed by ``id`` with columns named
    ``"<series>-><shift>_<function __name__>"``.
    """
    series_names = ("radiant", "dire", "time", "player")

    def column_name(series_name, shift, func):
        return f"{series_name}->{shift}_{func.__name__}"

    # Column order: function first, then series, then shift.
    data_series = {
        column_name(series_name, shift, func): []
        for func in func_to_aggregate
        for series_name in series_names
        for shift in shifts
    }
    data_series["id"] = []

    print(f"making df with shape : {len(data_series.keys())}")

    for ind in tqdm(data_json_series.index):
        record = json.loads(data_json_series[ind])

        data_series["id"].append(ind)

        # Offset of the first non-zero timestamp; everything before it is cut.
        start = get_slice_no_zero(record["time"])

        trimmed = {
            "radiant": np.array(record["radiant_gold"])[start:],
            "dire": np.array(record["dire_gold"])[start:],
            "time": np.array(record["time"])[start:],
            "player": np.array(record["player_gold"])[start:],
        }

        for series_name in series_names:
            for func in func_to_aggregate:
                for shift in shifts:
                    diffed = make_diff_shifts(trimmed[series_name], n=shift)
                    data_series[column_name(series_name, shift, func)].append(func(diffed))

    return pd.DataFrame(data_series).set_index("id")



# Aggregations for the zero-trimmed series; this rebinds the global
# func_to_aggregate defined by the previous feature cell.
func_to_aggregate = [
    np.max,
    np.mean,
    np.var,
    np.std,
    np.sum,
    linreg_trend,
]

# Passed as n to make_diff_shifts: 0 keeps the raw series, 1 takes the
# difference between consecutive elements.
shifts = [
    0,1
]

# NOTE: train_ser / test_ser are not loaded here; they must still hold the
# "series" column read by the earlier cell. The level-up cell further down
# rebinds both names to another column, so run this cell before that one.
print("making train")
new_data_train = get_data_nz(train_ser, shifts, func_to_aggregate)

print("making test")
new_data_test = get_data_nz(test_ser, shifts, func_to_aggregate)

print("saving")

# Written alongside the other processed tables, indexed by "id".
new_data_train.to_csv( os.path.join(input_dir,"processed", 'train_timeseries_no_zeros.csv') )

new_data_test.to_csv( os.path.join(input_dir,"processed", 'test_timeseries_no_zeros.csv') )

print("Ezzz")

In [0]:
def level_up_features(data_json_series, shifts, func_to_aggregate):
    """Aggregate each row's JSON-encoded level-up series into flat features.

    Every item of ``data_json_series`` is a JSON string that is parsed and
    turned into a numpy array. For each function in ``func_to_aggregate``
    and each shift in ``shifts`` the function is applied to
    ``make_diff_shifts(array, n=shift)``.

    A combination whose computation raises an ordinary exception is
    recorded as NaN instead of aborting the run. KeyboardInterrupt and
    SystemExit are not swallowed, so the loop can still be interrupted.

    Returns a DataFrame indexed by ``id`` (the labels of
    ``data_json_series``) with columns named
    ``"level-><shift>_<function __name__>"``.
    """
    it = "level"

    # Column order: function first, then shift.
    data_series = {
        f"{it}->{s}_{func.__name__}": list()
        for func in func_to_aggregate
        for s in shifts
    }
    data_series["id"] = list()

    print(f"making df with shape : {len(data_series.keys())}")

    for ind in tqdm(data_json_series.index):
        ser = json.loads(
            data_json_series[ind]
        )

        data_series["id"].append(ind)

        levels = np.array(ser)

        for func in func_to_aggregate:
            for s in shifts:
                try:
                    to_append = func(make_diff_shifts(levels, n=s))
                except Exception:
                    # Best effort: a feature that cannot be computed for this
                    # row is stored as missing.
                    to_append = np.nan

                data_series[f"{it}->{s}_{func.__name__}"].append(to_append)

    return pd.DataFrame(data_series).set_index("id")
  
# Aggregations applied to the level-up series; this rebinds the global
# func_to_aggregate used by the earlier feature cells.
func_to_aggregate = [
      np.max,
      np.mean,
      np.std,
      np.sum,
      linreg_trend,
  ]

# Passed as n to make_diff_shifts: 0 keeps the raw series, 1 takes the
# difference between consecutive elements.
shifts = [
    0,1
]

print("Reading")

# NOTE: train_ser / test_ser are rebound here to the "level_up_times" column;
# after this cell they no longer hold the "series" column used by earlier cells.
train_ser = pd.read_csv( os.path.join(input_dir,"processed", 'train_all_JSON.csv'), index_col=0 ).level_up_times
test_ser = pd.read_csv( os.path.join(input_dir, "processed", 'test_all_JSON.csv'), index_col=0 ).level_up_times

print("making train")
new_data_train = level_up_features(
        train_ser,
        shifts,
        func_to_aggregate
)

print("making test")
new_data_test = level_up_features(
        test_ser,
        shifts,
        func_to_aggregate
)
print("saving")

# Written alongside the other processed tables, indexed by "id".
new_data_train.to_csv( os.path.join(input_dir,"processed", 'train_levelup.csv') )

new_data_test.to_csv( os.path.join(input_dir,"processed", 'test_levelup.csv') )

print("Ezzz")

Reading


  1%|          | 394/40403 [00:00<00:10, 3933.85it/s]

making train
making df with shape : 11


100%|██████████| 40403/40403 [00:10<00:00, 3838.82it/s]
  3%|▎         | 410/15836 [00:00<00:03, 4098.82it/s]

making test
making df with shape : 11


100%|██████████| 15836/15836 [00:04<00:00, 3716.61it/s]


saving
Ezzz
