In [4]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
# import gresearch_crypto
import time
import datetime

In [3]:
# !pip install lightgbm

In [5]:
import pickle
import gc

from tqdm import tqdm

In [6]:
n_fold = 7
seed0 = 8586
use_supple_for_train = True

# If True, the period used to evaluate Public LB will not be used for training.
# Set to False on final submission.
not_use_overlap_to_train = False

In [8]:
TRAIN_CSV = 'data/train.csv'
SUPPLE_TRAIN_CSV = 'data/supplemental_train.csv'
ASSET_DETAILS_CSV = 'data/asset_details.csv'

pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 350)

In [9]:
lags = [60,300,900]

In [10]:
params = {
    'early_stopping_rounds': 50,
    'objective': 'regression',
    'metric': 'rmse',
#     'metric': 'None',
    'boosting_type': 'gbdt',
    'max_depth': 5,
    'verbose': -1,
    'max_bin':600,
    'min_data_in_leaf':50,
    'learning_rate': 0.03,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 1,
    'lambda_l1': 0.5,
    'lambda_l2': 2,
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_fraction_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'extra_trees': True,
    'extra_seed': seed0,
    'zero_as_missing': True,
    "first_metric_only": True
         }

In [11]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [12]:
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")

In [13]:
df_asset_details.head()

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin


In [17]:
%%time
df_train = pd.read_csv(TRAIN_CSV, usecols=['timestamp','Asset_ID', 'Close', 'Target'])

if use_supple_for_train:    
    df_supple = pd.read_csv(SUPPLE_TRAIN_CSV, usecols=['timestamp','Asset_ID', 'Close', 'Target'])
#     display(df_supple)
    df_train = pd.concat([df_train, df_supple])
    del df_supple
df_train = reduce_mem_usage(df_train)

Memory usage of dataframe is 1020.63 MB
Memory usage after optimization is: 484.80 MB
Decreased by 52.5%
Wall time: 17.5 s


In [18]:
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Close,Target
0,1514764860,2,2374.590088,-0.004219
1,1514764860,0,8.53,-0.014397
2,1514764860,1,13850.175781,-0.014641
3,1514764860,5,7.6576,-0.013924
4,1514764860,7,25.877001,-0.008263


In [20]:
%%time
train_merged = pd.DataFrame()
train_merged[df_train.columns] = 0
for id in tqdm( range(14) ):
    train_merged = train_merged.merge(df_train.loc[df_train["Asset_ID"] == id, ['timestamp', 'Close','Target']].copy(), on="timestamp", how='outer',suffixes=['', "_"+str(id)])
        
train_merged = train_merged.drop(df_train.columns.drop("timestamp"), axis=1)
train_merged = train_merged.sort_values('timestamp', ascending=True)
display(train_merged.head())

100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:13<00:00,  1.05it/s]


Unnamed: 0,timestamp,Close_0,Target_0,Close_1,Target_1,Close_2,Target_2,Close_3,Target_3,Close_4,Target_4,Close_5,Target_5,Close_6,Target_6,Close_7,Target_7,Close_8,Target_8,Close_9,Target_9,Close_10,Target_10,Close_11,Target_11,Close_12,Target_12,Close_13,Target_13
0,1514764860,8.53,-0.014397,13850.175781,-0.014641,2374.590088,-0.004219,,,,,7.6576,-0.013924,738.507507,-0.00481,25.877001,-0.008263,,,225.206665,-0.009789,,,329.459991,,,,,
1,1514764920,8.5145,-0.015869,13828.101562,-0.015038,2372.286621,-0.004078,,,,,7.6567,-0.014534,738.26001,-0.00444,25.896999,-0.029907,,,224.856674,-0.012993,,,329.450012,-0.009689,,,,
2,1514764980,8.4848,-0.015411,13801.314453,-0.010307,2372.063232,-0.002892,,,,,7.6512,-0.012543,737.502502,-0.004204,26.4695,-0.030838,,,226.0,-0.003571,,,329.190002,0.006565,,,,
3,1514765040,8.5009,-0.012527,13768.040039,-0.009003,2370.56665,-0.003719,,,,,7.6358,-0.011169,737.102478,-0.002205,26.449499,-0.0289,,,224.274994,-0.002556,,,328.160004,-0.007298,,,,
4,1514765100,8.456,-0.005939,13724.914062,-0.00808,2370.17334,-0.002171,,,,,7.61,-0.006153,735.705017,-0.001744,26.437,-0.030914,,,223.820007,0.000531,,,327.429993,-0.01062,,,,


Wall time: 13.9 s


In [21]:
# forward fill
# Set an upper limit on the number of fills, since there may be long term gaps.
for id in range(14):
#     print(id, train_merged[f'Close_{id}'].isnull().sum())   # Number of missing before forward fill
    train_merged[f'Close_{id}'] = train_merged[f'Close_{id}'].fillna(method='ffill', limit=100)

In [22]:
def get_features(df, train=True):   
    if train == True:
        totimestamp = lambda s: np.int32(time.mktime(datetime.datetime.strptime(s, "%d/%m/%Y").timetuple()))
        valid_window = [totimestamp("12/03/2021")]
#         valid_window = [totimestamp("15/08/2021")]  #検証用
        df['train_flg'] = np.where(df['timestamp']>=valid_window[0], 0,1)

        supple_start_window = [totimestamp("22/09/2021")]
        if use_supple_for_train:
            df['train_flg'] = np.where(df['timestamp']>=supple_start_window[0], 1 ,df['train_flg']  )

   
    for id in range(14):    
        for lag in lags:
            df[f'log_close/mean_{lag}_id{id}'] = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.append(np.convolve( np.array(df[f'Close_{id}']), np.ones(lag)/lag, mode="valid"), np.ones(lag-1)), lag-1)  )
            df[f'log_return_{lag}_id{id}']     = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.array(df[f'Close_{id}']), lag)  )
    for lag in lags:
        df[f'mean_close/mean_{lag}'] =  np.mean(df.iloc[:,df.columns.str.startswith(f'log_close/mean_{lag}_id')], axis=1)
        df[f'mean_log_returns_{lag}'] = np.mean(df.iloc[:,df.columns.str.startswith(f'log_return_{lag}_id')] ,    axis=1)
        for id in range(14):
            df[f'log_close/mean_{lag}-mean_close/mean_{lag}_id{id}'] = np.array( df[f'log_close/mean_{lag}_id{id}']) - np.array( df[f'mean_close/mean_{lag}']  )
            df[f'log_return_{lag}-mean_log_returns_{lag}_id{id}']    = np.array( df[f'log_return_{lag}_id{id}'])     - np.array( df[f'mean_log_returns_{lag}'] )

    if train == True:
        for id in range(14):
            df = df.drop([f'Close_{id}'], axis=1)
        oldest_use_window = [totimestamp("12/01/2019")]
        df = df[  df['timestamp'] >= oldest_use_window[0]   ]

    return df

In [23]:
%%time
feat = get_features(train_merged)
feat

  df[f'log_close/mean_{lag}_id{id}'] = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.append(np.convolve( np.array(df[f'Close_{id}']), np.ones(lag)/lag, mode="valid"), np.ones(lag-1)), lag-1)  )
  df[f'log_return_{lag}_id{id}']     = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.array(df[f'Close_{id}']), lag)  )
  df[f'log_close/mean_{lag}_id{id}'] = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.append(np.convolve( np.array(df[f'Close_{id}']), np.ones(lag)/lag, mode="valid"), np.ones(lag-1)), lag-1)  )
  df[f'log_return_{lag}_id{id}']     = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.array(df[f'Close_{id}']), lag)  )
  df[f'log_close/mean_{lag}_id{id}'] = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.append(np.convolve( np.array(df[f'Close_{id}']), np.ones(lag)/lag, mode="valid"), np.ones(lag-1)), lag-1)  )
  df[f'log_return_{lag}_id{id}']     = np.log( np.array(df[f'Close_{id}']) /  np.roll(np.array(df[f'Close_{id}']), lag)  )
  df[f'log_close/mean_{lag}_id{id}'] 

Wall time: 20 s


Unnamed: 0,timestamp,Target_0,Target_1,Target_2,Target_3,Target_4,Target_5,Target_6,Target_7,Target_8,Target_9,Target_10,Target_11,Target_12,Target_13,train_flg,log_close/mean_60_id0,log_return_60_id0,log_close/mean_300_id0,log_return_300_id0,log_close/mean_900_id0,log_return_900_id0,log_close/mean_60_id1,log_return_60_id1,log_close/mean_300_id1,log_return_300_id1,log_close/mean_900_id1,log_return_900_id1,log_close/mean_60_id2,log_return_60_id2,log_close/mean_300_id2,log_return_300_id2,log_close/mean_900_id2,log_return_900_id2,log_close/mean_60_id3,log_return_60_id3,log_close/mean_300_id3,log_return_300_id3,log_close/mean_900_id3,log_return_900_id3,log_close/mean_60_id4,log_return_60_id4,log_close/mean_300_id4,log_return_300_id4,log_close/mean_900_id4,log_return_900_id4,log_close/mean_60_id5,log_return_60_id5,log_close/mean_300_id5,log_return_300_id5,log_close/mean_900_id5,log_return_900_id5,log_close/mean_60_id6,log_return_60_id6,log_close/mean_300_id6,log_return_300_id6,log_close/mean_900_id6,log_return_900_id6,log_close/mean_60_id7,log_return_60_id7,log_close/mean_300_id7,log_return_300_id7,log_close/mean_900_id7,log_return_900_id7,log_close/mean_60_id8,log_return_60_id8,log_close/mean_300_id8,log_return_300_id8,log_close/mean_900_id8,log_return_900_id8,log_close/mean_60_id9,log_return_60_id9,log_close/mean_300_id9,log_return_300_id9,log_close/mean_900_id9,log_return_900_id9,log_close/mean_60_id10,log_return_60_id10,log_close/mean_300_id10,log_return_300_id10,log_close/mean_900_id10,log_return_900_id10,log_close/mean_60_id11,log_return_60_id11,log_close/mean_300_id11,log_return_300_id11,log_close/mean_900_id11,log_return_900_id11,log_close/mean_60_id12,log_return_60_id12,log_close/mean_300_id12,log_return_300_id12,log_close/mean_900_id12,log_return_900_id12,log_close/mean_60_id13,log_return_60_id13,log_close/mean_300_id13,log_return_300_id13,log_close/mean_900_id13,log_return_900_id13,mean_close/mean_60,mean_log_returns_60,log_close/mean_60-mean_close/mean_60_id0,log_return_60-mean_log_returns_60_id0,log_close/mean_60-mean_close/mean_60_id1,log_return_60-mean_log_returns_60_id1,log_close/mean_60-mean_close/mean_60_id2,log_return_60-mean_log_returns_60_id2,log_close/mean_60-mean_close/mean_60_id3,log_return_60-mean_log_returns_60_id3,log_close/mean_60-mean_close/mean_60_id4,log_return_60-mean_log_returns_60_id4,log_close/mean_60-mean_close/mean_60_id5,log_return_60-mean_log_returns_60_id5,log_close/mean_60-mean_close/mean_60_id6,log_return_60-mean_log_returns_60_id6,log_close/mean_60-mean_close/mean_60_id7,log_return_60-mean_log_returns_60_id7,log_close/mean_60-mean_close/mean_60_id8,log_return_60-mean_log_returns_60_id8,log_close/mean_60-mean_close/mean_60_id9,log_return_60-mean_log_returns_60_id9,log_close/mean_60-mean_close/mean_60_id10,log_return_60-mean_log_returns_60_id10,log_close/mean_60-mean_close/mean_60_id11,log_return_60-mean_log_returns_60_id11,log_close/mean_60-mean_close/mean_60_id12,log_return_60-mean_log_returns_60_id12,log_close/mean_60-mean_close/mean_60_id13,log_return_60-mean_log_returns_60_id13,mean_close/mean_300,mean_log_returns_300,log_close/mean_300-mean_close/mean_300_id0,log_return_300-mean_log_returns_300_id0,log_close/mean_300-mean_close/mean_300_id1,log_return_300-mean_log_returns_300_id1,log_close/mean_300-mean_close/mean_300_id2,log_return_300-mean_log_returns_300_id2,log_close/mean_300-mean_close/mean_300_id3,log_return_300-mean_log_returns_300_id3,log_close/mean_300-mean_close/mean_300_id4,log_return_300-mean_log_returns_300_id4,log_close/mean_300-mean_close/mean_300_id5,log_return_300-mean_log_returns_300_id5,log_close/mean_300-mean_close/mean_300_id6,log_return_300-mean_log_returns_300_id6,log_close/mean_300-mean_close/mean_300_id7,log_return_300-mean_log_returns_300_id7,log_close/mean_300-mean_close/mean_300_id8,log_return_300-mean_log_returns_300_id8,log_close/mean_300-mean_close/mean_300_id9,log_return_300-mean_log_returns_300_id9,log_close/mean_300-mean_close/mean_300_id10,log_return_300-mean_log_returns_300_id10,log_close/mean_300-mean_close/mean_300_id11,log_return_300-mean_log_returns_300_id11,log_close/mean_300-mean_close/mean_300_id12,log_return_300-mean_log_returns_300_id12,log_close/mean_300-mean_close/mean_300_id13,log_return_300-mean_log_returns_300_id13,mean_close/mean_900,mean_log_returns_900,log_close/mean_900-mean_close/mean_900_id0,log_return_900-mean_log_returns_900_id0,log_close/mean_900-mean_close/mean_900_id1,log_return_900-mean_log_returns_900_id1,log_close/mean_900-mean_close/mean_900_id2,log_return_900-mean_log_returns_900_id2,log_close/mean_900-mean_close/mean_900_id3,log_return_900-mean_log_returns_900_id3,log_close/mean_900-mean_close/mean_900_id4,log_return_900-mean_log_returns_900_id4,log_close/mean_900-mean_close/mean_900_id5,log_return_900-mean_log_returns_900_id5,log_close/mean_900-mean_close/mean_900_id6,log_return_900-mean_log_returns_900_id6,log_close/mean_900-mean_close/mean_900_id7,log_return_900-mean_log_returns_900_id7,log_close/mean_900-mean_close/mean_900_id8,log_return_900-mean_log_returns_900_id8,log_close/mean_900-mean_close/mean_900_id9,log_return_900-mean_log_returns_900_id9,log_close/mean_900-mean_close/mean_900_id10,log_return_900-mean_log_returns_900_id10,log_close/mean_900-mean_close/mean_900_id11,log_return_900-mean_log_returns_900_id11,log_close/mean_900-mean_close/mean_900_id12,log_return_900-mean_log_returns_900_id12,log_close/mean_900-mean_close/mean_900_id13,log_return_900-mean_log_returns_900_id13
531999,1547262000,-0.002617,-0.001702,0.007416,-0.003302,,-0.002523,0.004700,0.021652,,0.009644,,0.006718,-0.003235,-0.013145,1,-0.001841,-0.005370,-0.006983,-0.018275,-0.017957,-0.021241,0.001507,0.001015,0.000402,-0.003487,-0.002006,-0.000614,0.014521,0.022528,0.028957,0.036830,0.037031,0.048697,0.004034,-0.001386,-0.004948,-0.013961,-0.012238,-0.010191,,,,,,,0.005381,-0.001650,0.006579,0.005763,0.006556,0.005079,0.004330,0.002391,0.003209,0.003804,0.001342,0.012311,0.002080,-0.015452,-0.002702,-0.010649,-0.010945,-0.011359,0.002851,-0.003611,-0.003346,-0.019861,-0.009575,0.017917,0.001432,-0.003852,-0.002116,-0.023004,-0.016855,-0.022590,,,,,,,0.009414,0.005151,-0.005431,-0.035785,-0.013996,-0.009982,0.011762,0.011195,0.007578,0.003818,0.004442,0.005925,-0.001341,-0.021492,-0.039492,-0.069310,-0.070287,-0.070419,0.004511,-0.000878,-0.006352,-0.004492,-0.003003,0.001892,0.010011,0.023406,-0.000477,-0.000509,,,0.000870,-0.000773,-0.000181,0.003269,-0.002431,-0.014574,-0.001659,-0.002733,-0.003079,-0.002974,,,0.004903,0.006029,0.007252,0.012073,-0.005852,-0.020614,-0.001524,-0.012010,-0.005458,-0.006266,0.001927,0.008522,0.030481,0.048840,-0.003423,-0.001952,,,0.008103,0.017773,0.004734,0.015814,-0.001178,0.001361,-0.001822,-0.007851,-0.000592,-0.010994,,,-0.003906,-0.023775,0.009102,0.015828,-0.037968,-0.057300,-0.008707,-0.004706,-0.009250,-0.016536,0.006701,0.004092,0.045739,0.053403,-0.003531,-0.005486,,,0.015264,0.009785,0.010049,0.017016,-0.002238,-0.006653,-0.000868,0.022622,-0.008148,-0.017885,,,-0.005289,-0.005276,0.013149,0.010631,-0.061579,-0.065713
532000,1547262060,-0.004768,-0.001224,0.008324,0.000245,,-0.002390,0.005974,0.017776,-0.002632,0.010277,,-0.006733,-0.001293,-0.011971,1,-0.002271,-0.007055,-0.007465,-0.019510,-0.018482,-0.019678,0.001306,0.000638,0.000223,-0.003390,-0.002197,-0.000085,0.012905,0.021460,0.027573,0.037476,0.035719,0.048584,0.005520,-0.000384,-0.003426,-0.012551,-0.010755,-0.003575,,,,,,,0.005410,0.009923,0.006716,0.017034,0.006742,0.008117,0.004061,0.002743,0.002973,0.004037,0.001103,0.014562,-0.012158,-0.024095,-0.017260,-0.024394,-0.025567,-0.015560,-0.003354,-0.013641,-0.009691,-0.026294,-0.016024,0.014669,-0.005675,-0.011227,-0.009332,-0.023391,-0.024119,-0.027501,,,,,,,-0.001498,-0.008770,-0.016392,-0.029193,-0.025008,-0.041863,0.001367,0.000401,-0.002788,-0.006762,-0.005939,-0.006935,0.009349,-0.000456,-0.028615,-0.058305,-0.059540,-0.059212,0.001247,-0.002539,-0.003518,-0.004517,0.000060,0.003177,0.011658,0.023999,0.004273,0.002155,,,0.004163,0.012462,0.002814,0.005281,-0.013405,-0.021557,-0.004600,-0.011103,-0.006922,-0.008689,,,-0.002745,-0.006231,0.000120,0.002939,0.008102,0.002082,-0.004790,-0.012103,-0.002674,-0.007407,0.005014,0.008714,0.032363,0.049580,0.001364,-0.000447,,,0.011507,0.029137,0.007763,0.016140,-0.012470,-0.012291,-0.004901,-0.014190,-0.004542,-0.011288,,,-0.011602,-0.017089,0.002002,0.005341,-0.023825,-0.046201,-0.012006,-0.007373,-0.006477,-0.012305,0.009809,0.007288,0.047724,0.055957,0.001251,0.003798,,,0.018747,0.015490,0.013109,0.021935,-0.013561,-0.008187,-0.004018,0.022042,-0.012114,-0.020128,,,-0.013003,-0.034490,0.006067,0.000438,-0.047535,-0.051839
532001,1547262120,-0.006855,-0.001303,0.003559,-0.002420,,-0.002045,0.006428,0.007366,,0.001539,,0.007084,0.003309,0.010384,1,-0.001848,-0.005898,-0.007073,-0.019923,-0.018135,-0.020091,0.001171,0.000489,0.000108,-0.003594,-0.002324,-0.000259,0.011487,0.021001,0.026358,0.044164,0.034598,0.047439,0.005898,0.001747,-0.002977,-0.012465,-0.010348,0.000768,,,,,,,0.005119,0.009967,0.006533,0.017735,0.006595,0.019963,0.003831,0.002312,0.002766,0.004662,0.000897,0.013792,-0.003918,-0.002216,-0.009003,-0.016154,-0.017346,-0.016453,0.002553,-0.007866,-0.003853,-0.018905,-0.010271,0.020444,-0.005276,-0.011359,-0.009021,-0.030117,-0.023877,-0.029255,,,,,,,-0.001394,-0.006264,-0.016295,-0.029193,-0.024962,-0.041863,0.000869,-0.000516,-0.003270,-0.007413,-0.006438,-0.006678,0.008844,0.000371,-0.028882,-0.069464,-0.059975,-0.059530,0.002278,0.000147,-0.004126,-0.006045,-0.001107,0.000341,0.009209,0.020854,0.003620,0.001600,,,0.002841,0.009819,0.001553,0.002165,-0.006196,-0.002363,0.000275,-0.008014,-0.007554,-0.011506,,,-0.003672,-0.006411,-0.001409,-0.000664,0.006566,0.000224,-0.003717,-0.011722,-0.003356,-0.008201,0.003825,0.008128,0.030076,0.055886,0.000740,-0.000743,,,0.010250,0.029458,0.006484,0.016384,-0.005286,-0.004432,-0.000135,-0.007183,-0.005303,-0.018395,,,-0.012577,-0.017470,0.000447,0.004310,-0.025164,-0.057741,-0.010966,-0.005977,-0.007170,-0.014114,0.008642,0.005717,0.045564,0.053416,0.000617,0.006745,,,0.017560,0.025940,0.011862,0.019769,-0.006381,-0.010476,0.000694,0.026421,-0.012911,-0.023279,,,-0.013996,-0.035886,0.004527,-0.000701,-0.049010,-0.053553
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2122476,1642982280,,,,,,,,,,,,,,,1,0.006921,0.034809,0.036884,0.040421,0.020322,0.002127,0.002704,0.020645,0.024498,0.035992,0.021205,0.015369,0.008262,0.037881,0.038707,0.050897,0.034268,0.021220,0.006592,0.038008,0.040485,0.048811,0.018773,0.001782,0.003409,0.021912,0.026114,0.033921,0.022636,0.013850,0.005018,0.029547,0.029378,0.036039,0.020778,0.004145,0.007949,0.037774,0.041365,0.050032,0.031260,0.014515,0.007442,0.034625,0.037714,0.052706,0.031616,0.019444,0.012305,0.041800,0.034022,0.028317,0.015899,-0.013015,0.007464,0.033926,0.032393,0.048286,0.031172,0.024010,0.000509,0.025169,0.019362,0.020881,-0.004758,-0.031872,0.008437,0.030611,0.033194,0.028630,0.022313,-0.001810,0.008855,0.035231,0.035777,0.044135,0.032465,0.024772,0.000587,0.021091,0.020112,0.025808,0.011451,-0.009232,0.006175,0.031645,0.000746,0.003164,-0.003471,-0.011000,0.002087,0.006236,0.000417,0.006363,-0.002766,-0.009732,-0.001157,-0.002098,0.001774,0.006129,0.001266,0.002980,0.006130,0.010155,0.001288,0.002281,-0.005667,-0.006476,0.002262,-0.001033,0.002680,0.003586,-0.005588,-0.010554,0.032143,0.038920,0.004741,0.001501,-0.007646,-0.002928,0.006564,0.011977,0.008341,0.009891,-0.006029,-0.004998,-0.002765,-0.002880,0.009222,0.011112,0.005571,0.013786,0.001879,-0.010602,0.000250,0.009366,-0.012781,-0.018039,0.001051,-0.010289,0.003633,0.005215,-0.012031,-0.013112,0.022100,0.006093,-0.001778,-0.003966,-0.000895,0.009276,0.012168,0.015127,-0.003327,-0.004311,0.000536,0.007757,-0.001322,-0.001948,0.009160,0.008422,0.009516,0.013351,-0.006201,-0.019108,0.009072,0.017917,-0.026858,-0.037966,0.000213,-0.007903,0.010365,0.018679,-0.010649,-0.015325
2122477,1642982340,,,,,,,,,,,,,,,1,0.005845,0.033718,0.036226,0.040921,0.019805,-0.002803,0.002371,0.020801,0.024386,0.036503,0.021203,0.011812,0.008062,0.038738,0.038968,0.052579,0.034685,0.018034,0.007136,0.040852,0.041527,0.051079,0.019991,-0.001809,0.003034,0.022800,0.025996,0.035355,0.022626,0.009985,0.005457,0.029806,0.030181,0.037748,0.021708,0.000425,0.007190,0.037732,0.041055,0.050806,0.031113,0.008406,0.006172,0.036087,0.036861,0.052562,0.030925,0.013750,0.013532,0.044532,0.035880,0.030786,0.017879,-0.014863,0.007630,0.035460,0.032978,0.049830,0.031901,0.019999,0.000912,0.028980,0.020169,0.021782,-0.003840,-0.033432,0.010280,0.035733,0.035520,0.032019,0.024750,-0.002788,0.009080,0.036051,0.036442,0.045902,0.033261,0.021995,0.000430,0.022204,0.020231,0.026878,0.011674,-0.012014,0.006224,0.033107,-0.000378,0.000612,-0.003853,-0.012306,0.001838,0.005631,0.000912,0.007745,-0.003190,-0.010307,-0.000767,-0.003301,0.000967,0.004625,-0.000052,0.002980,0.007309,0.011425,0.001406,0.002354,-0.005311,-0.004126,0.004057,0.002626,0.002856,0.002944,-0.005793,-0.010902,0.032601,0.040339,0.003624,0.000582,-0.008215,-0.003837,0.006366,0.012240,0.008926,0.010739,-0.006605,-0.004984,-0.002420,-0.002591,0.008454,0.010466,0.004260,0.012223,0.003279,-0.009554,0.000376,0.009490,-0.012433,-0.018557,0.002918,-0.008320,0.003841,0.005563,-0.012370,-0.013462,0.022692,0.002621,-0.002886,-0.005424,-0.001488,0.009191,0.011994,0.015413,-0.002700,-0.004430,-0.000065,0.007364,-0.000984,-0.002196,0.008422,0.005785,0.008233,0.011129,-0.004813,-0.017485,0.009210,0.017378,-0.026531,-0.036053,0.002059,-0.005409,0.010569,0.019374,-0.011018,-0.014635
2122478,1642982400,,,,,,,,,,,,,,,1,0.002530,0.031601,0.033313,0.034475,0.017015,-0.004863,0.000519,0.020895,0.022768,0.033144,0.019683,0.011510,0.005936,0.038650,0.037324,0.045439,0.033176,0.016958,0.003233,0.038102,0.038104,0.043089,0.016718,-0.003497,-0.000219,0.021975,0.023011,0.028224,0.019727,0.008595,0.000440,0.029158,0.025549,0.028184,0.017173,-0.001926,0.004645,0.037687,0.038975,0.045821,0.029177,0.009137,0.001345,0.032388,0.032435,0.038642,0.026617,0.011092,0.008610,0.042817,0.031584,0.023413,0.013680,-0.016683,0.003873,0.033750,0.029634,0.042254,0.028679,0.018517,0.000362,0.027917,0.020015,0.018359,-0.003892,-0.034865,0.007544,0.032790,0.033236,0.026185,0.022560,-0.003634,0.004160,0.033507,0.031945,0.038236,0.028871,0.019072,-0.000672,0.021497,0.019400,0.024702,0.010938,-0.011181,0.003022,0.031624,-0.000492,-0.000023,-0.002503,-0.010729,0.002914,0.007027,0.000211,0.006478,-0.003241,-0.009649,-0.002582,-0.002466,0.001623,0.006063,-0.001677,0.000764,0.005588,0.011193,0.000851,0.002126,-0.002660,-0.003707,0.004522,0.001166,0.001139,0.001884,-0.003694,-0.010126,0.029807,0.033583,0.003507,0.000892,-0.007039,-0.000439,0.007517,0.011855,0.008297,0.009506,-0.006795,-0.005360,-0.004258,-0.005399,0.009168,0.012238,0.002629,0.005058,0.001777,-0.010170,-0.000173,0.008671,-0.009791,-0.015224,0.003429,-0.007399,0.002139,0.004653,-0.010406,-0.008881,0.020009,0.001302,-0.002994,-0.006165,-0.000326,0.010208,0.013167,0.015656,-0.003291,-0.004799,-0.000282,0.007292,-0.002836,-0.003229,0.009168,0.007835,0.006608,0.009790,-0.006328,-0.017985,0.008670,0.017214,-0.023901,-0.036167,0.002551,-0.004937,0.008862,0.017770,-0.009070,-0.012483


In [24]:
# define features for LGBM
not_use_features_train = ['timestamp', 'train_flg']
for id in range(14):
    not_use_features_train.append(f'Target_{id}')

features = feat.columns 
features = features.drop(not_use_features_train)
features = list(features)
# display(features)  
len(features)

174

In [25]:
del train_merged
del df_train
gc.collect()

8

In [26]:
# define the evaluation metric
def correlation(a, train_data):
    
    b = train_data.get_label()
    
    a = np.ravel(a)
    b = np.ravel(b)

    len_data = len(a)
    mean_a = np.sum(a) / len_data
    mean_b = np.sum(b) / len_data
    var_a = np.sum(np.square(a - mean_a)) / len_data
    var_b = np.sum(np.square(b - mean_b)) / len_data

    cov = np.sum((a * b))/len_data - mean_a*mean_b
    corr = cov / np.sqrt(var_a * var_b)

    return 'corr', corr, True

# For CV score calculation
def corr_score(pred, valid):
    len_data = len(pred)
    mean_pred = np.sum(pred) / len_data
    mean_valid = np.sum(valid) / len_data
    var_pred = np.sum(np.square(pred - mean_pred)) / len_data
    var_valid = np.sum(np.square(valid - mean_valid)) / len_data

    cov = np.sum((pred * valid))/len_data - mean_pred*mean_valid
    corr = cov / np.sqrt(var_pred * var_valid)

    return corr

# For CV score calculation
def wcorr_score(pred, valid, weight):
    len_data = len(pred)
    sum_w = np.sum(weight)
    mean_pred = np.sum(pred * weight) / sum_w
    mean_valid = np.sum(valid * weight) / sum_w
    var_pred = np.sum(weight * np.square(pred - mean_pred)) / sum_w
    var_valid = np.sum(weight * np.square(valid - mean_valid)) / sum_w

    cov = np.sum((pred * valid * weight)) / sum_w - mean_pred*mean_valid
    corr = cov / np.sqrt(var_pred * var_valid)

    return corr

In [27]:
# from: https://blog.amedama.jp/entry/lightgbm-cv-feature-importance
# (used in nyanp's Optiver solution)
def plot_importance(importances, features_names = features, PLOT_TOP_N = 20, figsize=(10, 10)):
    importance_df = pd.DataFrame(data=importances, columns=features)
    sorted_indices = importance_df.median(axis=0).sort_values(ascending=False).index
    sorted_importance_df = importance_df.loc[:, sorted_indices]
    plot_cols = sorted_importance_df.columns[:PLOT_TOP_N]
    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_xscale('log')
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=sorted_importance_df[plot_cols],
                orient='h',
                ax=ax)
    plt.show()

In [28]:
# from: https://www.kaggle.com/code/nrcjea001/lgbm-embargocv-weightedpearson-lagtarget/
def get_time_series_cross_val_splits(data, cv = n_fold, embargo = 3750):
    all_train_timestamps = data['timestamp'].unique()
    len_split = len(all_train_timestamps) // cv
    test_splits = [all_train_timestamps[i * len_split:(i + 1) * len_split] for i in range(cv)]
    # fix the last test split to have all the last timestamps, in case the number of timestamps wasn't divisible by cv
    rem = len(all_train_timestamps) - len_split*cv
    if rem>0:
        test_splits[-1] = np.append(test_splits[-1], all_train_timestamps[-rem:])

    train_splits = []
    for test_split in test_splits:
        test_split_max = int(np.max(test_split))
        test_split_min = int(np.min(test_split))
        # get all of the timestamps that aren't in the test split
        train_split_not_embargoed = [e for e in all_train_timestamps if not (test_split_min <= int(e) <= test_split_max)]
        # embargo the train split so we have no leakage. Note timestamps are expressed in seconds, so multiply by 60
        embargo_sec = 60*embargo
        train_split = [e for e in train_split_not_embargoed if
                       abs(int(e) - test_split_max) > embargo_sec and abs(int(e) - test_split_min) > embargo_sec]
        train_splits.append(train_split)

    # convenient way to iterate over train and test splits
    train_test_zip = zip(train_splits, test_splits)
    return train_test_zip

In [29]:
def get_Xy_and_model_for_asset(df_proc, asset_id):
    df_proc = df_proc.loc[  (df_proc[f'Target_{asset_id}'] == df_proc[f'Target_{asset_id}'])  ]
    if not_use_overlap_to_train:
        df_proc = df_proc.loc[  (df_proc['train_flg'] == 1)  ]
    
# EmbargoCV
    train_test_zip = get_time_series_cross_val_splits(df_proc, cv = n_fold, embargo = 3750)
    print("entering time series cross validation loop")
    importances = []
    oof_pred = []
    oof_valid = []
    
    for split, train_test_split in enumerate(train_test_zip):
        gc.collect()
        
        print(f"doing split {split+1} out of {n_fold}")
        train_split, test_split = train_test_split
        train_split_index = df_proc['timestamp'].isin(train_split)
        test_split_index = df_proc['timestamp'].isin(test_split)
    
        train_dataset = lgb.Dataset(df_proc.loc[train_split_index, features],
                                    df_proc.loc[train_split_index, f'Target_{asset_id}'].values, 
                                    feature_name = features, 
                                   )
        val_dataset = lgb.Dataset(df_proc.loc[test_split_index, features], 
                                  df_proc.loc[test_split_index, f'Target_{asset_id}'].values, 
                                  feature_name = features, 
                                 )

        print(f"number of train data: {len(df_proc.loc[train_split_index])}")
        print(f"number of val data:   {len(df_proc.loc[test_split_index])}")

        model = lgb.train(params = params,
                          train_set = train_dataset, 
                          valid_sets=[train_dataset, val_dataset],
                          valid_names=['tr', 'vl'],
                          num_boost_round = 5000,
                          verbose_eval = 100,     
                          feval = correlation,
                         )
        importances.append(model.feature_importance(importance_type='gain'))
        
        file = f'trained_model_id{asset_id}_fold{split}.pkl'
        pickle.dump(model, open(file, 'wb'))
        print(f"Trained model was saved to 'trained_model_id{asset_id}_fold{split}.pkl'")
        print("")
            
        oof_pred += list(  model.predict(df_proc.loc[test_split_index, features])        )
        oof_valid += list(   df_proc.loc[test_split_index, f'Target_{asset_id}'].values    )
    
    
    plot_importance(np.array(importances),features, PLOT_TOP_N = 20, figsize=(10, 5))

    return oof_pred, oof_valid

In [32]:
oof = [ [] for id in range(14)   ]

all_oof_pred = []
all_oof_valid = []
all_oof_weight = []

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    
    oof_pred, oof_valid = get_Xy_and_model_for_asset(feat, asset_id)
    
    weight_temp = float( df_asset_details.loc[  df_asset_details['Asset_ID'] == asset_id  , 'Weight'   ]  )
    
    all_oof_pred += oof_pred
    all_oof_valid += oof_valid
    all_oof_weight += [weight_temp] * len(oof_pred)
    
    oof[asset_id] = corr_score(     np.array(oof_pred)   ,    np.array(oof_valid)    )
    
    print(f'OOF corr score of {asset_name} (ID={asset_id}) is {oof[asset_id]:.5f}. (Weight: {float(weight_temp):.5f})')
    print('')
    print('')

Training model for Binance Coin     (ID=0 )
entering time series cross validation loop
doing split 1 out of 7
number of train data: 1356894
number of val data:   226773


TypeError: train() got an unexpected keyword argument 'verbose_eval'

In [31]:
woof = 0
for id in range(14):
    woof += oof[id] * float(  df_asset_details.loc[  df_asset_details['Asset_ID'] == id  , 'Weight'   ] )
woof = woof / df_asset_details['Weight'].sum()

print(f'OOF corr scores are;')
for oof_score in oof:
    print(f'      {oof_score:.5f}')
print(f'  simple average corr score: {np.mean(oof):.5f}.')
print(f'weighted average corr score: {woof:.5f}.')
print(f'')

all_oof_wcorr = wcorr_score(     np.array(all_oof_pred),    np.array(all_oof_valid),  np.array(all_oof_weight)   )
print(f'        weighted corr score: {all_oof_wcorr:.5f}.')

TypeError: can't multiply sequence by non-int of type 'float'