In [1]:
import warnings
# Ignore every warning category for the rest of the session. This keeps the
# output short but also hides pandas / scikit-learn warnings raised by later cells.
warnings.simplefilter("ignore")

In [2]:
import os, gc
import random
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from lightgbm import LGBMRegressor
from sklearn.preprocessing import KBinsDiscretizer

import gresearch_crypto

import pickle

def pickle_dump(obj, path):
    """Serialize ``obj`` with pickle into the file at ``path`` (overwriting it)."""
    with open(path, "wb") as sink:
        pickle.dump(obj, sink)

def pickle_load(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, "rb") as source:
        return pickle.load(source)

# Input files of the Kaggle competition dataset.
TRAIN_CSV = '/kaggle/input/g-research-crypto-forecasting/train.csv'
# NOTE(review): not read by any cell visible in this notebook — confirm it is still needed.
ASSET_DETAILS_CSV = '/kaggle/input/g-research-crypto-forecasting/asset_details.csv'

# NOTE(review): no visible cell passes this seed to a model or RNG — confirm intent.
SEED = 2021

# When True (and TRAIN_FLAG is True), rows dated 2021-06-13 or later are split
# off into a hold-out frame instead of being used for training.
REMOVE_LB_TEST_OVERLAPPING_DATA = True
# When True, the notebook loads the training CSV, fits the binners and models,
# and scores them on the hold-out frame.
TRAIN_FLAG = True

In [3]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns backed by a plain NumPy signed-integer or floating dtype are
    touched. Everything else (bool, unsigned, datetime, object, categorical,
    pandas nullable dtypes, ...) is left exactly as it is, because casting
    those through the float path would change their meaning or raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass False when that precision
        loss is not acceptable; float32 then becomes the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    if use_float16:
        float_candidates = (np.float16, np.float32, np.float64)
    else:
        float_candidates = (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable Int64, string, ...) are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if np.issubdtype(col_type, np.signedinteger):
            candidates, limits_of = int_candidates, np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Candidates are ordered narrowest first; take the first one whose
        # range contains the column (bounds inclusive). An empty or all-NaN
        # column compares False everywhere and is therefore left unchanged,
        # as is a column containing +/-inf.
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                if np.dtype(candidate) != col_type:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
if TRAIN_FLAG:
    # Only timestamp, Asset_ID, High, Low, Close and Target are kept.
    df_train = pd.read_csv(TRAIN_CSV).drop(columns=["Count", "Open", "Volume", "VWAP"])
    #df_train = reduce_mem_usage(df_train)
    df_train = df_train.reset_index(drop=True)

if TRAIN_FLAG and REMOVE_LB_TEST_OVERLAPPING_DATA:
    df_train['datetime'] = pd.to_datetime(df_train['timestamp'], unit='s')

    # Rows from 2021-06-13 onwards become the hold-out frame; training keeps
    # the rows from 2020-01-01 up to (but excluding) that date.
    is_holdout = df_train['datetime'] >= '2021-06-13 00:00:00'
    is_before_holdout = df_train['datetime'] < '2021-06-13 00:00:00'
    is_recent = df_train['datetime'] >= '2020-01-01 00:00:00'

    df_test = df_train[is_holdout].reset_index(drop=True)
    df_train = df_train[is_before_holdout & is_recent].reset_index(drop=True)

    gc.collect()

In [5]:
def _rolling_by_asset(frame, column, window, how):
    """Per-asset rolling max/min of ``column``, indexed like ``frame``.

    The grouped rolling result carries the group key as an extra outer index
    level; dropping that level restores the original row labels, so the result
    can be assigned straight back to ``frame`` whatever the index is called.
    """
    rolled = frame.groupby("Asset_ID")[column].rolling(window)
    rolled = rolled.max() if how == "max" else rolled.min()
    return rolled.reset_index(level=0, drop=True)


def get_feature(tmp_df, TARGET=True):
    """Add per-asset return and high/low range features and return them.

    All windows and lags count rows within each ``Asset_ID`` group, not
    minutes; the rows of each asset are presumably expected in chronological
    order (not checked here).

    Side effect: the feature columns, plus the intermediate ``hh*`` / ``ll*``
    rolling columns, are added to ``tmp_df`` itself.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Needs the columns ``timestamp``, ``Asset_ID``, ``Close``, ``High``,
        ``Low`` and, when ``TARGET`` is True, ``Target``.
    TARGET : bool, default True
        Put the ``Target`` column first in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``[Target,] timestamp, Asset_ID`` followed by the 39 features
        (31 return features, then ``dip*``, then ``rip*``).
    """
    # (new column, source column, operation, periods), in creation order.
    #   "pct": per-asset percentage change of the source over `periods` rows
    #   "lag": per-asset shift of an already built column by `periods` rows
    # The column names are labels only: e.g. "ror5" is "ror4" lagged by one row.
    ror_specs = [
        ("ror1", "Close", "pct", 1),
        ("ror1_shift15", "ror1", "lag", 15),
        ("ror1_shift30", "ror1", "lag", 30),
        ("ror1_shift60", "ror1", "lag", 60),
        ("ror4", "Close", "pct", 4),
        ("ror5", "ror4", "lag", 1),
        ("ror10", "Close", "pct", 10),
        ("ror15", "ror10", "lag", 5),
        ("ror15_raw", "Close", "pct", 15),
        ("ror15_shift60", "ror15_raw", "lag", 60),
        ("ror15_shift240", "ror15_raw", "lag", 240),
        ("ror15_shift720", "ror15_raw", "lag", 720),
        ("ror15_shift1080", "ror15_raw", "lag", 1080),
        ("ror15_shift1440", "ror15_raw", "lag", 1440),
        ("ror45", "Close", "pct", 45),
        ("ror60", "ror45", "lag", 15),
        ("ror60_raw", "Close", "pct", 60),
        ("ror60_shift240", "ror60_raw", "lag", 240),
        ("ror60_shift720", "ror60_raw", "lag", 720),
        ("ror60_shift1080", "ror60_raw", "lag", 1080),
        ("ror60_shift1440", "ror60_raw", "lag", 1440),
        ("ror180", "Close", "pct", 180),
        ("ror240", "ror180", "lag", 60),
        ("ror240_raw", "Close", "pct", 240),
        ("ror240_shift720", "ror240_raw", "lag", 720),
        ("ror240_shift1080", "ror240_raw", "lag", 1080),
        ("ror240_shift1440", "ror240_raw", "lag", 1440),
        ("ror480", "Close", "pct", 480),
        ("ror720", "ror480", "lag", 240),
        ("ror720_raw", "Close", "pct", 720),
        ("ror1440", "ror720_raw", "lag", 720),
    ]
    windows = [15, 60, 240, 1440]

    for name, source, operation, periods in ror_specs:
        # Re-group on every step so columns added by earlier steps are visible.
        grouped = tmp_df.groupby("Asset_ID")[source]
        if operation == "pct":
            tmp_df[name] = grouped.pct_change(periods)
        else:
            tmp_df[name] = grouped.shift(periods)

    # Highest high / lowest low over the trailing window (current row included).
    for w in windows:
        tmp_df[f"hh{w}"] = _rolling_by_asset(tmp_df, "High", w, "max")
    for w in windows:
        tmp_df[f"ll{w}"] = _rolling_by_asset(tmp_df, "Low", w, "min")

    # Distance of the close below the window high ("dip") and above the
    # window low ("rip"), both as ratios minus one.
    for w in windows:
        tmp_df[f"dip{w}"] = tmp_df["Close"] / tmp_df[f"hh{w}"] - 1
    for w in windows:
        tmp_df[f"rip{w}"] = tmp_df["Close"] / tmp_df[f"ll{w}"] - 1

    x_feats = [name for name, _, _, _ in ror_specs]
    x_feats += [f"dip{w}" for w in windows]
    x_feats += [f"rip{w}" for w in windows]

    feats = ["Target"] if TARGET else []
    feats.extend(["timestamp", "Asset_ID"])
    feats.extend(x_feats)

    return tmp_df[feats]

In [6]:
def get_dataframe_demean(df, TARGET=True, TRAIN=True):
    """Build the wide, market-demeaned and binned model frame.

    Steps:
      1. compute the per-asset features with ``get_feature``;
      2. subtract, per timestamp, the equal-weighted mean of each feature
         across assets (the "market" term);
      3. bin every feature. With ``TRAIN=True`` rows containing NaN are
         dropped and a 10-bin ordinal ``KBinsDiscretizer`` is fitted per
         feature and stored in the module-level ``bins`` dict; with
         ``TRAIN=False`` infinities and NaN are replaced by 0 and the
         binners already stored in ``bins`` are applied;
      4. drop timestamps that do not have exactly 14 rows;
      5. pivot to one row per timestamp, every column suffixed with
         ``_<Asset_ID>`` (asset 1 first, then 0, 2, ..., 13);
      6. add time-of-day columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Long frame accepted by ``get_feature`` (which adds columns to it).
    TARGET : bool, default True
        Forwarded to ``get_feature``; keeps the ``Target`` column.
    TRAIN : bool, default True
        Fit the binners (True) or reuse the ones in ``bins`` (False).

    Returns
    -------
    pandas.DataFrame
        Wide frame with ``timestamp``, the suffixed per-asset columns,
        ``datetime``, ``hour``, ``minute``, ``isFunding``, ``is00``, ``is05``,
        ``is15`` and ``is30``.
    """
    n_assets = 14
    base_asset = 1  # Bitcoin: its rows start the wide frame

    # Features
    df = get_feature(df, TARGET)
    if TRAIN:
        gc.collect()

    # Everything get_feature returns besides these key columns is a feature,
    # so the feature list does not have to be repeated here.
    x_feats = [c for c in df.columns if c not in ("Target", "timestamp", "Asset_ID")]

    # Market term: plain (NaN-skipping) mean of each feature per timestamp.
    market = df.groupby("timestamp")[x_feats].mean()

    # Subtract the market term. Looking the mean up by timestamp one column
    # at a time avoids copying the whole frame in a merge for every feature.
    for f in x_feats:
        df[f] = df[f] - df["timestamp"].map(market[f])

    del market
    if TRAIN:
        gc.collect()

    # Binning
    if TRAIN:
        df = df.dropna()
        for feat in x_feats:
            bins[feat] = KBinsDiscretizer(n_bins=10, encode="ordinal")
            bins[feat].fit(df[[feat]])
            df[feat] = bins[feat].transform(df[[feat]])
            #pickle_dump(bins[feat], f"../input/gresearch/bins_model/bins_{feat}.pickle")
    else:
        # Note: this also replaces missing Target values with 0.
        df = df.replace([-np.inf, np.inf], np.nan).fillna(0)
        for feat in x_feats:
            df[feat] = bins[feat].transform(df[[feat]])

    if TRAIN:
        gc.collect()

    # Drop timestamps where the number of assets does not match
    rows_per_timestamp = df.groupby("timestamp")["Asset_ID"].transform("count")
    df = df[rows_per_timestamp == n_assets]

    if TRAIN:
        gc.collect()

    # Pivot: one block of suffixed columns per asset, joined on timestamp
    df = df.set_index("timestamp")

    def _asset_block(asset_id):
        block = df[df["Asset_ID"] == asset_id].drop(["Asset_ID"], axis=1)
        return block.add_suffix("_" + str(asset_id))

    wide = _asset_block(base_asset)
    for i in range(n_assets):
        if i == base_asset:
            continue
        wide = wide.join(_asset_block(i))

    df = wide.reset_index()

    if TRAIN:
        del wide
        gc.collect()

    # Time features
    # NOTE(review): fromtimestamp() converts to the machine's local time zone,
    # so hour/minute depend on where this runs; get_dataframe_demean_for_loop
    # uses the same call, and the two must stay in step.
    df["datetime"] = df["timestamp"].apply(datetime.datetime.fromtimestamp)
    df["hour"] = df["datetime"].dt.hour
    df["minute"] = df["datetime"].dt.minute

    minute = df["minute"]
    df["isFunding"] = df["hour"] % 4
    df["is00"] = (minute == 0).astype("int64")
    df["is05"] = minute.isin([5, 10, 20, 25, 35, 40, 50, 55]).astype("int64")
    df["is15"] = minute.isin([15, 45]).astype("int64")
    df["is30"] = (minute == 30).astype("int64")

    if TRAIN:
        gc.collect()
    return df

In [7]:
# Training
# `bins` is filled by get_dataframe_demean(TRAIN=True); `models` by the fitting cell.
bins = {}
models = {}

if TRAIN_FLAG:
    df_train = get_dataframe_demean(df_train, TARGET=True, TRAIN=True).drop(columns=["datetime"])
    #df_train = reduce_mem_usage(df_train)
    gc.collect()

In [8]:
# Target columns (one per asset) and model inputs: every remaining column
# except the raw time columns.
targets = [col for col in df_train.columns if "Target" in col]
x_feats = [
    col for col in df_train.columns
    if col not in ["timestamp", "datetime", "hour", "minute"] and col not in targets
]

if TRAIN_FLAG:
    # One regressor per asset, all trained on the same wide feature matrix.
    for i in tqdm(range(14)):
        models[i] = LGBMRegressor(
            max_depth=4,
            learning_rate=0.01,
            num_leaves=20,
            n_estimators=1000,
            n_jobs=-1,
            colsample_bytree=0.1,
        )
        models[i].fit(df_train[x_feats], df_train[f"Target_{i}"])
        #pickle_dump(models[i], f"../input/gresearch/main_model/model_{i}.pickle")
        gc.collect()

  0%|          | 0/14 [00:00<?, ?it/s]

In [9]:
if TRAIN_FLAG:
    # Transform the hold-out rows with the binners fitted on the training data.
    df_test = get_dataframe_demean(df_test, TARGET=True, TRAIN=False)
    gc.collect()

    # The inputs are the same for all 14 models, so select them once.
    holdout_inputs = df_test[x_feats]
    for i in tqdm(range(14)):
        df_test[f"pred_{i}"] = models[i].predict(holdout_inputs)

  0%|          | 0/14 [00:00<?, ?it/s]

In [10]:
if TRAIN_FLAG:
    # Correlation between prediction and target for each asset, followed by
    # the equal-weighted average over the 14 assets.
    sm = 0
    for i in range(14):
        asset_corr = df_test[f"pred_{i}"].corr(df_test[f"Target_{i}"])
        sm += asset_corr
        print(asset_corr)

    sm = sm/14
    print(f"EW_AVE: {sm}")

0.02985266274777802
0.03309403661095809
0.04477235921017148
0.04603483639849692
0.03333368205596368
0.04952550138354682
0.038096000512367986
0.0705357654175642
0.016830799951636485
0.04407021950411849
0.03440511092918131
0.03496123897145396
0.08849669777018336
0.05267744970550733
EW_AVE: 0.04404902579778058


In [11]:
def get_dataframe_demean_for_loop(tmp_df):
    """Compute the model inputs for the latest timestamp in ``tmp_df``.

    Single-timestamp counterpart of ``get_dataframe_demean`` for the
    submission loop: features are taken from the tail of each asset's price
    history, demeaned across assets, and binned with the fitted binners held
    in the module-level ``bins`` dict.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Recent history with the columns ``timestamp``, ``Asset_ID``,
        ``Close``, ``High`` and ``Low``. Each asset's rows are presumably in
        chronological order (not checked here).

    Returns
    -------
    dict
        ``"<feature>_<asset>"`` -> binned value for all 39 features and asset
        ids 0..13, plus ``isFunding``, ``is00``, ``is05``, ``is15``, ``is30``.
        A value is NaN when the asset has no row at the latest timestamp, its
        history is too short for the feature, or the raw value is not finite.
    """
    n_assets = 14

    # (feature, horizon, lag): close[-1 - lag] / close[-1 - lag - horizon] - 1.
    # The names are labels only: e.g. "ror5" is the 4-row return lagged by one row.
    ror_specs = [
        ("ror1", 1, 0), ("ror1_shift15", 1, 15), ("ror1_shift30", 1, 30),
        ("ror1_shift60", 1, 60), ("ror4", 4, 0), ("ror5", 4, 1),
        ("ror10", 10, 0), ("ror15", 10, 5), ("ror15_raw", 15, 0),
        ("ror15_shift60", 15, 60), ("ror15_shift240", 15, 240),
        ("ror15_shift720", 15, 720), ("ror15_shift1080", 15, 1080),
        ("ror15_shift1440", 15, 1440), ("ror45", 45, 0),
        ("ror60", 45, 15), ("ror60_raw", 60, 0), ("ror60_shift240", 60, 240),
        ("ror60_shift720", 60, 720), ("ror60_shift1080", 60, 1080),
        ("ror60_shift1440", 60, 1440),
        ("ror180", 180, 0), ("ror240", 180, 60), ("ror240_raw", 240, 0),
        ("ror240_shift720", 240, 720), ("ror240_shift1080", 240, 1080),
        ("ror240_shift1440", 240, 1440),
        ("ror480", 480, 0), ("ror720", 480, 240), ("ror720_raw", 720, 0),
        ("ror1440", 720, 720),
    ]
    windows = [15, 60, 240, 1440]
    feats = ([name for name, _, _ in ror_specs]
             + [f"dip{w}" for w in windows]
             + [f"rip{w}" for w in windows])

    latest_ts = tmp_df["timestamp"].max()
    assets = set(tmp_df.loc[tmp_df["timestamp"] == latest_ts, "Asset_ID"])
    x_dict = {}

    # Build the raw features
    for i in range(n_assets):
        if i not in assets:
            for f in feats:
                x_dict[f"{f}_{i}"] = np.nan
            continue

        asset_rows = tmp_df.loc[tmp_df["Asset_ID"] == i, ["Close", "High", "Low"]]
        close = asset_rows["Close"].to_numpy(dtype=float)
        high = asset_rows["High"].to_numpy(dtype=float)
        low = asset_rows["Low"].to_numpy(dtype=float)

        # A zero price yields inf/NaN here; such values are discarded below.
        with np.errstate(divide="ignore", invalid="ignore"):
            for name, horizon, lag in ror_specs:
                if len(close) >= 1 + horizon + lag:
                    x_dict[f"{name}_{i}"] = close[-1 - lag] / close[-1 - horizon - lag] - 1
                else:
                    x_dict[f"{name}_{i}"] = np.nan
            for w in windows:
                if len(high) >= w:
                    x_dict[f"dip{w}_{i}"] = close[-1] / high[-w:].max() - 1
                    x_dict[f"rip{w}_{i}"] = close[-1] / low[-w:].min() - 1
                else:
                    x_dict[f"dip{w}_{i}"] = np.nan
                    x_dict[f"rip{w}_{i}"] = np.nan

    # Subtract the market term and bin, one feature at a time
    for f in feats:
        keys = [f"{f}_{i}" for i in range(n_assets)]
        values = np.array([x_dict[k] for k in keys], dtype=float)

        # +/-inf would turn the market mean into inf/NaN for every asset and
        # make the binner raise, so non-finite values count as missing.
        values[~np.isfinite(values)] = np.nan
        valid = ~np.isnan(values)

        if valid.any():
            values[valid] -= values[valid].mean()
            # One transform call for all assets instead of one per value.
            values[valid] = bins[f].transform(values[valid].reshape(-1, 1))[:, 0]

        for k, v in zip(keys, values):
            x_dict[k] = float(v)

    # Time features
    # NOTE(review): fromtimestamp() uses the machine's local time zone; this
    # matches get_dataframe_demean and the two must stay in step.
    dt = datetime.datetime.fromtimestamp(latest_ts)
    minute = dt.minute

    x_dict["isFunding"] = dt.hour % 4
    x_dict["is00"] = int(minute == 0)
    x_dict["is05"] = int(minute in (5, 10, 20, 25, 35, 40, 50, 55))
    x_dict["is15"] = int(minute in (15, 45))
    x_dict["is30"] = int(minute == 30)
    return x_dict

In [12]:
# check train and loop
#targets = [f for f in df_train.columns if "Target" in f]
#x_feats = [f for f in df_train.columns if f not in ["timestamp", "datetime", "hour", "minute"]]
#x_feats = [f for f in x_feats if f not in targets]
#
#if TRAIN_FLAG:
#    df_valid = pd.read_csv(TRAIN_CSV)
#    df_valid = df_valid.drop(["Count", "Open", "Volume", "VWAP"], axis=1)
#    df_valid['datetime'] = pd.to_datetime(df_valid['timestamp'], unit='s')
#    df_valid = df_valid[df_valid['datetime'] < '2021-06-13 00:00:00']
#    df_valid = df_valid[df_valid['datetime'] >= '2020-01-01 00:00:00']
#    df_valid.reset_index(inplace=True, drop=True)
#    
#    gc.collect()
#
#def compareResult(timestamp=1597670760):
#    tmp_index = df_valid[df_valid["timestamp"]==timestamp].index.max()
#    tmp_df = df_valid.iloc[(tmp_index-14*1440-14*240-2000):(tmp_index+1)]
#    tmp_dict = get_dataframe_demean_for_loop(tmp_df)
#    res = pd.DataFrame({"train":np.array(df_train.loc[df_train["timestamp"]==timestamp, x_feats]).reshape(-1), "loop":pd.Series(tmp_dict)[x_feats]})
#    return res
#
#res = compareResult(timestamp=1597670760)
#res[res["train"]!=res["loop"]]

In [13]:
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

# Number of most recent rows kept between iterations. The longest feature
# lookback is 1 + 240 + 1440 rows per asset, for 14 assets, plus spare rows.
HISTORY_ROWS = 14*1440 + 14*240 + 2000

history = pd.DataFrame()

for i, (df_test, df_pred) in enumerate(iter_test):
    history = pd.concat([history, df_test])
    tmp = history.reset_index(drop=True)
    tmp_dict = get_dataframe_demean_for_loop(tmp)

    # The wide feature vector describes the whole market at this timestamp,
    # so it is identical for every row of the batch: build it once.
    features = np.array(pd.Series(tmp_dict)[x_feats]).reshape(1, -1)

    for j, row in df_test.iterrows():
        y_pred = models[row['Asset_ID']].predict(features)[0]
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred

    # Keep only the most recent rows needed by the features.
    history = history.sort_values("timestamp")
    history = history.iloc[-HISTORY_ROWS:]

    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
