Taken from https://www.kaggle.com/code/sugghi/training-3rd-place-solution

# Setup

In [1]:
%reload_ext autoreload
%autoreload 2

In [13]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import time
import datetime
from alpaca.data.historical import CryptoHistoricalDataClient
from alpaca.data.requests import CryptoBarsRequest
from alpaca.data.timeframe import TimeFrame
from datetime import datetime
import config

from feature_engineering import *
from training import *

import pickle
import gc

from tqdm import tqdm

n_fold = 7
seed0 = 8586
use_supple_for_train = True

# If True, the period used to evaluate Public LB will not be used for training.
# Set to False on final submission.
not_use_overlap_to_train = False

pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 350)

lags = [60,300,900]

In [3]:
params = {
    'early_stopping_rounds': 50,
    'objective': 'regression',
    'metric': 'rmse',
#     'metric': 'None',
    'boosting_type': 'gbdt',
    'max_depth': 5,
    'verbose': -1,
    'max_bin':600,
    'min_data_in_leaf':50,
    'learning_rate': 0.03,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 1,
    'lambda_l1': 0.5,
    'lambda_l2': 2,
    'seed':seed0,
    'feature_fraction_seed': seed0,
    'bagging_fraction_seed': seed0,
    'drop_seed': seed0,
    'data_random_seed': seed0,
    'extra_trees': True,
    'extra_seed': seed0,
    'zero_as_missing': True,
    "first_metric_only": True
         }

# Data Processing

In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
# no keys required for crypto data
client = CryptoHistoricalDataClient()

symbol = "BTC/USD"
request_params = CryptoBarsRequest(
    # symbol_or_symbols=["BTC/USD", "ETH/USD"],
    symbol_or_symbols=symbol,
    timeframe=TimeFrame.Hour,
    start=datetime.strptime("2023-03-01", '%Y-%m-%d')
)

bars = client.get_crypto_bars(request_params)

In [6]:
# convert to dataframe
btc = bars.df.loc[symbol]
btc = btc[['close']].copy()
btc.fillna(method='ffill', limit=100)
btc = reduce_mem_usage(btc)

df_train, df_val, df_test = (
    btc.loc[:config.TRAIN_END_DATE].copy(), 
    btc.loc[config.VAL_START_DATE:config.VAL_END_DATE].copy(),
    btc.loc[config.TEST_START_DATE:].copy()
)

Memory usage of dataframe is 0.10 MB
Memory usage after optimization is: 0.06 MB
Decreased by 37.5%


# Feature Engineering

In [7]:
feat = get_features(df_train)
feat

Unnamed: 0_level_0,close,log_close/mean_60,log_return_60,mean_close/mean_60,mean_log_returns_60,log_close/mean_60-mean_close/mean_60,log_return_60-mean_log_returns_60,log_close/mean_300,log_return_300,mean_close/mean_300,mean_log_returns_300,log_close/mean_300-mean_close/mean_300,log_return_300-mean_log_returns_300,log_close/mean_900,log_return_900,mean_close/mean_900,mean_log_returns_900,log_close/mean_900-mean_close/mean_900,log_return_900-mean_log_returns_900
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2023-03-01 00:00:00+00:00,23104.0,10.047761,-0.119141,0.135274,-6.556511e-07,9.912487,-0.119141,10.047761,-0.114197,0.683295,-5.960464e-07,9.364466,-0.114197,10.047761,-0.231323,2.066016,2.384186e-07,7.981745,-0.231323
2023-03-01 01:00:00+00:00,23248.0,10.053974,-0.113098,0.135274,-6.556511e-07,9.918700,-0.113098,10.053974,-0.108704,0.683295,-5.960464e-07,9.370679,-0.108704,10.053974,-0.230713,2.066016,2.384186e-07,7.987959,-0.230713
2023-03-01 02:00:00+00:00,23328.0,10.057410,-0.164673,0.135274,-6.556511e-07,9.922136,-0.164673,10.057410,-0.107117,0.683295,-5.960464e-07,9.374114,-0.107117,10.057410,-0.223999,2.066016,2.384186e-07,7.991394,-0.223999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-31 21:00:00+00:00,26016.0,-0.043092,0.001951,0.135274,-6.556511e-07,-0.178367,0.001952,-0.010689,0.002926,0.683295,-5.960464e-07,-0.693984,0.002926,-0.078528,-0.114746,2.066016,2.384186e-07,-2.144544,-0.114746
2023-08-31 22:00:00+00:00,26032.0,-0.042517,0.002926,0.135274,-6.556511e-07,-0.177791,0.002926,-0.010090,0.004871,0.683295,-5.960464e-07,-0.693386,0.004871,-0.077790,-0.113647,2.066016,2.384186e-07,-2.143805,-0.113647
2023-08-31 23:00:00+00:00,25920.0,-0.046799,-0.001955,0.135274,-6.556511e-07,-0.182073,-0.001955,-0.014404,0.000976,0.683295,-5.960464e-07,-0.697699,0.000977,-0.081974,-0.116943,2.066016,2.384186e-07,-2.147990,-0.116943


In [10]:
features = feat.columns.tolist()
# TODO - he has 174 total... but 19 x 14 = 266
len(features)

19

# Training

In [None]:
oof = [ [] for id in range(14)   ]

all_oof_pred = []
all_oof_valid = []
all_oof_weight = []


print(f"Training model for BTC")

oof_pred, oof_valid = get_Xy_and_model_for_asset(feat, asset_id)

weight_temp = float( df_asset_details.loc[  df_asset_details['Asset_ID'] == asset_id  , 'Weight'   ]  )

all_oof_pred += oof_pred
all_oof_valid += oof_valid
all_oof_weight += [weight_temp] * len(oof_pred)

oof[asset_id] = corr_score(     np.array(oof_pred)   ,    np.array(oof_valid)    )

print(f'OOF corr score of {asset_name} (ID={asset_id}) is {oof[asset_id]:.5f}. (Weight: {float(weight_temp):.5f})')
print('')
print('')