In [1]:
%load_ext autoreload
%autoreload

In [2]:
import sys
sys.path.append("../")

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from datetime import datetime
import matplotlib.pyplot as plt
from src.evaluation import *
from src.utils import load_fold, assets, non_train_cols, load_gresearch_raw
from src.features import engineer_all_features
from src.models import BaseCryptoLearner
plt.rcParams["figure.figsize"] = (16,9)
from tqdm import tqdm

In [5]:
from scipy import stats
import shap
import lightgbm as lgb

In [6]:
# Raw training table, read straight from the CSV on disk.
train_csv_path = '../data/gresearch/raw/train.csv'
train = pd.read_csv(train_csv_path)

In [47]:
# Datetime view of the epoch-second `timestamp` column, used for the date filters below.
train['timestamp_1'] = pd.to_datetime(train['timestamp'], unit='s')

window_start = pd.to_datetime('2021-06-11 00:00:00')
window_end = pd.to_datetime('2021-06-13 00:00:00')

# `data`: rows in (window_start, window_end]; `full_test_df`: everything after window_end.
in_window = (train['timestamp_1'] > window_start) & (train['timestamp_1'] <= window_end)
data = train[in_window].copy()
full_test_df = train[train['timestamp_1'] > window_end].copy()

# Get test timestamps -> This is what we loop over (one entry every 60 seconds, end inclusive)
first_test_ts = full_test_df['timestamp'].min()
last_test_ts = full_test_df['timestamp'].max()
test_timestamps = np.arange(first_test_ts, last_test_ts + 60, 60)

In [48]:
# Sequential row identifier: 0 .. n_rows - 1, in the frame's current row order.
full_test_df['row_id'] = np.arange(len(full_test_df))

In [49]:
# Placeholder target for the mock test set. Use a float so that the float
# predictions written later do not have to upcast an integer column.
full_test_df['Target'] = 0.0

In [8]:
# Asset details table, read from the raw data directory.
asset_details_path = '../data/gresearch/raw/asset_details.csv'
asset_info = pd.read_csv(asset_details_path)

In [9]:
# Load the saved LightGBM booster from disk (the same file the "Fitting"
# section of this notebook writes with bst.save_model).
baseline_model_path = '../models/lgbmBaseline.txt'
bst = lgb.Booster(model_file=baseline_model_path)

# Fast Feature Calculation 

- Mock the submission by looping, one timestamp at a time, through the data from the submission period

In [10]:
# OHLCV Features
def fast_ohlcv_features(df, out_features):
    """Write the raw OHLCV values and three derived ratios into slots 0-9.

    Parameters
    ----------
    df : row-like object
        Must support both ``df['Close']`` and ``df.Close`` style access for
        Count, Open, High, Low, Close, Volume and VWAP.
    out_features : indexable, mutable sequence
        Output buffer; positions 0 through 9 are overwritten in place.

    Returns
    -------
    The same ``out_features`` object that was passed in.
    """
    # Slots 0-6: raw columns, copied in this fixed order.
    raw_columns = ('Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP')
    for slot, column in enumerate(raw_columns):
        out_features[slot] = df[column]

    open_price = df.Open
    close_price = df.Close

    # Slot 7: simple open-to-close return; slot 8: its log counterpart.
    out_features[7] = (close_price - open_price) / open_price
    out_features[8] = np.log(close_price / open_price)
    # Slot 9: high / low ratio.
    out_features[9] = df.High / df.Low

    return out_features

# Relative change features
def fast_relative_features(feature_array, columns, period, out_features):
    """Append log-ratio features to the first free (NaN) slots of ``out_features``.

    For every column index ``j`` in ``columns`` the value
    ``log(feature_array[0, j] / feature_array[-period, j])`` is stored at the
    first position of ``out_features`` that is still NaN.

    Parameters
    ----------
    feature_array : 2-D array, indexed as ``[row, column]``
    columns : iterable of int
        Column indices of ``feature_array`` to process, in output order.
    period : int
        The denominator is taken from row ``-period``.
    out_features : 1-D float array
        Output buffer, modified in place; NaN marks a free slot.

    Returns
    -------
    The same ``out_features`` object that was passed in.

    Raises
    ------
    ValueError
        If ``out_features`` has no NaN slot left.
    """
    for j in columns:
        # NOTE(review): the numerator is row 0 of the buffer. If rows are ordered
        # oldest -> newest, that is the oldest row, not the current one - confirm.
        value = np.log(feature_array[0, j] /
                       feature_array[-period, j])

        # Look for the free slot in the buffer we were given, not in a
        # notebook-level global that merely happens to be the same array.
        free_slots = np.flatnonzero(np.isnan(out_features))
        if free_slots.size == 0:
            raise ValueError("out_features has no empty (NaN) slot left")

        # NOTE(review): a NaN `value` leaves this slot looking free, so the next
        # write lands on the same position and later features shift.
        out_features[free_slots[0]] = value

    return out_features

def fast_lagged_features(feature_array, columns, period, out_features):
    """Append lagged values to the first free (NaN) slots of ``out_features``.

    For every column index ``j`` in ``columns`` the value
    ``feature_array[-period, j]`` is stored at the first position of
    ``out_features`` that is still NaN.

    Parameters
    ----------
    feature_array : 2-D array, indexed as ``[row, column]``
    columns : iterable of int
        Column indices of ``feature_array`` to copy, in output order.
    period : int
        The value is taken from row ``-period``.
    out_features : 1-D float array
        Output buffer, modified in place; NaN marks a free slot.

    Returns
    -------
    The same ``out_features`` object that was passed in.

    Raises
    ------
    ValueError
        If ``out_features`` has no NaN slot left.
    """
    for j in columns:
        value = feature_array[-period, j]

        # Look for the free slot in the buffer we were given, not in a
        # notebook-level global that merely happens to be the same array.
        free_slots = np.flatnonzero(np.isnan(out_features))
        if free_slots.size == 0:
            raise ValueError("out_features has no empty (NaN) slot left")

        # NOTE(review): a NaN `value` leaves this slot looking free, so the next
        # write lands on the same position and later features shift.
        out_features[free_slots[0]] = value

    return out_features

In [11]:
# From engineer_all_features
# Raw columns for which log-change ("relative") features are computed.
relative_cols = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']

# Columns that get lagged copies: the three per-bar ratios followed by the
# 1-minute log change of every raw column, in the same order as relative_cols.
lagged_cols = ['direct_return', 'log_return', 'high_low_ratio']
lagged_cols += ['log_change_{}_1min'.format(col) for col in relative_cols]

# Look-back periods used for each feature family.
relative_periods = [1, 60]
lagged_periods = [1, 2, 3, 4, 5]

# Feature names stored in the loaded booster.
feature_names = bst.feature_name()

In [69]:
%%time

# Constants
max_lag = 5
max_relative = 60
max_lookback_minutes = max_relative + max_lag

# Keep only the last `max_lookback_minutes` minutes of `data`
# (timestamps are in seconds, hence the * 60).
max_lookback = data.timestamp.max() - (max_lookback_minutes * 60)
current_data = data[data.timestamp>=max_lookback]
print(current_data.shape)

# For handling missing timestamps: every timestamp present in the window, sorted
timestamp_series = pd.DataFrame(index=np.sort(current_data.timestamp.unique()))



# Initialize the feature array using pandas
features = []
for asset_id in assets:
    asset = current_data[current_data.Asset_ID == asset_id]
    if not np.all(timestamp_series.index.isin(asset.timestamp)):
        # Asset is missing some timestamps: align it to the full timestamp grid
        # and forward-fill the gaps from the previous row. `.ffill()` replaces
        # the deprecated `fillna(method='ffill')` with identical behavior.
        asset = timestamp_series.merge(asset, left_index=True, right_on='timestamp', how='left').ffill()
    # Drop the first two columns of the engineered frame and keep the rest as features.
    asset_features = engineer_all_features(asset).iloc[:,2:]
    features.append(asset_features.values)
# Stack the per-asset arrays; axis 0 follows the order of `assets`.
features = np.array(features)
# Remembered so the prediction loop can assert the buffer never changes shape.
original_shape = features.shape

(923, 11)
CPU times: user 340 ms, sys: 8.74 ms, total: 349 ms
Wall time: 356 ms


TODO: Keep track of the timestamps and backfill

In [71]:
# Empty initialization period: replace the warm-started buffer with an all-NaN
# buffer of the same shape. Reusing `original_shape` instead of hard-coded
# dimensions keeps it in sync with the shape assertion in the prediction loop.
features = np.full(original_shape, np.nan)

In [None]:
submissions = []

# The source-column positions are the same for every row, so resolve them once
# instead of once per asset per timestamp.
relative_col_idx = np.where(np.isin(feature_names, relative_cols))[0]
lagged_col_idx = np.where(np.isin(feature_names, lagged_cols))[0]

# Positional row indices for every timestamp, built in a single pass, so the
# loop does not rescan the whole test frame on each iteration.
rows_by_timestamp = full_test_df.groupby('timestamp').indices
no_rows = np.array([], dtype=np.intp)

# Outer loop over timestamps
for ts in tqdm(test_timestamps):

    test_df = full_test_df.iloc[rows_by_timestamp.get(ts, no_rows)]
    # Independent float copy: predictions are written into it below, which must
    # neither touch full_test_df nor upcast an integer column.
    prediction_df = test_df[['row_id', 'Target']].astype({'Target': 'float64'})

    new_features = []

    # Axis 0 of `features` follows the order of `assets`, so index it by position.
    for asset_pos, asset_id in enumerate(assets):

        asset_features = features[asset_pos]

        # Check if asset in test_df, if not forward fill
        if asset_id not in test_df.Asset_ID.values:
            new_features.append(asset_features[-1].reshape(1,-1))
            continue

        asset = test_df[test_df.Asset_ID == asset_id].iloc[0]

        row_id = asset['row_id']

        # Fresh all-NaN feature row; the helpers fill NaN slots from the left.
        current_features = np.repeat(np.nan, repeats=features.shape[2])

        current_features = fast_ohlcv_features(asset, current_features)

        # Relative change features
        for period in relative_periods:
            current_features = fast_relative_features(asset_features,
                                                      columns=relative_col_idx,
                                                      period=period,
                                                      out_features=current_features)

        # Lagged features
        for period in lagged_periods:
            current_features = fast_lagged_features(asset_features,
                                                    lagged_col_idx,
                                                    period=period,
                                                    out_features=current_features)
        # Predict
        current_features = np.array(current_features).reshape(1,-1)

        pred = bst.predict(current_features)[0]
        prediction_df.loc[prediction_df.row_id == row_id, 'Target'] = pred

        # Append data
        new_features.append(current_features)

    new_features = np.array(new_features)

    # Drop the oldest time step and append the newest one
    features = np.concatenate((features[:,1:,:], new_features), axis=1)

    assert features.shape == original_shape, (features.shape, original_shape)

    submissions.append(prediction_df)
    #env.predict(prediction_df.dropna(axis=0, subset=['row_id']))
    

  0%|                                                                                                                                                               | 46/144000 [00:02<2:06:28, 18.97it/s]

In [62]:
# Stack the per-timestamp prediction frames row-wise into one submission table.
submission_df = pd.concat(submissions, axis=0)

In [67]:
# Sanity check: True if any row_id is missing.
submission_df['row_id'].isna().any()

False

In [64]:
# Sanity check: True if any prediction is missing.
submission_df['Target'].isna().any()

False

# Fitting

In [None]:
# Locations of the processed train / test splits for fold 6
train_dir, test_dir = (
    '../data/gresearch/processed/fold_6/train/',
    '../data/gresearch/processed/fold_6/test/',
)

In [None]:
# Read both splits of the fold with the project's loader (train first, then test).
train_df, test_df = [load_fold(fold_dir) for fold_dir in (train_dir, test_dir)]

In [None]:
# Every column from the fifth onward is used as a model input.
feature_cols = np.array(list(train_df.columns[4:]))
# Kept as a one-element list, so train_df[target_col] is a single-column frame.
target_col = ['Target']

In [None]:
# Wrap the training features and target in a LightGBM dataset.
train_features = train_df[feature_cols]
train_labels = train_df[target_col]
lgb_train_data = lgb.Dataset(train_features, label=train_labels)

In [None]:
# Baseline: plain regression objective, MAE reported as the metric.
param = {
    'objective': 'regression',
    'metric': 'mae',
}
# Number of boosting rounds.
num_round = 20

bst = lgb.train(params=param, train_set=lgb_train_data, num_boost_round=num_round)

In [None]:
# Save model (writes the same file that is loaded with lgb.Booster earlier in the notebook)
baseline_model_file = '../models/lgbmBaseline.txt'
bst.save_model(baseline_model_file)

In [None]:
# NOTE(review): this cell repeats the feature/target column definition from a
# few cells earlier in this section; one of the two copies can be removed.
feature_cols = np.array(list(train_df.columns[4:]))
target_col = ['Target']

In [None]:
# Display the feature column names that will be fed to the model.
feature_cols

In [None]:
# NOTE(review): this cell repeats the dataset construction from a few cells
# earlier in this section; one of the two copies can be removed.
train_features = train_df[feature_cols]
train_labels = train_df[target_col]
lgb_train_data = lgb.Dataset(train_features, label=train_labels)

In [None]:
# NOTE(review): this cell repeats the training call from a few cells earlier
# in this section (same parameters, same number of rounds).
param = {
    'objective': 'regression',
    'metric': 'mae',
}
num_round = 20

bst = lgb.train(params=param, train_set=lgb_train_data, num_boost_round=num_round)

In [None]:
# Save model
# NOTE(review): this cell repeats the save from a few cells earlier (same path).
baseline_model_file = '../models/lgbmBaseline.txt'
bst.save_model(baseline_model_file)

In [None]:
# In sample: predict on the training fold itself.
pred = train_df[['Asset_ID', 'timestamp']].copy()
pred['prediction'] = bst.predict(train_df[feature_cols])

# Join predictions back onto target and weight by (timestamp, Asset_ID);
# anything left missing after the left join is replaced by 0.
truth = train_df[['Target', 'Weight', 'timestamp', 'Asset_ID']]
test = truth.merge(pred, on=['timestamp', 'Asset_ID'], how='left').fillna(0)

score = corr_score(test.Target, test.prediction, test.Weight)
print(score)

In [None]:
# Target vs. prediction. x and y are passed by keyword: recent seaborn releases
# no longer accept them positionally.
sns.scatterplot(x=test.Target, y=test.prediction)

In [None]:
# Out of sample: predict on the held-out fold.
pred = test_df[['Asset_ID', 'timestamp']].copy()
pred['prediction'] = bst.predict(test_df[feature_cols])

# Join predictions back onto target and weight by (timestamp, Asset_ID);
# anything left missing after the left join is replaced by 0.
truth = test_df[['Target', 'Weight', 'timestamp', 'Asset_ID']]
test = truth.merge(pred, on=['timestamp', 'Asset_ID'], how='left').fillna(0)

score = corr_score(test.Target, test.prediction, test.Weight)
print(score)

In [None]:
# Unweighted Pearson correlation between target and prediction.
stats.pearsonr(test['Target'], test['prediction'])

In [None]:
# Target vs. prediction. x and y are passed by keyword: recent seaborn releases
# no longer accept them positionally.
sns.scatterplot(x=test.Target, y=test.prediction)

In [None]:
# 2x2 correlation matrix of target and prediction.
np.corrcoef(test['Target'], test['prediction'])

In [None]:
# SHAP values of the booster on the training features.
tree_explainer = shap.TreeExplainer(bst)
shap_values = tree_explainer.shap_values(train_df[feature_cols])

In [None]:
# Summary plot of the SHAP values against the training features.
shap.summary_plot(shap_values, features=train_df[feature_cols])

In [None]:
# 60-minute log change of High against the target, on the training fold.
sns.scatterplot(data=train_df, x='log_change_High_60min', y='Target')

In [None]:
# Distribution of the training target.
sns.histplot(train_df['Target'])