In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (here the
# local `src` package) are reloaded before each cell runs and edits to them take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the parent directory to the import path; presumably this is what lets the
# `from src...` imports below resolve. The path is relative, so it depends on the
# kernel's working directory — TODO confirm it is this notebook's folder.
sys.path.append("../")

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from datetime import datetime
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import stats
import shap
import lightgbm as lgb

from src.utils import load_fold, load_gresearch_raw
from src.evaluation import corr_score
from src.settings import *
from src.features import engineer_all_features
from src.models import CryptoDART

# Default size (width, height) for every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (16,9)

# Fitting Pipeline LGBM

In [4]:
# Asset metadata table from the raw G-Research data directory.
# NOTE(review): `asset_info` is not referenced by any later visible cell — confirm
# it is still needed.
asset_info = pd.read_csv('../data/gresearch/raw/asset_details.csv')

In [5]:
# The processed folds carry two copies of each candle column, suffixed `_x` and
# `_y` (presumably left over from a pandas merge — confirm). The `_y` copy is kept
# and renamed to the bare column name; the `_x` copy is dropped.
rename = ['Count_y', 'Open_y', 'High_y', 'Low_y', 'Close_y', 'Volume_y', 'VWAP_y']

# Bare column names: everything before the suffix separator.
new_names = [suffixed.rpartition('_')[0] for suffixed in rename]

# Columns to discard: the `_x` twin of every kept column.
drop_cols = [f'{base}_x' for base in new_names]

# Mapping handed to `DataFrame.rename(columns=...)`: `<name>_y` -> `<name>`.
rename_dict = {suffixed: base for suffixed, base in zip(rename, new_names)}

In [24]:
# --- Run configuration -------------------------------------------------------
# Index of the cross-validation fold this run works on.
fold = 8

# Name passed to `save_models` together with the directory below.
base_name = 'dart_base'

# Output locations, both keyed by the fold index.
model_save_dir = f'../models/fold_{fold}/'
result_save_path = f'../experiments/dart_base/results_fold_{fold}.csv'

In [6]:
# Directory of the fold selected by `fold` in the configuration cell. Deriving it
# from `fold` (instead of a hard-coded `fold_8`) keeps the data that is loaded in
# step with `model_save_dir` and `result_save_path`, which are also keyed by `fold`.
fold_data_dir = f'../data/gresearch/processed/fold_{fold}'

# Both splits get the same cleaning: drop the `_x` columns, strip the `_y` suffix
# from the kept columns, then discard every row that still contains a NaN.
train = (
    load_fold(f'{fold_data_dir}/train/')
    .drop(columns=drop_cols)
    .rename(columns=rename_dict)
    .dropna()
)
test = (
    load_fold(f'{fold_data_dir}/test/')
    .drop(columns=drop_cols)
    .rename(columns=rename_dict)
    .dropna()
)

In [7]:
# Name of the column the models are fitted against.
target = 'Target'

# Model inputs: every column of `train`, in its original order, that is not listed
# in `non_train_cols`.
# NOTE(review): `non_train_cols` is not defined in this notebook; presumably it is
# provided by `from src.settings import *` and includes the target — confirm.
features = [
    column_name
    for column_name in train.columns
    if column_name not in non_train_cols
]

In [None]:
# LightGBM settings for a DART booster trained with an MAE objective.
# NOTE(review): no visible cell reads `params` — the model below is built with
# `dart_base_params` — so this dict looks like a leftover copy; confirm and either
# pass it to the model or delete the cell.
params = dict(
    objective='mae',
    boosting='dart',
    # The training log reports that `num_iterations` found in the params is used
    # instead of the corresponding function argument.
    num_iterations=200,
    learning_rate=0.1,
    # A tree with 20 leaves can be at most 19 levels deep, so max_depth=40 below
    # never becomes the binding limit.
    num_leaves=20,
    tree_learner='feature',
    num_threads=2,
    max_depth=40,
    min_data_in_leaf=40,
    feature_fraction=0.8,
    # L1 / L2 regularisation strengths.
    lambda_l1=0.01,
    lambda_l2=0.01,
    # DART dropout settings.
    drop_rate=0.15,
    skip_drop=0.5,
)

In [9]:

# Build the CryptoDART wrapper.
# NOTE(review): `assets`, `weights`, `names` and `dart_base_params` are not defined
# in this notebook; presumably they come from `from src.settings import *` — confirm.
cryptoDart = CryptoDART(
    assets=assets,
    weights=weights,
    names=names,
    params=dart_base_params,
)

# Data preparation only (the log prints "Making data for 0" ... "Making data for 13").
# Training happens once, in the next cell: calling `train()` here as well made a
# fresh Restart & Run All fit every model twice.
cryptoDart.make_data(features=features, target=target, test=test, train=train)

Making data for 0
Making data for 1
Making data for 2
Making data for 3
Making data for 4
Making data for 5
Making data for 6
Making data for 7
Making data for 8
Making data for 9
Making data for 10
Making data for 11
Making data for 12
Making data for 13
Data creation done
------------



In [10]:
# Fit the models on the data prepared by `make_data`; the log below prints one
# "Training <n>" section, followed by LightGBM's own info lines, per model.
cryptoDart.train()

Training 0


Found `num_iterations` in params. Will use it instead of argument


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 119850
[LightGBM] [Info] Number of data points in the train set: 128880, number of used features: 470
[LightGBM] [Info] Start training from score -0.000065
------------

Training 1
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 119850
[LightGBM] [Info] Number of data points in the train set: 128880, number of used features: 470
[LightGBM] [Info] Start training from score 0.000001
------------

Training 2
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 119850
[LightGBM] [Info] Number of data points in the train set: 128880, number of used features: 470
[LightGBM] [Info] Start training from score -0.000069
------------

Training 3
You c

In [11]:
# Run the test evaluation (prints "Running test"); presumably this is what
# populates `cryptoDart.test_score`, which is reported further down — confirm.
cryptoDart.run_full_test()

Running test


In [22]:
# Save the models using the fold-specific directory and `base_name` from the
# configuration cell (presumably `base_name` is a file-name prefix — confirm).
cryptoDart.save_models(model_save_dir, base_name)

In [27]:
# Save the test results to the fold-specific CSV path from the configuration cell.
cryptoDart.save_test_results(result_save_path)

In [13]:
# Report the fold index together with the model's `test_score` attribute.
print(f'Done with fold {fold}, test competition score {cryptoDart.test_score}')

Done with fold 8, test competition score 0.026152597060158517


# Evaluation (in sample)

In [None]:
# Predict on the `btc_train` rows using the selected feature columns.
# NOTE(review): `dart` and `btc_train` are not defined in any visible cell and this
# cell has no execution count; unless the star import supplies them, it raises
# NameError on Restart & Run All — TODO define them or remove this section.
prediction = dart.predict(btc_train[features])

In [None]:
# Pearson correlation (with its p-value) between the predictions and the `Target`
# column of `btc_train`.
# NOTE(review): `btc_train` is not defined in any visible cell — TODO confirm.
stats.pearsonr(prediction, btc_train.Target.values)

In [None]:
# Predicted vs. actual target on `btc_train`. Both inputs are plain arrays, so
# seaborn cannot infer axis labels; they are set explicitly so the figure can be
# read on its own. The trailing `;` hides the return-value repr in the cell output.
# NOTE(review): `btc_train` is not defined in any visible cell — TODO confirm.
sns.scatterplot(x=btc_train.Target.values, y=prediction).set(
    xlabel='Target',
    ylabel='Prediction',
    title='Train: prediction vs. target',
);

# Out of sample

In [None]:
# Predict on the `btc_test` rows. This rebinds `prediction`, overwriting the
# predictions computed for `btc_train` in the previous section.
# NOTE(review): `dart` and `btc_test` are not defined in any visible cell and this
# cell has no execution count — TODO define them or remove this section.
prediction = dart.predict(btc_test[features])

In [None]:
# Pearson correlation (with its p-value) between the predictions and the `Target`
# column of `btc_test`.
# NOTE(review): `btc_test` is not defined in any visible cell — TODO confirm.
stats.pearsonr(prediction, btc_test.Target.values)

In [None]:
# Predicted vs. actual target on `btc_test`. Both inputs are plain arrays, so
# seaborn cannot infer axis labels; they are set explicitly so the figure can be
# read on its own. The trailing `;` hides the return-value repr in the cell output.
# NOTE(review): `btc_test` is not defined in any visible cell — TODO confirm.
sns.scatterplot(x=btc_test.Target.values, y=prediction).set(
    xlabel='Target',
    ylabel='Prediction',
    title='Test: prediction vs. target',
);

# SHAP

In [None]:
# SHAP values of the tree model `dart` for the feature columns of `btc_valid`.
# NOTE(review): neither `dart` nor `btc_valid` is defined in any visible cell, and
# the sections above use `btc_train` / `btc_test` rather than a validation frame —
# TODO confirm which data this is meant to explain.
shap_values = shap.TreeExplainer(dart).shap_values(btc_valid[features])

In [None]:
# SHAP summary plot of the values computed above, against the same feature
# columns of `btc_valid`.
# NOTE(review): `btc_valid` is not defined in any visible cell — TODO confirm.
shap.summary_plot(shap_values, btc_valid[features])