In [43]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import numpy as np
import backtrader as bt

from plotly.subplots import make_subplots

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from utils.common import day_aware_shift, plot_confusion_matrix
from utils.features import create_lag, create_rsi, create_dst, create_ma_ratio, create_z
from utils.model import XGBoostClassifier, LogisticClassifier

import os
import ta
import tabulate
import datetime
import tqdm
import pickle

In [44]:
df_wti_1m = pickle.load(open('./large_files/cl-1m.pkl', 'rb'))
df_wti_1m = df_wti_1m[(df_wti_1m['datetime'].dt.hour >= 8) & (df_wti_1m['datetime'].dt.hour <= 16)]

df = df_wti_1m.copy()
df = df[(df['datetime'].dt.hour >= 8) & (df['datetime'].dt.hour < 16)]
# df = df[(df['datetime'].dt.weekday == 2) & (df['datetime'].dt.year >= 2010) & (df['datetime'].dt.year <= 2019)]
df['month'] = df['datetime'].dt.month
df['month'] = pd.Categorical(df['month'])
df.head()

Unnamed: 0,date,time,open,high,low,close,volume,datetime,day,month
583,2007-04-02,800,65.66,65.69,65.66,65.69,6,2007-04-02 08:00:00-04:00,Monday,4
584,2007-04-02,801,65.67,65.67,65.65,65.66,3,2007-04-02 08:01:00-04:00,Monday,4
585,2007-04-02,802,65.67,65.7,65.67,65.7,13,2007-04-02 08:02:00-04:00,Monday,4
586,2007-04-02,803,65.69,65.7,65.68,65.7,6,2007-04-02 08:03:00-04:00,Monday,4
587,2007-04-02,804,65.7,65.77,65.7,65.77,16,2007-04-02 08:04:00-04:00,Monday,4


In [45]:
def feature_pipeline(df):
    for period in [5, 15, 30, 60]:
        df = create_z(df, period, 'close')
        df = create_z(df, period, 'volume')
        df = create_dst(df, [period])
        df = create_rsi(df, [period])

        for sub_period in [5, 15, 30, 60]:
            df = create_lag(df, f'z_close_{period}m', sub_period)
            df = create_lag(df, f'z_volume_{period}m', sub_period)
            df = create_lag(df, f'dst_high_{period}m', sub_period)
            df = create_lag(df, f'dst_low_{period}m', sub_period)

            if sub_period > period:
                df = create_ma_ratio(df, period, sub_period)

    return df

def target_pipeline(df):
    def get_trend(x):
        if x > 0.5:
            return 'up'
        elif x < -0.5:
            return 'down'
        else:
            return 'neutral'
    df['trend_30m_z5'] = df['z_close_30m'].apply(get_trend)
    df = day_aware_shift(df, 'trend_30m_z5', -5)

    return df

In [46]:
df = feature_pipeline(df)
df.info()

100%|██████████| 6085/6085 [00:05<00:00, 1101.14it/s]
  0%|          | 0/6085 [00:00<?, ?it/s]

100%|██████████| 6085/6085 [00:05<00:00, 1080.23it/s]
100%|██████████| 6085/6085 [00:08<00:00, 756.90it/s]
100%|██████████| 6085/6085 [00:06<00:00, 874.31it/s]
100%|██████████| 6085/6085 [00:03<00:00, 1706.79it/s]
100%|██████████| 6085/6085 [00:03<00:00, 1793.08it/s]
100%|██████████| 6085/6085 [00:03<00:00, 1811.96it/s]
100%|██████████| 6085/6085 [00:05<00:00, 1049.52it/s]
100%|██████████| 6085/6085 [00:06<00:00, 992.73it/s] 
100%|██████████| 6085/6085 [00:08<00:00, 703.05it/s]
100%|██████████| 6085/6085 [00:07<00:00, 837.46it/s]
100%|██████████| 6085/6085 [00:03<00:00, 1718.72it/s]
100%|██████████| 6085/6085 [00:03<00:00, 1835.48it/s]
100%|██████████| 6085/6085 [00:05<00:00, 1112.79it/s]
100%|██████████| 6085/6085 [00:05<00:00, 1052.66it/s]
100%|██████████| 6085/6085 [00:08<00:00, 707.30it/s]
100%|██████████| 6085/6085 [00:07<00:00, 857.06it/s]
100%|██████████| 6085/6085 [00:03<00:00, 1650.05it/s]
100%|██████████| 6085/6085 [00:06<00:00, 975.75it/s] 
100%|██████████| 6085/6085 [00:06<

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2040372 entries, 0 to 2040371
Columns: 136 entries, date to lag60m_dst_low_60m
dtypes: category(1), datetime64[ns, America/New_York](1), float64(130), int64(1), object(3)
memory usage: 2.1+ GB


In [47]:
df = target_pipeline(df)

In [48]:
# Wednesday
df2 = df[df['datetime'].dt.hour >= 9]
df2 = df2[(df2['datetime'].dt.year >= 2010)]
df2 = df2[df2['datetime'].dt.day_of_week == 2]
df2 = df2.dropna(axis=0)
df2['month'] = pd.Categorical(df2['month'])

to_drop = ['date', 'datetime', 'time', 'day', 'open', 'high', 'low', 'close', 'volume', 'trend_30m_z5'] + \
    [f for f in df.columns if 'sma' in f]

Xtr = df2[df2['datetime'].dt.year < 2018]
Xvl = df2[df2['datetime'].dt.year == 2018]
Xte = df2[df2['datetime'].dt.year == 2019]

le = LabelEncoder()

ytr = le.fit_transform(Xtr['trend_30m_z5']).reshape(-1, 1)
yvl = le.transform(Xvl['trend_30m_z5']).reshape(-1, 1)
yte = le.transform(Xte['trend_30m_z5']).reshape(-1, 1)

Xtr = Xtr.drop(to_drop, axis=1)
Xvl = Xvl.drop(to_drop, axis=1)
Xte = Xte.drop(to_drop, axis=1)

feature_columns = Xtr.columns.tolist()

Xtr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 171103 entries, 337052 to 1313087
Columns: 119 entries, month to lag60m_dst_low_60m
dtypes: category(1), float64(118)
memory usage: 155.5 MB


In [49]:
params = {
    'objective': 'multi:softmax',
    'random_state': 42,
    'learning_rate': 0.05,
    'nthread': -1,
    'max_depth': 5,
    'early_stopping_rounds': 10,
    'tree_method': 'hist',
    'enable_categorical': True,
    'num_class': 3,
    'device': 'cuda'
}

xgb = XGBoostClassifier(params=params)
xgb.fit((Xtr, ytr), (Xvl, yvl), le.classes_)

Training XGBClassifier with the following params: {'objective': 'multi:softmax', 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': 'cuda', 'eval_metric': None, 'gamma': None, 'grow_policy': None, 'interaction_constraints': None, 'learning_rate': 0.05, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 5, 'max_leaves': None, 'min_child_weight': None, 'monotone_constraints': None, 'multi_strategy': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': 'hist', 'validate_parameters': None, 'verbosity': None, 'nthread': -1, 'num_class': 3}




Training Results:
+-----------+------+
| accuracy  | 0.64 |
| precision | 0.61 |
| recall    | 0.64 |
| f1        | 0.59 |
+-----------+------+

Validation Results:
+-----------+------+
| accuracy  | 0.64 |
| precision | 0.61 |
| recall    | 0.64 |
| f1        | 0.6  |
+-----------+------+


# Running the model on different days

In [50]:
for day in ['Monday', 'Tuesday', 'Thursday', 'Friday']:
    df_day = df[df['datetime'].dt.hour >= 9]
    df_day = df_day[df_day['datetime'].dt.day_name() == day]
    df_day = df_day[df_day['datetime'].dt.year == 2018]
    df_day = df_day.dropna(axis=0)
    df_day['month'] = pd.Categorical(df_day['month'])

    y = le.transform(df_day['trend_30m_z5']).reshape(-1, 1)
    X = df_day.drop(to_drop, axis=1)

    y_pred = xgb.model.predict(X)
    print(day)
    print(f'Precision: {precision_score(y, y_pred, average="weighted"):.2f}')
    print(f'Recall: {recall_score(y, y_pred, average="weighted"):.2f}')
    print(f'F1: {f1_score(y, y_pred, average="weighted"):.2f}')
    print()

Monday
Precision: 0.62
Recall: 0.65
F1: 0.61

Tuesday
Precision: 0.61
Recall: 0.65
F1: 0.60

Thursday
Precision: 0.61
Recall: 0.64
F1: 0.59

Friday
Precision: 0.60
Recall: 0.63
F1: 0.59



# Backtrader

In [105]:
class XGBoostStrategy(bt.Strategy):
    def __init__(self):
        self.data_predicted = None
        self.track_position = {}

    def next(self):
        bar = self.datas[0]
        df = pd.DataFrame({f: getattr(bar, f)[0] for f in feature_columns}, index=[0], columns=feature_columns)
        df['month'] = pd.Categorical(df['month'])

        y = xgb.model.predict(df)[0]
        trend = le.inverse_transform([y])[0]

        # manage the previous position first
        # if there is an existing position, close it starting from the 5th iteration
        for position, entry_iteration in list(self.track_position.items()):
            self.track_position[position] += 1
            if self.track_position[position] >= 5:
                self.log(f'Closing position held for 5 iterations')
                self.close(position)

        # Make trading decisions based on the prediction
        if trend == 'up':
            size = int(self.broker.get_cash() / self.datas[0].open)
            if size > 1:
                self.log(f'BUY {size} at {self.data.close[0]:.2f}')
                self.order = self.buy(size=size)
                self.track_position[self.order] = 0
            else:
                self.log(f'Not enough cash')
        elif trend == 'down':
            for position in self.track_position.copy():
                size = position.size
                if size > 0:
                    self.log(f'SELL {size} at {self.data.close[0]:.2f}')
                    self.order = self.sell(position=position)
                    del self.track_position[position]
                else:
                    self.log(f'No shares to sell')
        else:
            self.log(f'No Action')

    def log(self, txt, dt=None):
        dt = dt or self.datas[0].datetime.datetime()
        print(
            f'{dt}: Cash: {self.broker.getcash():.2f} Portfolio: {self.broker.getvalue():.2f} Action: {txt}'
        )

class PandasData(bt.feeds.PandasData):
    lines = (
        'open',
        'high',
        'low',
        'close',
        'volume',
        'month',
        'z_close_5m',
        'pct_close_5m',
        'std_close_5m',
        'sma_close_5m',
        'z_volume_5m',
        'pct_volume_5m',
        'std_volume_5m',
        'sma_volume_5m',
        'dst_high_5m',
        'dst_low_5m',
        'dst_mean_5m',
        'dst_mean_high_5m',
        'dst_mean_low_5m',
        'rsi_5',
        'lag5m_z_close_5m',
        'lag5m_z_volume_5m',
        'lag5m_dst_high_5m',
        'lag5m_dst_low_5m',
        'lag15m_z_close_5m',
        'lag15m_z_volume_5m',
        'lag15m_dst_high_5m',
        'lag15m_dst_low_5m',
        'ma_ratio_5_15',
        'lag30m_z_close_5m',
        'lag30m_z_volume_5m',
        'lag30m_dst_high_5m',
        'lag30m_dst_low_5m',
        'ma_ratio_5_30',
        'lag60m_z_close_5m',
        'lag60m_z_volume_5m',
        'lag60m_dst_high_5m',
        'lag60m_dst_low_5m',
        'ma_ratio_5_60',
        'z_close_15m',
        'pct_close_15m',
        'std_close_15m',
        'sma_close_15m',
        'z_volume_15m',
        'pct_volume_15m',
        'std_volume_15m',
        'sma_volume_15m',
        'dst_high_15m',
        'dst_low_15m',
        'dst_mean_15m',
        'dst_mean_high_15m',
        'dst_mean_low_15m',
        'rsi_15',
        'lag5m_z_close_15m',
        'lag5m_z_volume_15m',
        'lag5m_dst_high_15m',
        'lag5m_dst_low_15m',
        'lag15m_z_close_15m',
        'lag15m_z_volume_15m',
        'lag15m_dst_high_15m',
        'lag15m_dst_low_15m',
        'lag30m_z_close_15m',
        'lag30m_z_volume_15m',
        'lag30m_dst_high_15m',
        'lag30m_dst_low_15m',
        'ma_ratio_15_30',
        'lag60m_z_close_15m',
        'lag60m_z_volume_15m',
        'lag60m_dst_high_15m',
        'lag60m_dst_low_15m',
        'ma_ratio_15_60',
        'z_close_30m',
        'pct_close_30m',
        'std_close_30m',
        'sma_close_30m',
        'z_volume_30m',
        'pct_volume_30m',
        'std_volume_30m',
        'sma_volume_30m',
        'dst_high_30m',
        'dst_low_30m',
        'dst_mean_30m',
        'dst_mean_high_30m',
        'dst_mean_low_30m',
        'rsi_30',
        'lag5m_z_close_30m',
        'lag5m_z_volume_30m',
        'lag5m_dst_high_30m',
        'lag5m_dst_low_30m',
        'lag15m_z_close_30m',
        'lag15m_z_volume_30m',
        'lag15m_dst_high_30m',
        'lag15m_dst_low_30m',
        'lag30m_z_close_30m',
        'lag30m_z_volume_30m',
        'lag30m_dst_high_30m',
        'lag30m_dst_low_30m',
        'lag60m_z_close_30m',
        'lag60m_z_volume_30m',
        'lag60m_dst_high_30m',
        'lag60m_dst_low_30m',
        'ma_ratio_30_60',
        'z_close_60m',
        'pct_close_60m',
        'std_close_60m',
        'sma_close_60m',
        'z_volume_60m',
        'pct_volume_60m',
        'std_volume_60m',
        'sma_volume_60m',
        'dst_high_60m',
        'dst_low_60m',
        'dst_mean_60m',
        'dst_mean_high_60m',
        'dst_mean_low_60m',
        'rsi_60',
        'lag5m_z_close_60m',
        'lag5m_z_volume_60m',
        'lag5m_dst_high_60m',
        'lag5m_dst_low_60m',
        'lag15m_z_close_60m',
        'lag15m_z_volume_60m',
        'lag15m_dst_high_60m',
        'lag15m_dst_low_60m',
        'lag30m_z_close_60m',
        'lag30m_z_volume_60m',
        'lag30m_dst_high_60m',
        'lag30m_dst_low_60m',
        'lag60m_z_close_60m',
        'lag60m_z_volume_60m',
        'lag60m_dst_high_60m',
        'lag60m_dst_low_60m'
    )
    params = {c: -1 for c in lines}
    params = tuple(params.items())

In [106]:
to_drop = ['date', 'datetime', 'time', 'day', 'open', 'high', 'low', 'close', 'volume', 'trend_30m_z5'] + \
    [f for f in df.columns if 'sma' in f]
df2 = df[(df['datetime'].dt.hour >= 9)]
df2 = df2[(df2['datetime'].dt.year == 2021)]
df2 = df2[df2['datetime'].dt.day_of_week == 2]
df2 = df2.dropna(axis=0)
# df2 = df2.drop(to_drop, axis=1)
df2['month'] = pd.Categorical(df2['month'])

count = 0
for day, df_day in df2.groupby(pd.Grouper(key='datetime', freq='D')):
    if df_day.shape[0] == 0:
        continue
    df_day = df_day.set_index('datetime', drop=False)
    df_day = df_day.drop(['day', 'trend_30m_z5', 'time', 'date'], axis=1)
    df_day = df_day.copy()

    cerebro = bt.Cerebro()
    cerebro.addstrategy(XGBoostStrategy)
    cerebro.adddata(PandasData(dataname=df_day))
    cerebro.broker.setcash(100000.0)
    results = cerebro.run()
    print(f'{day.date()} Ending Portfolio Value: {cerebro.broker.getvalue():.2f}')
    print()

    count += 1
    if count == 1: break

2021-01-06 14:01:00: Cash: 100000.00 Portfolio: 100000.00 Action: BUY 2000 at 50.12


TypeError: unhashable type: 'BuyOrder'

In [53]:
cerebro.plot()

<IPython.core.display.Javascript object>

[[<Figure size 640x480 with 4 Axes>]]