In [None]:
import pandas as pd
import numpy as np
import vectorbt as vbt
import yfinance as yf
import statsmodels.api as sm
import plotly.graph_objects as go

from sklearn.linear_model import LinearRegression
from statsmodels.tsa.stattools import coint, adfuller

In [None]:
# Read the two raw frames stored under the keys 'data/cl' and 'data/gc'.
# The store is opened read-only and closed again when the block exits.
with pd.HDFStore('./large_files/data_20240328.h5', mode='r') as hdf:
    df1_, df2_ = [hdf.get(key) for key in ('data/cl', 'data/gc')]

In [None]:
def filter_df(df, days=(2,), years=(2020, 2021, 2022, 2023), start_hour=9, end_hour=12):
    """Restrict a frame of timestamped bars to chosen years, weekdays and hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``'datetime'`` column.
    days : iterable of int, default ``(2,)``
        Weekday numbers to keep (Monday=0), so the default keeps Wednesdays.
    years : iterable of int, default ``(2020, 2021, 2022, 2023)``
        Calendar years to keep.
    start_hour, end_hour : int, default 9 and 12
        Keep bars with ``start_hour <= hour < end_hour``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, indexed by ``'datetime'``; the ``'datetime'``
        column itself is kept as well. The input frame is not modified.
    """
    df = df.set_index('datetime', drop=False)
    stamps = df.index

    # One combined mask instead of several successive slices.
    keep = (
        stamps.year.isin(list(years))
        & stamps.day_of_week.isin(list(days))
        & (stamps.hour >= start_hour)
        & (stamps.hour < end_hour)
    )

    return df.loc[keep]

# Filter both raw frames the same way (df1 <- CL, df2 <- GC).
df1, df2 = (filter_df(raw) for raw in (df1_, df2_))

In [None]:
# Pair the CL and GC closes on their shared timestamps first, so that every
# scatter point and every observation in the test compares the same bar.
# Without this, a bar missing from one contract shifts the pairing (or makes
# the two inputs different lengths).
# NOTE(review): assumes timestamps are unique within each frame - confirm.
intraday_closes = (
    df1.close.to_frame('cl')
    .join(df2.close.to_frame('gc'), how='inner')
    .dropna()
)

# Plot prices in scatter chart
fig = go.Figure()
fig.add_trace(go.Scatter(x=intraday_closes['gc'], y=intraday_closes['cl'], mode='markers'))
fig.update_layout(
    title_text='Gold vs WTI Crude',
    xaxis_title='Gold (GC) close',
    yaxis_title='WTI Crude (CL) close',
    template='simple_white',
)
fig.show()

# Cointegration test on the aligned closes; the p-value is the cell output.
score, pvalue, _ = coint(intraday_closes['cl'], intraday_closes['gc'])
pvalue

In [None]:
start = '2020-01-01'
end = '2023-12-31'

# Download both tickers in one call so the closes share one date index, then
# drop dates on which either contract has no close. Two separate downloads are
# not guaranteed to cover the same dates, which would mis-pair scatter points.
daily_closes = yf.download(['CL=F', 'GC=F'], start=start, end=end)['Close'].dropna()
df11 = daily_closes['CL=F']
df12 = daily_closes['GC=F']

fig = go.Figure()
fig.add_trace(go.Scatter(x=df12, y=df11, mode='markers'))
fig.update_layout(
    title_text='Gold vs WTI Crude',
    xaxis_title='Gold (GC=F) close',
    yaxis_title='WTI Crude (CL=F) close',
    template='simple_white',
)
fig.show()

# Spread
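1. The spread parameters (hedge ratio, mean and standard deviation) must be re-estimated periodically, because the relationship between the two prices drifts over time.
2. Each week, estimate them from Monday and Tuesday bars, then use them to compute z-scores for Wednesday's bars.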

In [None]:
# Regress GC closes (target) on CL closes (single feature) over the whole
# filtered sample. Both are passed as column vectors.
cl_close = df1.close.values.reshape(-1, 1)
gc_close = df2.close.values.reshape(-1, 1)

lr = LinearRegression().fit(cl_close, gc_close)
print(lr.coef_, lr.intercept_)

# First row of the fitted coefficient array.
hedge_ratio = lr.coef_[0]

In [None]:
def filter_df(df, days=(0, 1, 2), years=(2020, 2021, 2022, 2023), start_hour=9, end_hour=12):
    """Restrict a frame of timestamped bars to chosen years, weekdays and hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``'datetime'`` column.
    days : iterable of int, default ``(0, 1, 2)``
        Weekday numbers to keep (Monday=0), so the default keeps Monday
        through Wednesday.
    years : iterable of int, default ``(2020, 2021, 2022, 2023)``
        Calendar years to keep.
    start_hour, end_hour : int, default 9 and 12
        Keep bars with ``start_hour <= hour < end_hour``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, indexed by ``'datetime'``; the ``'datetime'``
        column itself is kept as well. The input frame is not modified.
    """
    df = df.set_index('datetime', drop=False)
    stamps = df.index

    # One combined mask instead of several successive slices.
    keep = (
        stamps.year.isin(list(years))
        & stamps.day_of_week.isin(list(days))
        & (stamps.hour >= start_hour)
        & (stamps.hour < end_hour)
    )

    return df.loc[keep]

# Rebuild the working frames from the raw data using the filter defined in
# this cell (Monday-Wednesday bars): Monday/Tuesday are used to fit the
# spread, Wednesday is scored with it.
df1 = filter_df(df1_)
df2 = filter_df(df2_)
# Only the index of this series is used: it enumerates the calendar weeks
# spanned by the filtered CL bars.
dates = pd.Series(index=df1.datetime, dtype=bool)

weekly_z = []

weekly_groups = dates.groupby(pd.Grouper(freq='W'))
for week_end, _ in weekly_groups:
    # The weekly bin is labelled by the Sunday that closes it, so this week's
    # Monday 00:00, Tuesday 23:59 and Wednesday 23:59 are found by stepping
    # back from that label.
    monday = week_end - pd.Timedelta(days=6)
    tuesday = week_end - pd.Timedelta(days=4, minutes=1)
    wednesday = week_end - pd.Timedelta(days=3, minutes=1)

    df1_week = df1.loc[(df1.index >= monday) & (df1.index <= tuesday)]
    df2_week = df2.loc[(df2.index >= monday) & (df2.index <= tuesday)]
    df1_wed = df1.loc[(df1.index > tuesday) & (df1.index <= wednesday)]
    df2_wed = df2.loc[(df2.index > tuesday) & (df2.index <= wednesday)]

    # Keep only timestamps present in BOTH contracts, for the training window
    # and for Wednesday alike. The arrays below are combined positionally, so
    # a bar missing from one side would otherwise shift the pairing or make
    # the lengths disagree.
    # NOTE(review): assumes timestamps are unique and sorted within each
    # frame - confirm against the raw data.
    df1_week = df1_week[df1_week.index.isin(df2_week.index)]
    df2_week = df2_week[df2_week.index.isin(df1_week.index)]
    df1_wed = df1_wed[df1_wed.index.isin(df2_wed.index)]
    df2_wed = df2_wed[df2_wed.index.isin(df1_wed.index)]

    if min(len(df1_week), len(df2_week), len(df1_wed), len(df2_wed)) == 0:
        continue

    # compute mean / std for spread
    price1 = df1_week.close.values.reshape(-1, 1)
    price2 = df2_week.close.values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(price1, price2)
    hedge_ratio = model.coef_[0]
    spread = price2 - hedge_ratio * price1
    mean = spread.mean()
    std = np.std(spread)

    # A training spread with no dispersion cannot be used to scale z-scores
    # (division by zero would yield inf/NaN), so skip the week.
    if not np.isfinite(std) or std == 0:
        continue

    # compute z-scores for wednesday
    spread = df2_wed.close.values - hedge_ratio * df1_wed.close.values

    z = pd.DataFrame(
        {
            'asset1': df1_wed.close.values,
            'asset2': df2_wed.close.values,
            'spread': spread,
            'z': (spread - mean) / std,
        },
        index=df1_wed.index,
    )
    weekly_z.append(z)

# One row per scored Wednesday bar, indexed by its timestamp.
df_z = pd.concat(weekly_z)

In [None]:
def pair_trade_signals(
    dfz,
    short_entry_threshold=2.0,
    long_entry_threshold=-2.0,
    entry_start='9:00',
    entry_end='11:00',
    force_exit='11:59',
):
    """Turn per-bar z-scores into a spread position, at most one trade per day.

    For each calendar day the bars are walked in order:

    * while flat, and only when ``entry_start <= time < entry_end``, a z-score
      above ``short_entry_threshold`` opens a short (-1) and a z-score below
      ``long_entry_threshold`` opens a long (+1);
    * while in a trade, the position is closed (0) as soon as the z-score has
      crossed back to zero (``<= 0`` for a short, ``>= 0`` for a long) or the
      bar time has reached ``force_exit``;
    * if the day's data ends while a trade is still open (for example the
      ``force_exit`` bar is missing), the position is closed on the day's
      last bar so that it is never carried into the next session.

    After a close no further trade is opened that day.

    Parameters
    ----------
    dfz : pandas.DataFrame
        Needs a ``'z'`` column and a DatetimeIndex named ``'datetime'``.
    short_entry_threshold, long_entry_threshold : float
        Entry levels for the short and the long side.
    entry_start, entry_end, force_exit : str
        Times of day, parsed with ``pd.Timestamp``.

    Returns
    -------
    pandas.DataFrame
        ``dfz`` with an extra ``'position'`` column: NaN before the day's
        entry, -1 / +1 while in a trade, 0 from the exit bar onwards.
    """
    # Parse the clock times once instead of on every row.
    entry_start_t = pd.Timestamp(entry_start).time()
    entry_end_t = pd.Timestamp(entry_end).time()
    force_exit_t = pd.Timestamp(force_exit).time()

    daily_positions = []

    for _, df_day in dfz.groupby(pd.Grouper(level='datetime', freq='D')):
        # Daily grouping also yields the calendar days that have no bars.
        if df_day.empty:
            continue

        position = pd.Series(index=df_day.index, dtype=float)
        side = 0  # 0 = flat, -1 = short, +1 = long

        for time, z_score in df_day['z'].items():
            clock = time.time()
            if side != 0:
                reverted = (side == -1 and z_score <= 0) or (side == 1 and z_score >= 0)
                if reverted or clock >= force_exit_t:
                    position[time] = 0
                    side = 0
                    break
            elif entry_start_t <= clock < entry_end_t:
                if z_score > short_entry_threshold:
                    position[time] = -1
                    side = -1
                elif z_score < long_entry_threshold:
                    position[time] = 1
                    side = 1

        if side != 0:
            # The data ran out before any exit condition fired: flatten on
            # the last available bar of the day.
            position.iloc[-1] = 0

        daily_positions.append(position.ffill())

    if daily_positions:
        positions = pd.concat(daily_positions)
    else:
        positions = pd.Series(index=dfz.index[:0], dtype=float)

    return pd.concat([dfz, positions.rename('position')], axis=1)

In [None]:
def filter_df(df, days=(2,), years=(2020, 2021, 2022, 2023), start_hour=9, end_hour=12):
    """Restrict a frame of timestamped bars to chosen years, weekdays and hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``'datetime'`` column.
    days : iterable of int, default ``(2,)``
        Weekday numbers to keep (Monday=0), so the default keeps Wednesdays.
    years : iterable of int, default ``(2020, 2021, 2022, 2023)``
        Calendar years to keep.
    start_hour, end_hour : int, default 9 and 12
        Keep bars with ``start_hour <= hour < end_hour``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, indexed by ``'datetime'``; the ``'datetime'``
        column itself is kept as well. The input frame is not modified.
    """
    df = df.set_index('datetime', drop=False)
    stamps = df.index

    # One combined mask instead of several successive slices.
    keep = (
        stamps.year.isin(list(years))
        & stamps.day_of_week.isin(list(days))
        & (stamps.hour >= start_hour)
        & (stamps.hour < end_hour)
    )

    return df.loc[keep]

# Re-filter the raw frames with the filter defined in this cell.
df1, df2 = filter_df(df1_), filter_df(df2_)

# Run the signal logic on the z-scores, then add one column per leg:
# 'cl' is the negated position and 'gc' is the position itself.
df_pos = pair_trade_signals(df_z).assign(
    cl=lambda frame: -frame['position'],
    gc=lambda frame: frame['position'],
)
df_pos.head()

In [None]:
# Position on the current bar and on the bar before it; each mask below marks
# the bar where a particular transition happens.
pos_now = df_pos['position']
pos_prev = pos_now.shift(1)

# Entries: the position becomes -1 / +1 and was not already there.
vbt_short_signal = pos_now.eq(-1) & pos_prev.ne(-1)
vbt_long_signal = pos_now.eq(1) & pos_prev.ne(1)

# Exits: the position is 0 immediately after having been -1 / +1.
vbt_close_short = pos_now.eq(0) & pos_prev.eq(-1)
vbt_close_long = pos_now.eq(0) & pos_prev.eq(1)

In [None]:
asset1 = 'cl'
asset2 = 'gc'

# One column per leg, NaN everywhere except on the bars where a transition
# mask fires.
tickers_column = pd.Index([asset1, asset2], name='tickers')
vbt_trades = pd.DataFrame(np.nan, index=df_pos.index, columns=tickers_column)

# On a short signal asset1 is set to +1 and asset2 to -1; a long signal is the
# mirror image. Either kind of close sets both legs to 0.
vbt_close_any = vbt_close_short | vbt_close_long
for ticker, short_leg in ((asset1, 1), (asset2, -1)):
    vbt_trades.loc[vbt_short_signal, ticker] = short_leg
    vbt_trades.loc[vbt_long_signal, ticker] = -short_leg
    vbt_trades.loc[vbt_close_any, ticker] = 0

# Shift trades by one candle (avoid look-ahead bias)
# vbt_trades = vbt_trades.vbt.fshift(1)

In [None]:
# Close prices of both legs on exactly the bars that carry a z-score,
# looked up by timestamp in the filtered frames.
df_price = pd.DataFrame(
    {
        'cl': df1.loc[df_z.index, 'close'].values,
        'gc': df2.loc[df_z.index, 'close'].values,
    },
    index=df_z.index,
)

In [None]:
# Backtest with group_by=False; the next cells index the result per column
# (portfolio[0], portfolio[1]).
per_leg_settings = dict(
    close=df_price,
    size=vbt_trades,
    # price=df_price,
    size_type='targetpercent',
    val_price=df_price.vbt.fshift(1),  # prices shifted by one bar
    init_cash=100,
    fees=0,
    cash_sharing=True,
    group_by=False,
    call_seq='auto',
    freq='1m',
)
portfolio = vbt.Portfolio.from_orders(**per_leg_settings)
portfolio.stats()

In [None]:
# One 'orders' plot per leg, titled with the leg's ticker.
for column, ticker in enumerate((asset1, asset2)):
    leg_figure = portfolio[column].plot(subplots='orders', title=ticker)
    leg_figure.show()

In [None]:
# Readable order records of the backtest, sorted by their 'Timestamp' column.
order_records = portfolio.orders.records_readable
orders = order_records.sort_values(by='Timestamp')

In [None]:
# Same orders and prices as the previous backtest, but with group_by=True.
grouped_settings = dict(
    close=df_price,
    size=vbt_trades,
    # price=df_price,
    size_type='targetpercent',
    val_price=df_price.vbt.fshift(1),  # prices shifted by one bar
    init_cash=100,
    fees=0,
    cash_sharing=True,
    group_by=True,
    call_seq='auto',
    freq='1m',
)
portfolio2 = vbt.Portfolio.from_orders(**grouped_settings)

In [None]:
# Default plot for the grouped backtest.
portfolio2_figure = portfolio2.plot()
portfolio2_figure.show()