In [1]:
## anaconda3 (Python 3.12.0) Kernel
import numpy as np

# pair trade packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from datetime import datetime

# Load Pairs Data


In [3]:
def custom_date_parser(date_str):
    return datetime.strptime(date_str, '%d/%m/%Y')

# Load the dictionary from the pickle file
with open('pairsOutcome.pkl', 'rb') as file:
    pairsOutcome = pickle.load(file)

print("Dictionary loaded from pairsOutcome.pkl")


# Load stock data and get return 
tpxData = pd.read_csv('TPX_prices.csv', index_col=0, parse_dates=True, date_parser=custom_date_parser)
tpxData = tpxData.dropna(axis='columns')
return_df = (tpxData / tpxData.shift(1)) - 1

Dictionary loaded from pairsOutcome.pkl


  tpxData = pd.read_csv('TPX_prices.csv', index_col=0, parse_dates=True, date_parser=custom_date_parser)


# Get Pair Trade Portfolio
`pairsOutcome` already have TOPIX stocks with highest liquidity and are tested for stationarity over a 1 year window

Choose top 10 known pair trades by returns in the total dataset

In [64]:
# Sort the keys by their cumpnl[-2] values in descending order
working_pairs = [('5401 JP Equity', '8604 JP Equity'), ('6273 JP Equity', '9984 JP Equity'), ('8053 JP Equity', '8058 JP Equity'), ('7733 JP Equity', '9613 JP Equity'), ('4684 JP Equity', '7832 JP Equity'), ('6762 JP Equity', '6857 JP Equity'), ('9020 JP Equity', '9022 JP Equity'), ('7267 JP Equity', '8306 JP Equity'), ('8308 JP Equity', '8802 JP Equity'), ('4901 JP Equity', '6702 JP Equity'), ('6503 JP Equity', '7269 JP Equity'), ('7267 JP Equity', '8801 JP Equity'), ('4519 JP Equity', '7532 JP Equity'), ('6988 JP Equity', '7267 JP Equity'), ('6326 JP Equity', '6954 JP Equity'), ('6752 JP Equity', '8604 JP Equity'), ('4901 JP Equity', '9613 JP Equity')]
top_keys = [f"{pair[0]} {pair[1]}" for pair in working_pairs]

# Print the top 10 performing trades
print("17 clustered pair trades:")
for i, key in enumerate(top_keys, 1):
    print(f"{i}. Key: {key}, Value: {pairsOutcome[key].cumpnl.iloc[-2]:.2f}")

17 clustered pair trades:
1. Key: 5401 JP Equity 8604 JP Equity, Value: 0.01
2. Key: 6273 JP Equity 9984 JP Equity, Value: 0.99
3. Key: 8053 JP Equity 8058 JP Equity, Value: 0.52
4. Key: 7733 JP Equity 9613 JP Equity, Value: 0.34
5. Key: 4684 JP Equity 7832 JP Equity, Value: 0.89
6. Key: 6762 JP Equity 6857 JP Equity, Value: -0.67
7. Key: 9020 JP Equity 9022 JP Equity, Value: 0.31
8. Key: 7267 JP Equity 8306 JP Equity, Value: 1.16
9. Key: 8308 JP Equity 8802 JP Equity, Value: 0.43
10. Key: 4901 JP Equity 6702 JP Equity, Value: -0.34
11. Key: 6503 JP Equity 7269 JP Equity, Value: 1.33
12. Key: 7267 JP Equity 8801 JP Equity, Value: 0.64
13. Key: 4519 JP Equity 7532 JP Equity, Value: 1.14
14. Key: 6988 JP Equity 7267 JP Equity, Value: 0.65
15. Key: 6326 JP Equity 6954 JP Equity, Value: 1.19
16. Key: 6752 JP Equity 8604 JP Equity, Value: -0.48
17. Key: 4901 JP Equity 9613 JP Equity, Value: 1.10


In [8]:
## Get pair stock data
def custom_date_parser(date_str):
    return datetime.strptime(date_str, '%d/%m/%Y')
valid = pd.read_csv('validPairs5.csv', 
                    index_col=0, 
                    parse_dates=True, 
                    date_parser=custom_date_parser)
## get list of pair stocks
validPairsList = [
    [item.strip() + ' Equity' for item in pair.split('Equity') if item.strip()]
    for pair in top_keys
]

  valid = pd.read_csv('validPairs5.csv',


In [9]:
rollingWindow = 262
cutLossSd = 2

In [10]:
for pair in validPairsList:
    df = pd.DataFrame()

    #Calculate Standard Deviations
    df['spread'] = valid[f'spread_{pair[0]}_{pair[1]}']
    df['mid'] =  df['spread'].rolling(rollingWindow).mean()
    df['1sd high'] = df['spread'].rolling(rollingWindow).mean() + df['spread'].rolling(rollingWindow).std()
    df['1sd low'] = df['spread'].rolling(rollingWindow).mean() - df['spread'].rolling(rollingWindow).std()
    df['2sd high'] = df['spread'].rolling(rollingWindow).mean() + df['spread'].rolling(rollingWindow).std() * cutLossSd
    df['2sd low'] = df['spread'].rolling(rollingWindow).mean() - df['spread'].rolling(rollingWindow).std() * cutLossSd
    df['position'] = 0

    df.loc[(df['spread'] > df['1sd high']) & (df['spread'] < df['2sd high']), 'position'] = -1
    df.loc[(df['spread']< df['1sd low']) & (df['spread'] > df['2sd low']), 'position'] = 1

    #Calculate PnL
    df[f'{pair[0]} position'] = df['position']
    df[f'{pair[1]} position'] = df['position'] * -1
    df['dailypnl'] = df[f'{pair[1]} position']*return_df[f'{pair[1]}'].shift(-1) + df[f'{pair[0]} position']*return_df[f'{pair[0]}'].shift(-1)
    df['cumpnl'] = df['dailypnl'].cumsum()

    pairsOutcome[f'{pair[0]} {pair[1]}'] = df

## Make indicators and spread stationary around 0
Deduct the mean from all values to translate to 0 axis

In [11]:
workingPairOutcome = {}

for pair in top_keys:
    dummy_df = pairsOutcome[top_keys[0]].iloc[::,:6]
    dummy_df = dummy_df.subtract(dummy_df['mid'], axis=0).drop(columns=['mid']) # centre spread and SD
    dummy_df = dummy_df.div(dummy_df['2sd high']-dummy_df['1sd high'],axis=0)   # express SD as integers, give spread as propotionate
    dummy_df['2sd_high_boolean'] = (dummy_df['spread']>dummy_df['2sd high']).astype(int)
    dummy_df['1sd_high_boolean'] = (dummy_df['spread']>dummy_df['1sd high']).astype(int)
    dummy_df['0sd_high_boolean'] = (dummy_df['spread']>0).astype(int)
    dummy_df['0sd_low_boolean']  = (dummy_df['spread']<0).astype(int)
    dummy_df['1sd_low_boolean']  = (dummy_df['spread']<dummy_df['1sd low'] ).astype(int)
    dummy_df['2sd_low_boolean']  = (dummy_df['spread']<dummy_df['2sd low'] ).astype(int)
    dummy_df = dummy_df.drop(columns=['spread','1sd high', '1sd low', '2sd high', '2sd low'])
    workingPairOutcome[pair] = dummy_df.to_numpy()

In [12]:
workingPairOutcome[top_keys[5]][-5:]     # spread is not a proportion and direction of SD

array([[0, 0, 0, 1, 1, 1],
       [0, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 0],
       [0, 0, 0, 1, 1, 0]])

# Machine Learning Challenge

## Background
Initial evaluation of the baseline portfolio shows that draw downs are small. Originally team had the idea of using Machine Learning to optimise for sizing of these pair trades. However since there was no significant drawdowns the returns are linearly increasing with investment sizing i.e. greater nominal investment in the the pair trade the proportionate increase in returns without realising significant drawdown risk.

Instead of optimising for sizing, we can explore Machine Learning in terms of strategy on this stationary dataset. Whereas our prescribed strategy is to enter at +/- 1 std dev, exit at 0 with +/- 2 std dev stop loss. These are only suggestions and arbitrary levels.

With Machine Learning, we can discover if it will uncover the mean reverting nature and recommend another threshhold. We use Q Learner to understand state space with the same spread, mid, std dev parameters as the baseline.

### Q Value table

In [13]:
workingPairOutcome[top_keys[0]][261]

array([0, 0, 1, 0, 0, 0])

In [14]:
df = pairsOutcome[top_keys[0]]
df.tail()

Unnamed: 0_level_0,spread,mid,1sd high,1sd low,2sd high,2sd low,position,5401 JP Equity position,8604 JP Equity position,dailypnl,cumpnl
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-05-27,-212.964671,640.517283,1060.31034,220.724226,1480.103398,-199.068832,0,0,0,-0.0,0.013071
2024-05-28,-205.20412,636.742095,1059.676537,213.807653,1482.610979,-209.126789,1,1,-1,-0.004026,0.009045
2024-05-29,-219.190378,632.926338,1059.056222,206.796454,1485.186105,-219.33343,1,1,-1,0.018654,0.027699
2024-05-30,-157.208034,629.200068,1057.956382,200.443753,1486.712697,-228.312562,1,1,-1,-0.018764,0.008935
2024-05-31,-216.067718,625.625385,1057.51041,193.740359,1489.395436,-238.144667,1,1,-1,,


In [15]:
ls_pos = [-1,0,1]
pair = validPairsList[0]

#Calculate PnL
for pos in ls_pos:
    df[f'dailypnl_{pos}'] = pos*return_df[f'{pair[0]}'].shift(-1) + -pos*return_df[f'{pair[1]}'].shift(-1)

In [16]:
df

Unnamed: 0_level_0,spread,mid,1sd high,1sd low,2sd high,2sd low,position,5401 JP Equity position,8604 JP Equity position,dailypnl,cumpnl,dailypnl_-1,dailypnl_0,dailypnl_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-01-01,54.852000,,,,,,0,0,0,0.000000,0.000000,0.000000,0.0,0.000000
2013-01-02,54.852000,,,,,,0,0,0,0.000000,0.000000,0.000000,0.0,0.000000
2013-01-03,54.852000,,,,,,0,0,0,0.000000,0.000000,0.003654,0.0,-0.003654
2013-01-04,59.631187,,,,,,0,0,0,-0.000000,0.000000,-0.040444,-0.0,0.040444
2013-01-07,132.761718,,,,,,0,0,0,-0.000000,0.000000,-0.020861,-0.0,0.020861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-27,-212.964671,640.517283,1060.310340,220.724226,1480.103398,-199.068832,0,0,0,-0.000000,0.013071,-0.002324,-0.0,0.002324
2024-05-28,-205.204120,636.742095,1059.676537,213.807653,1482.610979,-209.126789,1,1,-1,-0.004026,0.009045,0.004026,-0.0,-0.004026
2024-05-29,-219.190378,632.926338,1059.056222,206.796454,1485.186105,-219.333430,1,1,-1,0.018654,0.027699,-0.018654,-0.0,0.018654
2024-05-30,-157.208034,629.200068,1057.956382,200.443753,1486.712697,-228.312562,1,1,-1,-0.018764,0.008935,0.018764,0.0,-0.018764


In [45]:
ls_pos = [-1,0,1]
ls_res_111000 = []
ls_res_011000 = []
ls_res_001000 = []
ls_res_000100 = []
ls_res_000110 = []
ls_res_000111 = []
for i, key in enumerate(top_keys):
    pair = validPairsList[i]
    df = pairsOutcome[key]
    for pos in ls_pos:
        df[f'dailypnl_{pos}'] = pos*return_df[f'{pair[0]}'].shift(-1) + -pos*return_df[f'{pair[1]}'].shift(-1)

    # above 2 high SD
    ls_res_111000.append(df.loc[(df['spread'] > df['2sd high'])][['dailypnl_-1', 'dailypnl_0', 'dailypnl_1']].sum())
    # within 1 to 2 high SD
    ls_res_011000.append(df.loc[(df['spread'] > df['1sd high']) & (df['spread'] < df['2sd high'])][['dailypnl_-1', 'dailypnl_0', 'dailypnl_1']].sum())
    # within 0 to 1 high SD
    ls_res_001000.append(df.loc[(df['spread'] > df['mid']) & (df['spread'] < df['1sd high'])][['dailypnl_-1', 'dailypnl_0', 'dailypnl_1']].sum())
    # within 0 to 1 high SD
    ls_res_000100.append(df.loc[(df['spread'] < df['mid']) & (df['spread'] > df['1sd low'])][['dailypnl_-1', 'dailypnl_0', 'dailypnl_1']].sum())
    # within 1 to 2 low SD
    ls_res_000110.append(df.loc[(df['spread'] > df['2sd low']) & (df['spread'] < df['1sd low'])][['dailypnl_-1', 'dailypnl_0', 'dailypnl_1']].sum())
    # less than 2sd spread
    ls_res_000111.append(df.loc[(df['spread'] < df['2sd low'])][['dailypnl_-1', 'dailypnl_0', 'dailypnl_1']].sum())
    
ls_ls = [ls_res_111000,ls_res_011000, ls_res_001000,ls_res_000100,ls_res_000110,ls_res_000111]
ls_state_returns = []
for i, ls_ in enumerate(ls_ls):
    dummy_df = pd.concat(ls_, axis=1).T
    dummy_df['max return'] = dummy_df.idxmax(axis=1)
    ls_state_returns.append(dummy_df)


In [59]:
states = ['111000','011000', '001000','000100','000110','000111']
for state, dummy_df in zip(states,ls_state_returns):
    print(f"Sum of returns for state {state}")
    print(dummy_df[dummy_df.columns[:-1]].sum(axis=0),"\n")

Sum of returns for state 111000
dailypnl_-1    4.172828
dailypnl_0     0.000000
dailypnl_1    -4.172828
dtype: float64 

Sum of returns for state 011000
dailypnl_-1    4.278139
dailypnl_0     0.000000
dailypnl_1    -4.278139
dtype: float64 

Sum of returns for state 001000
dailypnl_-1    0.617806
dailypnl_0     0.000000
dailypnl_1    -0.617806
dtype: float64 

Sum of returns for state 000100
dailypnl_-1   -0.569964
dailypnl_0     0.000000
dailypnl_1     0.569964
dtype: float64 

Sum of returns for state 000110
dailypnl_-1   -4.939772
dailypnl_0     0.000000
dailypnl_1     4.939772
dtype: float64 

Sum of returns for state 000111
dailypnl_-1   -2.696878
dailypnl_0     0.000000
dailypnl_1     2.696878
dtype: float64 



The above results show the reliability of mean reversion. In each state space we expect the returns to reward position that expect a return to mean.

However, although this trend is true in aggregate, there are some pairs where this rule does not hold. Below shows a sample of returns where the 2 of 17 pair trades have a greater return going long (instead of short) when the spread is over 2 SD.

In [62]:
dummy_df = ls_state_returns[0]
dummy_df

Unnamed: 0,dailypnl_-1,dailypnl_0,dailypnl_1,max return
0,0.12958,0.0,-0.12958,dailypnl_-1
1,0.416272,0.0,-0.416272,dailypnl_-1
2,0.161907,0.0,-0.161907,dailypnl_-1
3,0.235482,0.0,-0.235482,dailypnl_-1
4,-0.036494,0.0,0.036494,dailypnl_1
5,0.292418,0.0,-0.292418,dailypnl_-1
6,-0.100397,0.0,0.100397,dailypnl_1
7,0.348275,0.0,-0.348275,dailypnl_-1
8,-0.057949,0.0,0.057949,dailypnl_1
9,0.238607,0.0,-0.238607,dailypnl_-1


# checking state imbalances

In [72]:
from collections import Counter

In [76]:
array = np.concatenate([workingPairOutcome[key] for key in top_keys])

In [78]:
row_tuples = map(tuple, array)

# Count unique rows
row_counts = Counter(row_tuples)

# Print results
for row, count in row_counts.items():
    print(f"State: {row}, Count: {count}")

State: (0, 0, 0, 0, 0, 0), Count: 4437
State: (0, 0, 1, 0, 0, 0), Count: 12104
State: (0, 1, 1, 0, 0, 0), Count: 11951
State: (1, 1, 1, 0, 0, 0), Count: 2754
State: (0, 0, 0, 1, 0, 0), Count: 7582
State: (0, 0, 0, 1, 1, 0), Count: 7990
State: (0, 0, 0, 1, 1, 1), Count: 3825


In [89]:
# Convert to DataFrame
dummy_df = pd.DataFrame(row_counts.items(), columns=["State", "Count"],index=None)
dummy_df.sort_values(by='Count',ascending=0)

Unnamed: 0,State,Count
1,"(0, 0, 1, 0, 0, 0)",12104
2,"(0, 1, 1, 0, 0, 0)",11951
5,"(0, 0, 0, 1, 1, 0)",7990
4,"(0, 0, 0, 1, 0, 0)",7582
0,"(0, 0, 0, 0, 0, 0)",4437
6,"(0, 0, 0, 1, 1, 1)",3825
3,"(1, 1, 1, 0, 0, 0)",2754
