In [1]:
## anaconda3 (Python 3.12.0) Kernel
import numpy as np
import csv
# pair trade packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from datetime import datetime
import numpy as np
from collections import defaultdict

# Load Pairs Data


In [2]:
def custom_date_parser(date_str):
    return datetime.strptime(date_str, '%d/%m/%Y')

with open('pairsOutcome.pkl', 'rb') as file:
    pairsOutcome = pickle.load(file)

tpxData = pd.read_csv('TPX_prices.csv', index_col=0, parse_dates=True, date_parser=custom_date_parser)
tpxData = tpxData.dropna(axis='columns')
return_df = (tpxData / tpxData.shift(1)) - 1

  tpxData = pd.read_csv('TPX_prices.csv', index_col=0, parse_dates=True, date_parser=custom_date_parser)


# Get Pair Trade Portfolio
`pairsOutcome` already have TOPIX stocks with highest liquidity and are tested for stationarity over a 1 year window

Choose top 10 known pair trades by returns in the total dataset

In [3]:
with open("output_clustering.csv", 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)
    working_pairs = [tuple(row) for row in reader]

top_keys = [f"{pair[0]} {pair[1]}" for pair in working_pairs]
valid = pd.read_csv('validPairs5.csv', 
                    index_col=0, 
                    parse_dates=True, 
                    date_parser=custom_date_parser)
validPairsList = [
    [item.strip() + ' Equity' for item in pair.split('Equity') if item.strip()]
    for pair in top_keys
]
rollingWindow = 262
cutLossSd = 2

for pair in validPairsList:
    df = pd.DataFrame()

    #Calculate Standard Deviations
    df['spread'] = valid[f'spread_{pair[0]}_{pair[1]}']
    df['mid'] =  df['spread'].rolling(rollingWindow).mean()
    df['1sd high'] = df['spread'].rolling(rollingWindow).mean() + df['spread'].rolling(rollingWindow).std()
    df['1sd low'] = df['spread'].rolling(rollingWindow).mean() - df['spread'].rolling(rollingWindow).std()
    df['2sd high'] = df['spread'].rolling(rollingWindow).mean() + df['spread'].rolling(rollingWindow).std() * cutLossSd
    df['2sd low'] = df['spread'].rolling(rollingWindow).mean() - df['spread'].rolling(rollingWindow).std() * cutLossSd
    df['position'] = 0

    df.loc[(df['spread'] > df['1sd high']) & (df['spread'] < df['2sd high']), 'position'] = -1
    df.loc[(df['spread']< df['1sd low']) & (df['spread'] > df['2sd low']), 'position'] = 1

    #Calculate PnL
    df[f'{pair[0]} position'] = df['position']
    df[f'{pair[1]} position'] = df['position'] * -1
    df['dailypnl'] = df[f'{pair[1]} position']*return_df[f'{pair[1]}'].shift(-1) + df[f'{pair[0]} position']*return_df[f'{pair[0]}'].shift(-1)
    df['cumpnl'] = df['dailypnl'].cumsum()

    pairsOutcome[f'{pair[0]} {pair[1]}'] = df

  valid = pd.read_csv('validPairs5.csv',


## Make indicators and spread stationary around 0
Deduct the mean from all values to translate to 0 axis

In [4]:
workingPairOutcome = {}

for pair in top_keys:
    dummy_df = pairsOutcome[top_keys[0]].iloc[::,:6]
    dummy_df = dummy_df.subtract(dummy_df['mid'], axis=0).drop(columns=['mid']) # centre spread and SD
    dummy_df = dummy_df.div(dummy_df['2sd high']-dummy_df['1sd high'],axis=0)   # express SD as integers, give spread as propotionate
    dummy_df['2sd_high_boolean'] = (dummy_df['spread']>dummy_df['2sd high']).astype(int)
    dummy_df['1sd_high_boolean'] = (dummy_df['spread']>dummy_df['1sd high']).astype(int)
    dummy_df['0sd_high_boolean'] = (dummy_df['spread']>0).astype(int)
    dummy_df['0sd_low_boolean']  = (dummy_df['spread']<0).astype(int)
    dummy_df['1sd_low_boolean']  = (dummy_df['spread']<dummy_df['1sd low'] ).astype(int)
    dummy_df['2sd_low_boolean']  = (dummy_df['spread']<dummy_df['2sd low'] ).astype(int)
    dummy_df = dummy_df.drop(columns=['spread','1sd high', '1sd low', '2sd high', '2sd low'])
    workingPairOutcome[pair] = dummy_df.to_numpy()

workingPairOutcome[top_keys[5]][-5:]

array([[0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0]])

In [5]:
# due to spikey-ness of reward around 0, scale from -1 to 1 and give more rewards above 1, 2, 3
dailypnl_sd = 0.018390515013803736 # see /Users/ju/Projects/00_SMU/mqf_practice/QF634_Applied_Quantitative_Research_Methods/QF634 Project/Pair Trading/03_project_state_space_analysis.ipynb for SD derivation

# Machine Learning Challenge

## Background
Initial evaluation of the baseline portfolio shows that draw downs are small. Originally team had the idea of using Machine Learning to optimise for sizing of these pair trades. However since there was no significant drawdowns the returns are linearly increasing with investment sizing i.e. greater nominal investment in the the pair trade the proportionate increase in returns without realising significant drawdown risk.

Instead of optimising for sizing, we can explore Machine Learning in terms of strategy on this stationary dataset. Whereas our prescribed strategy is to enter at +/- 1 std dev, exit at 0 with +/- 2 std dev stop loss. These are only suggestions and arbitrary levels.

With Machine Learning, we can discover if it will uncover the mean reverting nature and recommend another threshhold. We use Q Learner to understand state space with the same spread, mid, std dev parameters as the baseline.

### Q Value table

In [6]:
def get_baseline(env, trained_agent, max_steps_per_episode, pair_idx):
    env.reset(pair_idx)
    total_reward = 0
    current_step = 261
    env.current_step = current_step
    env.last_step = 2868

    for step in range(max_steps_per_episode):
        state = workingPairOutcome[top_keys[pair_idx]][current_step+step]
        state_index = env.valid_states.index(tuple(state))
        action_index = np.argmax(trained_agent.Q[state_index])
        action = trained_agent.index_to_action[action_index]
        _, reward, done = env.step(action,pair_idx)
        total_reward += reward
            
        if done:
            break

    print(f"{pair_idx+1}. {top_keys[pair_idx]}: Reward: {total_reward:.5f}, step {step}")
    return total_reward

class PairTradeEnv1:
    def __init__(self):
        # Initialize environment variables and parameters
        self.num_actions = 3
        self.valid_states = [
            (0, 0, 0, 1, 0, 0),
            (0, 0, 0, 1, 1, 0),
            (0, 0, 0, 1, 1, 1),
            (0, 0, 1, 0, 0, 0),
            (0, 1, 1, 0, 0, 0),
            (1, 1, 1, 0, 0, 0),
            (0, 0, 0, 0, 0, 0),
        ]  # Define valid states
        self.state_mapping = {
            i: state for i, state in enumerate(self.valid_states)
        }
        self.earliest_step = 261  # Hot start
        self.last_step = 2868

        self.state = np.zeros(6)
        self.current_step = self.earliest_step

    def reset(self, pair_idx):
        # Reset the environment to its initial state
        self.current_step = self.earliest_step
        self.state = workingPairOutcome[top_keys[pair_idx]][self.current_step]
        return self.state

    def step(self, action, pair_idx):
        # Advance the time step
        self.current_step += 1
        next_state = workingPairOutcome[top_keys[pair_idx]][self.current_step]
        reward = self.calculate_reward(action, pair_idx)
        done = self.current_step >= self.last_step
        return next_state, reward, done

    def calculate_reward(self, position, pair_idx):
        """
        Calculate reward based on the previous day's return.
        """
        pair = validPairsList[pair_idx]
        position_0 = position
        position_1 = position * -1
        dailypnl = position_0 * return_df[f'{pair[0]}'].iloc[self.current_step] \
            + position_1 * return_df[f'{pair[1]}'].iloc[self.current_step]

        return dailypnl

class Agent:
    def __init__(self, env, num_states, num_actions, batch_size, alpha=0.1, gamma=0.9, epsilon=0.5, epsilon_decay=0.95, buffer_size = 10000):
        self.env = env
        self.num_actions = num_actions
        self.num_states = num_states
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = 0.95
        # set indices for q table
        self.action_to_index = {-1: 0, 0: 1, 1: 2}
        self.index_to_action = {0: -1, 1: 0, 2: 1}
        self.Q = np.zeros((num_states, num_actions))
        # replay memory
        self.replay_buffer = []
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.state_counts = np.zeros(len(self.env.valid_states))
        self.batch = []

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            action_index = np.random.randint(self.num_actions)
        else:
            state_index = self.env.valid_states.index(tuple(state))
            action_index = np.argmax(self.Q[state_index])
        return self.index_to_action[action_index]

    def update_Q_from_buffer(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        # get counts of each state
        state_counts = np.bincount([state_index for state_index, _, _, _ in self.replay_buffer], 
                                   minlength=len(self.env.valid_states))
        state_probabilities = state_counts / state_counts.sum()
        # equally represent all states in a batch
        batch = []
        for _ in range(self.batch_size):
            sampled_state_index = np.random.choice(len(self.env.valid_states), p=state_probabilities)
            filtered_transitions = [transition for transition in self.replay_buffer if transition[0] == sampled_state_index]
            if not filtered_transitions:
                return  
            batch.append(filtered_transitions[np.random.choice(len(filtered_transitions))])
        # train based on resampled batch
        for state_index, action, reward, next_state_index in batch:
            reward /= dailypnl_sd
            # if reward > 2:
            #     reward +=2
            # elif reward > 0.5:
            #     reward +=1
            # elif reward < -2:
            #     reward += -2
            # elif reward < -0.5:
            #     reward += -1
            self.Q[state_index, self.action_to_index[action]] += self.alpha * (
                reward + self.gamma * np.max(self.Q[next_state_index])
                - self.Q[state_index, self.action_to_index[action]]
            )
            self.state_counts[state_index] += 1

    def learn(self, num_episodes):
        step_count = 0
        for episode in range(num_episodes):
            for pair_idx in range(len(top_keys)):
                state = self.env.reset(pair_idx)
                done = False

                while not done:
                    action = self.choose_action(state)

                    next_state, reward, done = self.env.step(action, pair_idx)
                    state_index = self.env.valid_states.index(tuple(state))
                    next_state_index = self.env.valid_states.index(tuple(next_state))

                    self.replay_buffer.append((state_index, action, reward, next_state_index))
                    if len(self.replay_buffer) > self.buffer_size:
                        self.replay_buffer.pop(0)

                    if step_count%self.batch_size==0:
                        self.update_Q_from_buffer()
                    
                    step_count += 1
                    state = next_state
            self.epsilon = max(0.3, self.epsilon*self.epsilon_decay)
            if episode%20==0:
                print(self.epsilon)
            


# Gamma = 0

In [7]:
num_epochs = 3
num_episodes = 10
ls_epoch_reward = []
gamma = 0

for epoch in range(num_epochs):
    # instantiate
    env = PairTradeEnv1()
    agent = Agent(env=env,
                num_states=len(env.valid_states), 
                num_actions=env.num_actions, 
                batch_size= len(top_keys)*100, 
                alpha=0.5, 
                gamma=gamma, 
                epsilon_decay=0.995,
                epsilon=1, 
                buffer_size = 10000)
    # train
    agent.learn(num_episodes)
    # evaluate
    total_train_return = 0
    for idx, _ in enumerate(top_keys):
        total_train_return += get_baseline(env, agent, 3000, pair_idx=idx)

    ls_epoch_reward.append(total_train_return/len(top_keys))

print(f"Epoch Total Ave Reward: {ls_epoch_reward}")
print(f"Epoch Ave Ave Reward: \t{np.array(ls_epoch_reward).mean():.5f}")
print(f"Epoch SD  Ave Reward: \t{np.array(ls_epoch_reward).std():.5f} ")

0.95
1. 6503 JP Equity 7269 JP Equity: Reward: 2.11578, step 2606
2. 6326 JP Equity 6954 JP Equity: Reward: -0.01672, step 2606
3. 8053 JP Equity 8058 JP Equity: Reward: -0.05075, step 2606
4. 4901 JP Equity 9613 JP Equity: Reward: -1.14694, step 2606
5. 6988 JP Equity 7267 JP Equity: Reward: -0.07786, step 2606
6. 4901 JP Equity 6702 JP Equity: Reward: -0.37365, step 2606
7. 4684 JP Equity 7832 JP Equity: Reward: -0.39928, step 2606
8. 7267 JP Equity 8306 JP Equity: Reward: -0.56660, step 2606
9. 7267 JP Equity 8801 JP Equity: Reward: -0.70480, step 2606
10. 4519 JP Equity 7532 JP Equity: Reward: -0.27370, step 2606
0.95
1. 6503 JP Equity 7269 JP Equity: Reward: 1.01645, step 2606
2. 6326 JP Equity 6954 JP Equity: Reward: 0.78693, step 2606
3. 8053 JP Equity 8058 JP Equity: Reward: -1.09874, step 2606
4. 4901 JP Equity 9613 JP Equity: Reward: 0.40968, step 2606
5. 6988 JP Equity 7267 JP Equity: Reward: 0.58956, step 2606
6. 4901 JP Equity 6702 JP Equity: Reward: -0.17914, step 2606
7.

In [8]:
df = pd.DataFrame(agent.Q, index=env.valid_states, columns=[-1, 0, 1])
df['mean_reversion'] = [0,1,0,0,-1,0,0]
df['position'] = df[[-1, 0, 1]].idxmax(axis=1)

df.to_csv(f"q_table_gamma{agent.gamma}_{agent.gamma}.csv")
df

Unnamed: 0,-1,0,1,mean_reversion,position
"(0, 0, 0, 1, 0, 0)",-0.315604,0.0,-0.062683,0,0
"(0, 0, 0, 1, 1, 0)",-0.336948,0.0,-0.260248,1,0
"(0, 0, 0, 1, 1, 1)",0.44324,0.0,-0.024074,0,-1
"(0, 0, 1, 0, 0, 0)",0.873375,0.0,-0.486601,0,-1
"(0, 1, 1, 0, 0, 0)",0.387037,0.0,-0.438765,-1,-1
"(1, 1, 1, 0, 0, 0)",0.155044,0.0,0.04111,0,-1
"(0, 0, 0, 0, 0, 0)",0.0,0.0,0.0,0,-1


In [9]:
dict_policy_actions = dict(df.position)
dict_policy_actions

{(0, 0, 0, 1, 0, 0): 0,
 (0, 0, 0, 1, 1, 0): 0,
 (0, 0, 0, 1, 1, 1): -1,
 (0, 0, 1, 0, 0, 0): -1,
 (0, 1, 1, 0, 0, 0): -1,
 (1, 1, 1, 0, 0, 0): -1,
 (0, 0, 0, 0, 0, 0): -1}

table is taking a long time to generalise. using one pair is not good enough to get any poistioning as everything is flat. only after adding all 10 pairs then performance churns out shorting on 1 SD high cross.

Experiment: try making gamma 0.1, Q table should closely mimic the state space analysis table with strong mean reversion tendency. This is not what we see. and it even has Q values opposite to state space.

In [10]:
workingPairQresults = {}
for pair_idx in range(len(top_keys)):
    df = pd.DataFrame(workingPairOutcome[top_keys[pair_idx]], columns=dummy_df.columns)

    # Assign policy values using the dictionary
    df['position'] = df.apply(lambda row: dict_policy_actions.get(tuple(row), np.nan), axis=1)
    df[df.isna().any(axis=1)]
    pair = validPairsList[pair_idx]
    #Calculate PnL
    df[f'{pair[0]} position'] = df['position']
    df[f'{pair[1]} position'] = df['position'] * -1
    df['dailypnl'] = df[f'{pair[1]} position'].values*return_df[f'{pair[1]}'].shift(-1).values \
                    + df[f'{pair[0]} position'].values*return_df[f'{pair[0]}'].shift(-1).values
    df['cumpnl'] = df['dailypnl'].cumsum()
    workingPairQresults[top_keys[pair_idx]] = df

total_q_return = 0
total_train_q_return = 0
total_test_q_return = 0

for pair in top_keys:
    total_q_return += workingPairQresults[pair]['cumpnl'].iloc[-2]
    total_train_q_return += workingPairQresults[pair]['cumpnl'].iloc[2868-2]

print("="*10, "Absolute", "="*10)
print(f"total return \t\t{total_q_return/len(top_keys):.5f}")
print(f"total train return \t{total_train_q_return/len(top_keys):.5f}")
print(f"total test return \t{(total_q_return - total_train_q_return)/len(top_keys):.5f}")
print("="*10, "Normalised", "="*10)
print(f"total return \t\t{total_q_return/len(top_keys)/(2979-261):.5f}")
print(f"total train return \t{total_train_q_return/len(top_keys)/(2868-261):.5f}")
print(f"total test return \t{(total_q_return - total_train_q_return)/len(top_keys)/(2979-2868):.5f}")

total return 		-0.04015
total train return 	-0.08285
total test return 	0.04270
total return 		-0.00001
total train return 	-0.00003
total test return 	0.00038


# Gamma > 0 

In [12]:
num_epochs = 3
num_episodes = 10

ls_epoch_reward = []

for epoch in range(num_epochs):
    # instantiate
    env = PairTradeEnv1()
    agent = Agent(env=env,
                num_states=len(env.valid_states), 
                num_actions=env.num_actions, 
                batch_size= len(top_keys)*100, 
                alpha=0.5, 
                gamma=0.5, 
                epsilon=1,
                epsilon_decay=0.995,
                buffer_size = 10000)
    print(agent.gamma, env.current_step)
    # train
    agent.learn(num_episodes)
    # evaluate
    total_train_return = 0
    for idx, _ in enumerate(top_keys):
        total_train_return += get_baseline(env, agent, 3000, pair_idx=idx)

    ls_epoch_reward.append(total_train_return/len(top_keys))

print(f"Epoch Total Ave Reward: {ls_epoch_reward}")
print(f"Epoch Ave Ave Reward: \t{np.array(ls_epoch_reward).mean():.5f}")
print(f"Epoch SD  Ave Reward: \t{np.array(ls_epoch_reward).std():.5f} ")

0.5 261
0.95
1. 6503 JP Equity 7269 JP Equity: Reward: -0.12023, step 2606
2. 6326 JP Equity 6954 JP Equity: Reward: 0.39556, step 2606
3. 8053 JP Equity 8058 JP Equity: Reward: -0.82107, step 2606
4. 4901 JP Equity 9613 JP Equity: Reward: 1.11480, step 2606
5. 6988 JP Equity 7267 JP Equity: Reward: -0.17254, step 2606
6. 4901 JP Equity 6702 JP Equity: Reward: 1.00307, step 2606
7. 4684 JP Equity 7832 JP Equity: Reward: -0.11127, step 2606
8. 7267 JP Equity 8306 JP Equity: Reward: 0.44704, step 2606
9. 7267 JP Equity 8801 JP Equity: Reward: 0.56159, step 2606
10. 4519 JP Equity 7532 JP Equity: Reward: -0.37529, step 2606
0.5 261
0.95
1. 6503 JP Equity 7269 JP Equity: Reward: 1.01525, step 2606
2. 6326 JP Equity 6954 JP Equity: Reward: 0.69300, step 2606
3. 8053 JP Equity 8058 JP Equity: Reward: -0.43573, step 2606
4. 4901 JP Equity 9613 JP Equity: Reward: -0.22818, step 2606
5. 6988 JP Equity 7267 JP Equity: Reward: 0.44917, step 2606
6. 4901 JP Equity 6702 JP Equity: Reward: -1.12489,

### 14 Dec
- experiment with running multiple gamma. 
- ls_gamma = [.5, .1, .2, .9, 0]
    - exp1: Epoch Total Ave Reward: [0.27656047941275835, -0.0678991421108197, -0.08469468350312535, -0.1809340042907028, -0.2585470411630025]
    - exp2: Epoch Total Ave Reward: [0.21300592294405787, 0.31927383581329816, -0.19477593352845785, -0.176282995815822, -0.  042197878268430175]
- ls_gamma = [.9, 0, .5, .1, .2]
    - Epoch Total Ave Reward: [0.31927383581329816, -0.12529959493439907, 0.31927383581329816, -0.2642522721225163, 0.20685181929893712]
- settle on gamma 0.5 . try reward clipping
- increasing reward ||| if reward > 2: reward += 2, elif reward > 1: reward += 1
    - Epoch Total Ave Reward: [-0.01642047141259907, -0.13440731073407086, 0.07929587537637263, 0.2268478521818123, 0.1095582760267271]
    - Epoch Ave Ave Reward: 	0.05297
    - Epoch SD  Ave Reward: 	0.12170 
- clipping reward ||| reward /= dailypnl_sd, reward = np.clip(reward,-1,1) 
    - Epoch Total Ave Reward: [-0.11145766569664421, -0.3913613850599167, 0.14032909285942924, -0.1606315485750662, 0.042197878268430175]
    - Epoch Ave Ave Reward: 	-0.09618
    - Epoch SD  Ave Reward: 	0.18254 
- increase reward and penalty ||| , reward /= dailypnl_sd
            if reward > 2:
                reward += 2
            elif reward > 1:
                reward += 1
            elif reward < -2:
                reward -= 2
            elif reward < -1:
                reward -= 1
    - Epoch Total Ave Reward: [0.21300592294405787, 0.1651926853830304, 0.33396093223633744, -0.1651926853830304, 0.062178140797388294]
    - Epoch Ave Ave Reward: 	0.12183
    - Epoch SD  Ave Reward: 	0.16801 
- combine increase reward and clipping ||| 
            reward /= dailypnl_sd
            if reward > 2:
                reward = 2
            elif reward > .5:
                reward += 1
            elif reward < -2:
                reward = -2
            elif reward < -.5:
                reward += -1
    - exp1 | Epoch Total Ave Reward: [-0.07351450140341671, 0.1760664446499776, -0.20093003717357863, 0.22364736069124266, -0.12529959493439907]
    - Epoch Ave Ave Reward: 	-0.00001
    - Epoch SD  Ave Reward: 	0.16882 
    - exp2 | Epoch Total Ave Reward: [0.23802803428152103, 0.08925582031108967, -0.04759668639518292, 0.03709799710794239, 0.04790310922794429]
    - Epoch Ave Ave Reward: 	0.07294
    - Epoch SD  Ave Reward: 	0.09377 
- combine clipping, also reward shape if further from 0, remove noise
            reward /= dailypnl_sd
            if reward > .1:
                reward = max(2,reward+1)
            elif reward < -.1:
                reward = min(-2, reward-1)
    - bad results
- revert to earlier increase reward experiment and also set alpha to 0.5. later run alpha experiment
    - reward /= dailypnl_sd
            if reward > 2:
                reward +=2
            elif reward > 0.5:
                reward +=1
            elif reward < -2:
                reward += -2
            elif reward < -0.5:
                reward += -1
    - Epoch Total Ave Reward: [0.062178140797388294, 0.18198822677533613, -0.1245877739517566]
    - Epoch Ave Ave Reward: 	0.03986
    - Epoch SD  Ave Reward: 	0.12615 
---
- rerun all with newest reward function
    - reward /= dailypnl_sd
            if reward > 2:
                reward +=2
            elif reward > 0.5:
                reward +=1
            elif reward < -2:
                reward += -2
            elif reward < -0.5:
                reward += -1
    - Epoch Total Ave Reward: [0.05321204568778015, -0.06916991576129747, -0.06925978742821415]
    - Epoch Ave Ave Reward: 	-0.02841
    - Epoch SD  Ave Reward: 	0.05771
    - really bad returns on both gamma =0 and >0. alpha was lower in =0.
    - might be under fitting the reward. maybe clip the punishment and leave the rewards
- rerun with removing the negative returns
    - not good result
- rerun with no reward scaling, only normalisation
    - small bactch run
    - gamma =0 || Epoch Total Ave Reward: [-0.14945136647535748, -0.04198132710258564, -0.09593289795481606]
        - Epoch Ave Ave Reward: 	-0.09579
        - Epoch SD  Ave Reward: 	0.04387 
    - gamma <0 || Epoch Total Ave Reward: [0.1921647228758977, 0.1920243147234945, 0.02189542255279333]
        - Epoch Ave Ave Reward: 	0.13536
        - Epoch SD  Ave Reward: 	0.08023

In [None]:
df = pd.DataFrame(agent.Q, index=env.valid_states, columns=[-1, 0, 1])
df['mean_reversion'] = [0,1,0,0,-1,0,0]
df['position'] = df[[-1, 0, 1]].idxmax(axis=1)



df.to_csv(f"q_table_gamma{agent.gamma}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv")
df

Unnamed: 0,-1,0,1,mean_reversion,position
"(0, 0, 0, 1, 0, 0)",0.150154,0.297776,-0.337173,0,0
"(0, 0, 0, 1, 1, 0)",0.100002,0.281074,0.753829,1,1
"(0, 0, 0, 1, 1, 1)",-0.598842,0.217815,-0.32986,0,0
"(0, 0, 1, 0, 0, 0)",0.346429,0.566151,0.516133,0,0
"(0, 1, 1, 0, 0, 0)",-0.048844,0.622264,1.765002,-1,1
"(1, 1, 1, 0, 0, 0)",-0.047397,0.142649,-0.169307,0,0
"(0, 0, 0, 0, 0, 0)",0.0,0.0,0.0,0,-1


In [14]:
dict_policy_actions = dict(df.position)
dict_policy_actions

{(0, 0, 0, 1, 0, 0): 0,
 (0, 0, 0, 1, 1, 0): 1,
 (0, 0, 0, 1, 1, 1): 0,
 (0, 0, 1, 0, 0, 0): 0,
 (0, 1, 1, 0, 0, 0): 1,
 (1, 1, 1, 0, 0, 0): 0,
 (0, 0, 0, 0, 0, 0): -1}

table is taking a long time to generalise. using one pair is not good enough to get any poistioning as everything is flat. only after adding all 10 pairs then performance churns out shorting on 1 SD high cross.

Experiment: try making gamma 0.1, Q table should closely mimic the state space analysis table with strong mean reversion tendency. This is not what we see. and it even has Q values opposite to state space.

In [15]:
workingPairQresults = {}
for pair_idx in range(len(top_keys)):
    df = pd.DataFrame(workingPairOutcome[top_keys[pair_idx]], columns=dummy_df.columns)

    # Assign policy values using the dictionary
    df['position'] = df.apply(lambda row: dict_policy_actions.get(tuple(row), np.nan), axis=1)
    df[df.isna().any(axis=1)]
    pair = validPairsList[pair_idx]
    #Calculate PnL
    df[f'{pair[0]} position'] = df['position']
    df[f'{pair[1]} position'] = df['position'] * -1
    df['dailypnl'] = df[f'{pair[1]} position'].values*return_df[f'{pair[1]}'].shift(-1).values \
                    + df[f'{pair[0]} position'].values*return_df[f'{pair[0]}'].shift(-1).values
    df['cumpnl'] = df['dailypnl'].cumsum()
    workingPairQresults[top_keys[pair_idx]] = df

total_q_return = 0
total_train_q_return = 0
total_test_q_return = 0

for pair in top_keys:
    total_q_return += workingPairQresults[pair]['cumpnl'].iloc[-2]
    total_train_q_return += workingPairQresults[pair]['cumpnl'].iloc[2868-2]

print("="*10, "Absolute", "="*10)
print(f"total return \t\t{total_q_return/len(top_keys):.5f}")
print(f"total train return \t{total_train_q_return/len(top_keys):.5f}")
print(f"total test return \t{(total_q_return - total_train_q_return)/len(top_keys):.5f}")
print("="*10, "Normalised", "="*10)
print(f"total return \t\t{total_q_return/len(top_keys)/(2979-261):.5f}")
print(f"total train return \t{total_train_q_return/len(top_keys)/(2868-261):.5f}")
print(f"total test return \t{(total_q_return - total_train_q_return)/len(top_keys)/(2979-2868):.5f}")

total return 		0.00195
total train return 	0.03629
total test return 	-0.03433
total return 		0.00000
total train return 	0.00001
total test return 	-0.00031
