In [2]:
## anaconda3 (Python 3.12.0) Kernel

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# pair trade packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from datetime import datetime

# Load Pairs Data


In [3]:
def custom_date_parser(date_str):
    return datetime.strptime(date_str, '%d/%m/%Y')

# Load the dictionary from the pickle file
with open('pairsOutcome.pkl', 'rb') as file:
    pairsOutcome = pickle.load(file)

print("Dictionary loaded from pairsOutcome.pkl")


# Load stock data and get return 
tpxData = pd.read_csv('TPX_prices.csv', index_col=0, parse_dates=True, date_parser=custom_date_parser)
tpxData = tpxData.dropna(axis='columns')
return_df = (tpxData / tpxData.shift(1)) - 1

Dictionary loaded from pairsOutcome.pkl


  tpxData = pd.read_csv('TPX_prices.csv', index_col=0, parse_dates=True, date_parser=custom_date_parser)


# Get Pair Trade Portfolio
`pairsOutcome` already have TOPIX stocks with highest liquidity and are tested for stationarity over a 1 year window

Choose top 10 known pair trades by returns in the total dataset

In [4]:
# Sort the keys by their cumpnl[-2] values in descending order
top_keys = sorted(
    pairsOutcome,
    key=lambda k: pairsOutcome[k].cumpnl.iloc[-2],  # Access cumpnl[-2] safely
    reverse=True
)[:10]  # Get the top 10 keys

# Print the top 10 performing trades
print("Top 10 performing trades:")
for i, key in enumerate(top_keys, 1):
    print(f"{i}. Key: {key}, Value: {pairsOutcome[key].cumpnl.iloc[-2]}")

Top 10 performing trades:
1. Key: 1801 JP Equity 2670 JP Equity, Value: 2.5797887367591246
2. Key: 3778 JP Equity 6701 JP Equity, Value: 2.537242032391529
3. Key: 2760 JP Equity 6254 JP Equity, Value: 2.3688208386917404
4. Key: 5706 JP Equity 6954 JP Equity, Value: 2.2676474298290237
5. Key: 7951 JP Equity 9684 JP Equity, Value: 2.0657325467200596
6. Key: 1808 JP Equity 6481 JP Equity, Value: 1.9929348941248262
7. Key: 3099 JP Equity 5831 JP Equity, Value: 1.939742664925484
8. Key: 1808 JP Equity 6971 JP Equity, Value: 1.9132602773493155
9. Key: 4021 JP Equity 9843 JP Equity, Value: 1.8675031161000868
10. Key: 5929 JP Equity 6504 JP Equity, Value: 1.811533049967201


# Machine Learning Challenge

## Background
Initial evaluation of the baseline portfolio shows that draw downs are small. Originally team had the idea of using Machine Learning to optimise for sizing of these pair trades. However since there was no significant drawdowns the returns are linearly increasing with investment sizing i.e. greater nominal investment in the the pair trade the proportionate increase in returns without realising significant drawdown risk.

Instead of optimising for sizing, we can explore Machine Learning in terms of strategy on this stationary dataset. Whereas our prescribed strategy is to enter at +/- 1 std dev, exit at 0 with +/- 2 std dev stop loss. These are only suggestions and arbitrary levels.

With Machine Learning, we can discover if it will uncover the mean reverting nature and recommend another threshhold. We use Q Learner to understand state space with the same spread, mid, std dev parameters as the baseline.

### Steps
#### Environment:
- State Space: A set of all possible states the agent can be in.  
  - [spread, mid, 2 sd low, 1 sd low, 1 sd high, 2 sd high]
- Action Space: A set of all possible actions the agent can take in each state.   
  - [-1, # short\
      0, # uninvested\
      1  # long]   
- Reward Function: A function that assigns a numerical reward to each state-action pair, indicating the immediate consequence of taking a particular action in a specific state.
  - dailypnl
- Transition Function: A function that determines the probability of transitioning from one state to another when a particular action is taken.
  - deterministic based on historical performance
#### Agent:

- Q-Table: A matrix that stores the estimated Q-values for each state-action pair. Q-values represent the expected future reward for taking a specific action in a given state.   
  - continuous Q table?
- Learning Rate (α): A parameter that controls how much the Q-values are updated with each new experience.   
- Discount Factor (γ): A parameter that determines the importance of future rewards. A higher discount factor gives more weight to future rewards.   
- Exploration Rate (ε): A parameter that controls the balance between exploration (trying new actions) and exploitation (choosing the action with the highest Q-value).   
- Q-Learning Algorithm:

  - Initialization: Initialize the Q-table with random values or zeros.   
  - Exploration and Exploitation: Use an exploration strategy (e.g., ε-greedy) to choose an action:
    - With probability ε, choose a random action.   
    - With probability 1-ε, choose the action with the highest Q-value for the current state.   
  
  - Take Action: Execute the chosen action in the environment.   
  - Observe Reward and Next State: Observe the immediate reward and the next state resulting from the action.
- Update Q-Value: Update the Q-value of the current state-action pair using the following formula:

#### Training and Test set

2013 is used for warm start\
2014 - 2023 train data since NN need a lot of training data {end 2023 idx == 2868}\
2024 onwards (5 months) test data


In [5]:
## Get pair stock data
def custom_date_parser(date_str):
    return datetime.strptime(date_str, '%d/%m/%Y')
valid = pd.read_csv('validPairs4.csv', 
                    index_col=0, 
                    parse_dates=True, 
                    date_parser=custom_date_parser)
## get list of pair stocks
validPairsList = [
    [item.strip() + ' Equity' for item in pair.split('Equity') if item.strip()]
    for pair in top_keys
]

  valid = pd.read_csv('validPairs4.csv',


In [6]:
rollingWindow = 262
cutLossSd = 2

In [7]:
for pair in validPairsList:
    df = pd.DataFrame()

    #Calculate Standard Deviations
    df['spread'] = valid[f'spread_{pair[0]}_{pair[1]}']
    df['mid'] =  df['spread'].rolling(rollingWindow).mean()
    df['1sd high'] = df['spread'].rolling(rollingWindow).mean() + df['spread'].rolling(rollingWindow).std()
    df['1sd low'] = df['spread'].rolling(rollingWindow).mean() - df['spread'].rolling(rollingWindow).std()
    df['2sd high'] = df['spread'].rolling(rollingWindow).mean() + df['spread'].rolling(rollingWindow).std() * cutLossSd
    df['2sd low'] = df['spread'].rolling(rollingWindow).mean() - df['spread'].rolling(rollingWindow).std() * cutLossSd
    df['position'] = 0

    df.loc[(df['spread'] > df['1sd high']) & (df['spread'] < df['2sd high']), 'position'] = -1
    df.loc[(df['spread']< df['1sd low']) & (df['spread'] > df['2sd low']), 'position'] = 1

    #Calculate PnL
    df[f'{pair[0]} position'] = df['position']
    df[f'{pair[1]} position'] = df['position'] * -1
    df['dailypnl'] = df[f'{pair[1]} position']*return_df[f'{pair[1]}'].shift(-1) + df[f'{pair[0]} position']*return_df[f'{pair[0]}'].shift(-1)
    df['cumpnl'] = df['dailypnl'].cumsum()

    pairsOutcome[f'{pair[0]} {pair[1]}'] = df

## Make indicators and spread stationary around 0
Deduct the mean from all values to translate to 0 axis

In [8]:
workingPairOutcome = {}

for pair in top_keys:
    dummy_df = pairsOutcome[top_keys[0]].iloc[::,:6]
    dummy_df = dummy_df.subtract(dummy_df['mid'], axis=0).drop(columns=['mid']) # centre spread and SD
    dummy_df = dummy_df.div(dummy_df['2sd high']-dummy_df['1sd high'],axis=0)   # express SD as integers, give spread as propotionate
    dummy_df['1sd_high_boolean'] = (dummy_df['spread']>dummy_df['1sd high']).astype(int)
    dummy_df['2sd_high_boolean'] = (dummy_df['spread']>dummy_df['2sd high']).astype(int)
    dummy_df['1sd_low_boolean'] =  (dummy_df['spread']<dummy_df['1sd low'] ).astype(int)
    dummy_df['2sd_low_boolean'] =  (dummy_df['spread']<dummy_df['2sd low'] ).astype(int)
    dummy_df = dummy_df.drop(columns=['spread','1sd high', '1sd low', '2sd high', '2sd low'])
    workingPairOutcome[pair] = dummy_df.to_numpy()

In [9]:
workingPairOutcome[top_keys[5]][-5:]  

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [10]:
# workingPairOutcome = {top_keys[0]:workingPairOutcome[top_keys[0]]}

- Test one timestep at a time (even though we can test all at the same time)
- give state
- Trading should be path dependent due to stop loss. in this case I can only give last position as one of the parameters

In [11]:
import gym
import random

class PairTradeEnv(gym.Env):
    # ... (define your environment's state space, action space, etc.)
    def __init__(self, workingPairOutcome, top_keys, validPairsList, return_df):
        # ... (initialize other parameters)
        self.earliest_step = 261  # hot start
        self.last_step = 2868
        # self.current_step = random.randint(self.earliest_step, self.last_step - 1)
        self.current_step = self.earliest_step


    def step(self, action, pair_idx):
        """
        Input
            action: single value e.g. -1 (short)
            pair_idx: index of pair trade
        Output:
            next_state: next state 
            reward: reward for last timestep
            done: boolean for if end of dataset
            info: optional
        """
        # Advance the time step
        self.current_step += 1
        # Get the next state
        next_state = workingPairOutcome[top_keys[pair_idx]][self.current_step]
        # Calculate reward (implement your reward function here)
        reward = self.calculate_reward(action, self.current_step, validPairsList[pair_idx])
        # Check for termination (implement your termination condition here)
        done = self.current_step >= self.last_step

        # Provide additional information (optional)
        info = {}

        return next_state, reward, done, info

    def reset(self, pair_idx):
        # ... (implement the reset function to initialize the environment)
        # reset to start of 2014 every time
        # self.current_step = random.randint(self.earliest_step, self.last_step - 1)
        self.current_step = self.earliest_step
        initial_state = workingPairOutcome[top_keys[pair_idx]][self.current_step]
        return initial_state
    
    def calculate_reward(self, position, idx, pair):
        """
        Give one _previous_ day's return
        Input:
            position: position for idx (current step)
            idx: usually current timestp 
            pair: tuple of tpx stock
        Output:
            dailypnl
        """
        # position = position_vector @ np.array([-1,0,1])
        position_0 = position
        position_1 = position * -1
        ## return_df gives the return for the previous day for the given idx
        dailypnl = position_0*return_df[f'{pair[0]}'].iloc[idx] + position_1*return_df[f'{pair[1]}'].iloc[idx] 

        return dailypnl

# Instantiate the custom environment
env = PairTradeEnv(workingPairOutcome, top_keys, validPairsList, return_df)

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

class QNetwork(nn.Module):
    def __init__(self, input_size, output_size, dropout_rate=0.2):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.dropout1 = nn.Dropout(p=dropout_rate) 
        self.fc2 = nn.Linear(32, 4)
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.fc3 = nn.Linear(4, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

class QLearningAgent:
    def __init__(self, input_size, output_size, learning_rate, discount_factor, epsilon, batch_size=1000, replay_buffer_size=10000):
        self.q_network = QNetwork(input_size, output_size)
        self.target_network = QNetwork(input_size, output_size)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=learning_rate) 
        self.loss_fn = nn.MSELoss()
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.learn_count = 0
        self.batch_size = batch_size
        self.state_visit_counts = {}
        self.rare_states = {}
        self.common_states = {}
        self.replay_buffer = deque(maxlen=replay_buffer_size)
        
        # Action to index mapping
        self.action_to_index = {-1: 0, 0: 1, 1: 2}
        self.index_to_action = {0: -1, 1: 0, 2: 1}


    def update_state_visit_counts(self, state):
        state_tuple = tuple(state)  # Convert state to a hashable type
        if state_tuple not in self.state_visit_counts:
            self.state_visit_counts[state_tuple] = 0
        self.state_visit_counts[state_tuple] += 1

    def update_rare_and_common_states(self):
        visit_counts = np.array(list(self.state_visit_counts.values()))
        if len(visit_counts) == 0:
            return

        rare_threshold = np.percentile(visit_counts, 20)  # Bottom 20% of visits
        common_threshold = np.percentile(visit_counts, 80)  # Top 20% of visits

        self.rare_states = {state for state, count in self.state_visit_counts.items() if count <= rare_threshold}
        self.common_states = {state for state, count in self.state_visit_counts.items() if count >= common_threshold}

    def reward_shaping(self, state):
        if state in self.rare_states:
            return 10  # Bonus for rare states
        elif state in self.common_states:
            return -1  # Penalty for common states
        return 0

    def store_experience(self, state, action, reward, next_state, done):
        self.update_state_visit_counts(next_state)  # Track the visit count for the next state
        self.replay_buffer.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            action = np.random.choice([-1, 0, 1])  # Explore
        else:
            with torch.no_grad():
                q_values = self.q_network(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
                action_index = torch.argmax(q_values, dim=1).item()  # Choose best action
            action = self.index_to_action[action_index]  # Map index to action
        return action

    def learn(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        self.learn_count += 1

        # Get samples randomly
        batch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(states, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)
        actions_np = np.array([np.array(self.action_to_index[action]) for action in actions])
        actions = torch.tensor(actions_np, dtype=torch.long).view(-1, 1)
        rewards = torch.tensor(rewards, dtype=torch.float32).view(-1, 1)
        dones = torch.tensor(dones, dtype=torch.float32).view(-1, 1)

        print(states)
        print(actions)

        # Apply reward shaping
        shaping_rewards = torch.tensor([
            self.reward_shaping(state)
            for state, _, _ in zip(states, actions_np, next_states)
        ], dtype=torch.float32).view(-1, 1)
        modified_rewards = rewards + shaping_rewards
        
        # Get Q-values for each action
        q_values = self.q_network(states).gather(1, actions)

        # Get max Q-value for the next states from target network
        next_q_values = self.target_network(next_states).max(1, keepdim=True)[0].detach()
        target_q_values = modified_rewards + self.discount_factor * next_q_values * (1 - dones)

        # Compute loss and update the Q-network
        loss = self.loss_fn(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network every few episodes
        if self.learn_count % 500 == 0:
            self.update_target_network()
        if self.learn_count % 3000 == 0:
            self.epsilon = max(0.1 , self.epsilon * .995)

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())

input_size = 4
output_size = 3  # Number of discretactions
learning_rate = 0.15
discount_factor = 0.99
epsilon = 1.0

agent = QLearningAgent(input_size, output_size, learning_rate, discount_factor, epsilon)


## Training constants
total_episodes = 1
number_of_pairs = len(workingPairOutcome)
ls_total_reward = []

# Simulating agent learning (in practice, use a loop with environment interaction)
for episode in range(total_episodes):
    # agent.epsilon = 1 - ((episode+1) / total_episodes) ** 2
    arr_pair_reward = np.zeros(number_of_pairs)

    for pair_idx in range(number_of_pairs):
        state = env.reset(pair_idx)
        pair_reward = 0
        done = False
        
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action, pair_idx)
            pair_reward += reward
            # TODO remove after experiment
            # if reward < 0:
            #     reward *= 100

            agent.store_experience(state, action, reward, next_state, done)
            agent.learn()
            
            state = next_state

        arr_pair_reward[pair_idx] = pair_reward
    
    agent.update_rare_and_common_states()  # Reclassify rare and common states
    total_reward = arr_pair_reward.mean()
    print(f"Episode {episode+1}: Total Reward: {total_reward:.3f}, Epsilon: {agent.epsilon:.2f}")
    # Capture total_reward for further analysis
    ls_total_reward.append(total_reward)

# evaluate random and final trained performance
plt.plot(ls_total_reward)

# After training, save the entire Q-network
torch.save(agent.q_network, 'q_network.pth')

  states = torch.tensor(states, dtype=torch.float32)


tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 1., 1.]])
tensor([[2],
        [2],
        [0],
        [1],
        [2],
        [0],
        [0],
        [0],
        [2],
        [1],
        [2],
        [1],
        [0],
        [1],
        [0],
        [2],
        [1],
        [1],
        [2],
        [0],
        [0],
        [1],
        [2],
        [0],
        [2],
        [2],
        [2],
        [0],
        [1],
        [1],
        [1],
        [0],
        [0],
        [0],
        [1],
        [2],
        [1],
        [2],
        [0],
        [0],
        [1],
        [1],
        [2],
        [1],
        [2],
        [0],
        [2],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [2],
        [1],
        [2],
        [2],
        [2],
        [1],
        [1]

KeyboardInterrupt: 

### 9 Dec
- state cases should be imbalanced


In [None]:
loaded_q_network = torch.load('q_network.pth')

loaded_q_network = QNetwork()  # Replace QNetwork with your model class
loaded_q_network.load_state_dict(torch.load('q_network_weights.pth'))
loaded_q_network.eval()

  loaded_q_network = torch.load('q_network.pth')


TypeError: QNetwork.__init__() missing 2 required positional arguments: 'input_size' and 'output_size'

In [None]:
## Get baseline results
# t_pair = validPairsList[0]
# max_steps_per_episode = 3000

# def get_baseline(env, max_steps_per_episode, t_pair):
#     env.reset()
#     total_reward = 0
#     current_step = 261
#     env.current_step = current_step
#     env.last_step = 2868

#     for step in range(max_steps_per_episode):
#         action = pairsOutcome[t_pair].iloc[env.current_step]['position']
#         _, reward, done, _ = env.step(action)
#         total_reward += reward

#         if done:
#             break

#     print(f"Baseline {t_pair}, Total Reward: {total_reward}, step {step}")

# get_baseline(env, 3000, top_keys[0])

### 30 Nov
- first few tries, network is very large
- added epsilon search in "choose_action" functionso that there will be some chance to explore
- changed reward function to multiply losses and give exponential returns to incentivise risk taking

### 1 dec 2105: 
- might have performance is always oscillating negative and positive. This might be because of too large a learning rate. also start from start of training periods max steps to be 3000 so that total results are comparable
    - this helped quite abit. 
    
`
input_size = 7  # Adjust to your specific input size
output_size = 3  # Adjust to your desired number of discrete actions
learning_rate = 0.1
discount_factor = 0.8
epsilon = 1 # down to .3
epsilon_decay = 0.9999
num_episodes = 500
max_steps_per_episode = 3000
`

- want to try changing epsilon to only update after the entire episode instead of after each step. its decaying too quickly
- I want to try with changing reward by changing "learn" to use total_reward instead of "reward"
- Scale the states. need to explore scaling the state since it is still in terms of absolute differences. NN is not able to do proportions
- training epochs should be smaller at up to 30 days because mean reversion pattern is 1 to 33 days
    - very bad performance with 40 day epochs

### 1 dec 2217:
- changed target q value fxn to remove exponential reward and scaled negative reward. now both positive and negative are the same. added portion of total reward in episode to incentivise more long term rewards.
    - `        if reward > 0:
            target_q_value = reward + self.discount_factor * next_q_value * (1 - done) + total_reward * .1
        else:
            target_q_value = reward + self.discount_factor * next_q_value * (1 - done) + total_reward * .1`
    -       `  if episode%1==0:
            agent.epsilon *= agent.epsilon_decay`

### 2 Dec 2101:
- managed to scale but results are not any better
- thinking of reducing learning rate to reduce the oscillations
    - will try to run with learning rate at 0.01
- right now total reward is taking all of the target q function. maybe can make it a 50/50 split

### 3 dec
- training taking 
    - a full length training dataset.
    - 1000 per learning step
    - learning rate test (1.5 mins per episode)
        - episodes: 5
        - learning rate: 0.05 ==> total reward: .037
        - learning rate: 0.5 ==> total reward: -.6
        - learning rate: 0.3 ==> total reward: 0.88, .044
        - learning rate: 0.15 ==> total reward: -.023
        - learning rate: 0.25 ==> total reward: -.6
        - learning rate: 0.35 ==> total reward: .028
    - with drop out layer test (1.75 mins per episode)
        - learning rate: 0.3 ==> total reward: -0.488
        - learning rate: 0.4 ==> total reward: -0.422
        - learning rate: 0.5 ==> total reward: .26, .13, .096
        - learning rate: 0.6 ==> total reward: .03
        - learning rate: 0.7 ==> total reward: -.08
- performance still bad. should include dropout layer? --> performance a bit worse but more consistent
- try removing spread so that the input is only boolean of SD and last position
    - drop out layer test (1.75 mins per episode)
        - learning rate: 0.1 ==> total reward: -.36
        - learning rate: 0.3 ==> total reward: -.01
        - learning rate: 0.5 ==> total reward: -.26
### 5 dec
- previously target and online network updated at the same time, but it should be used to regularise. So will try with updating more periodically instead of every learn step. reduced NN to 16 and 8 hidden layers with dropout (1.67 min per episode)
    - learning rate: 0.3, update target every 10 learn occurences ==> total reward: .03
    - learning rate: 0.3, update target every 100 learn occurences ==> total reward: .26
    - learning rate: 0.3, update target every 500 learn occurences ==> total reward: -.383
    - learning rate: 0.3, update target every 600 learn occurences ==> total reward: .22
    - learning rate: 0.3, update target every 675 learn occurences ==> total reward: .41
    - learning rate: 0.3, update target every 700 learn occurences ==> total reward: .10, .03
    - learning rate: 0.3, update target every 750 learn occurences ==> total reward: .40, .15, -0.353
    - learning rate: 0.3, update target every 1000 learn occurences ==> total reward: .05
    - learning rate: 0.3, update target every 10 learn occurences, remove dropout layers ==> total reward: -.203
    - learning rate: 0.5, update target every 10 learn occurences, remove dropout layers ==> total reward: -.334
- changed ADAM optimiser to SGD (1.6 min per episode)
    - learning rate: 0.15, update target every 750 learn occurences ==> total reward: .084
    - learning rate: 0.3, update target every 750 learn occurences ==> total reward: -.20
    - learning rate: 0.5, update target every 750 learn occurences ==> total reward: .303
    - learning rate: 0.6, update target every 750 learn occurences ==> total reward: -.110
    - learning rate: 0.7, update target every 750 learn occurences ==> total reward: -.556, -0.449
- long term run with 50 episodes
    - SGD, learning rate: 0.5, update target every 750 learn occurences ==> total reward: -.04 (79 min)
    - ADAM, learning rate: 0.3, update target every 750 learn occurences ==> total reward: -.01 (84.5 min)
    - ADAM, 32 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: 0.54 (88 min)
- long term run with 300 episodes
    - ADAM, 32X16 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward:  -0.28(540 min)
### 6 Dec
1. reduce to only 4 SD flags - Done
2. discount factor up to .99 - Done
3. try increasing punishment with X10 negative reward if less than 0 - Done

- there is a positive gradient
    - 5 episodes, X10 negative reward ADAM, 32X16 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: .05
    - 5 episodes, X10 negative reward ADAM, 32X8 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: -0.01
    - 5 episodes, X100 negative reward ADAM, 32X16 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: -0.01 (8 mins)
    - 5 episodes, X100 negative reward ADAM, 32X16 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: 0.014
    - 300 episodes, X100 negative reward ADAM, 32X16 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: 0.017
    - 50 episodes, X100 negative reward ADAM, 32X16 hidden layer, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: 
- try reducing net to only 4 neurons
    - 5 episodes, X100 negative reward ADAM, 4X4 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: 0.0
    - 10 episodes, X100 negative reward ADAM, 4X4 hidden layer, learning rate: 0.3, update target every 750 learn occurences ==> total reward: 0.0
### 8 Dec 
- 50 episodes, X100 negative reward ADAM, 4X4 hidden layer, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: 0.118
- 300 episodes, X100 negative reward ADAM, 4X4 hidden layer, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: -0.02 (epsilon 10), 0.29 (epsilon .46)
- try removing previous action in state space to move to 4 state spaces only . 4X4
    - 4X4, 10 episodes, X100 negative reward ADAM, 4X4 hidden layer, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: 0.282
    - 4X4, epsilon_decay = 0.955, 20 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: -.20
    - 4X4, epsilon_decay = 0.9885, 20 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: 0.08
    - 32X4, epsilon_decay = 0.9885, 20 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: 0.07
    - add back dropout layers, 32X4, epsilon_decay = 0.9885, 20 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: .21
    - add back dropout layers, 32X4, epsilon_decay = 0.9885, 10 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.5 ==> total reward: .002
    - add back dropout layers, 32X4, epsilon_decay = 0.9885, 10 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.5 ==> total reward: .002
    - add back dropout layers, 32X4, epsilon_decay = 0.9885, 300 episodes, X100 negative reward ADAM, learning rate: 0.3, update target every 750 learn occurences, discount_factor = 0.99 ==> total reward: 0.115