In [99]:
# # **Project 2: Stock Portfolio Optimization - Assignment 3**
# Athanasakis Evangelos 2019030118 // Fragkogiannis Yiorgos 2019030039


# Importing libraries


import numpy as np
import tkinter as tk #loads standard python GUI libraries
import random
from tkinter import *
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
from collections import deque
from tqdm import tqdm
import optuna




# Environments

In [100]:
#-------------------------------__________________Environments___________________-------------------------------------------------------------------------
# Generating environments


# Create the three different environments
# We are modeling this environment using 8 states in the format: {stock_currently_holding,state_of_stock_1,state_of_stock_2}

action_keep = 0     # keep the same stock
action_switch = 1   # switch to the other stock


def _build_two_stock_environment(fee):
    """Build the 8-state, two-stock MDP used by questions 1 and 2.

    State ids encode {stock_currently_held, level_of_stock_1, level_of_stock_2}:

        0:{1,L,L}  1:{1,L,H}  2:{1,H,L}  3:{1,H,H}
        4:{2,L,L}  5:{2,L,H}  6:{2,H,L}  7:{2,H,H}

    Price dynamics (identical for both actions):
      * stock 1 moves to L or H with probability 1/2 each, whatever its level;
      * stock 2 keeps its level with probability 9/10 and flips with 1/10.
    The joint probability of a next price state is therefore 9/20 when
    stock 2 keeps its level and 1/20 when it flips.

    Rewards depend on the stock held after the action and on the level that
    stock lands on:
        r1_L = -0.02, r1_H = +0.1, r2_L = +0.01, r2_H = +0.05   (r1_H = 2 * r2_H)

    Parameters
    ----------
    fee : float
        Amount *added* to the reward of every switch transition, so a
        transaction cost is expressed as a negative number.

    Returns
    -------
    dict
        ``P[state][action]`` -> list of ``(probability, next_state, reward)``
        tuples, ordered by ascending ``next_state``.
    """
    level_rewards = (
        (-0.02, +0.1),    # stock 1: (reward when L, reward when H)
        (+0.01, +0.05),   # stock 2: (reward when L, reward when H)
    )

    def transitions(current_level_2, target_stock, switch_fee):
        # One row per next price state (LL, LH, HL, HH) of the target stock's block.
        rows = []
        for price_state in range(4):
            levels = (price_state >> 1, price_state & 1)   # (stock 1 level, stock 2 level)
            probability = 9/20 if levels[1] == current_level_2 else 1/20
            reward = level_rewards[target_stock][levels[target_stock]]
            if switch_fee is not None:
                reward = reward + switch_fee
            rows.append((probability, 4 * target_stock + price_state, reward))
        return rows

    P = {}
    for state in range(8):
        held_stock = state // 4     # 0 -> stock 1, 1 -> stock 2
        level_2 = state & 1         # current level of stock 2 (0 = L, 1 = H)
        P[state] = {
            action_keep: transitions(level_2, held_stock, None),
            action_switch: transitions(level_2, 1 - held_stock, fee),
        }
    return P


# This environment is used for question 1, where we need to demonstrate that the optimal
# policy is always to stay with the stock we have already invested in.
# We use a large transaction fee so that the best policy is always to keep the same stock.
fee = -0.9
P1 = _build_two_stock_environment(fee)


# This environment implements the stocks environment from the midterm.
# It is used for question 2, where we need to demonstrate that the optimal policy
# is to switch in some of the states and to stay in others.
fee = -0.01
P2 = _build_two_stock_environment(fee)


# This environment implements the generic scenario of question 3 where for every stock
# ri_H,ri_L are chosen uniformly in [-0.02, 0.1] and transition probabilities pi_HL, pi_LH
# are equal to 0.1 for half the stocks and 0.5 for the other half.

# Since every stock can have two price states, the total number of states in the MDP
# we are creating will be NumOfStocks * 2^NumOfStocks


def decimal_to_binary_array(decimal, length):
    """Return the binary digits of ``decimal`` as a list of ints, most significant bit first.

    The digit string is left-padded with zeros up to ``length`` characters.
    If the number needs more than ``length`` digits it is not truncated.
    """
    # bin() yields e.g. '0b101'; drop the '0b' prefix, then zero-pad on the left.
    digits = bin(decimal)[2:].rjust(length, '0')
    return list(map(int, digits))


# Function that generates the environment of N stocks dynamically, with a transaction fee
def generate_environment(N,fee):
    """Generate the MDP for ``N`` stocks that each have a Low/High price level.

    There are ``N * 2**N`` states. State ``i`` means "holding stock
    ``i // 2**N`` while the price levels are the N-bit binary expansion of
    ``i % 2**N``" (bit k, most significant first, is stock k; 0 = L, 1 = H).
    For two stocks: LL -> 0, LH -> 1, HL -> 2, HH -> 3.

    Model:
      * Each stock k has its own level-transition probabilities: the first half
        of the stocks use pi_LH = pi_HL = 0.1, the rest use 0.5. The joint
        probability of a price move is the product over all stocks of each
        stock's own probability (``pi[k]``), and does not depend on the action.
      * Each stock k gets one pair of rewards (r_L, r_H), drawn once, uniformly
        in [-0.02, 0.1]. A transition's reward is the reward of the stock held
        *after* the action at the level that stock lands on.
      * ``action_keep`` stays on the held stock.
      * ``action_switch`` moves to one of the other N-1 stocks with equal
        probability, so the listed switch probabilities sum to 1. The reward
        is reduced by ``fee``.

    Parameters
    ----------
    N : int
        Number of stocks. With N == 1 the switch list of every state is empty.
    fee : float
        Transaction fee *subtracted* from the reward of every switch transition
        (so a cost is a positive number here, unlike in P1/P2 where a negative
        fee is added).

    Returns
    -------
    dict
        ``P[state][action]`` -> list of ``(probability, next_state, reward)``
        tuples ordered by ascending ``next_state``.
    """
    states_for_each_stock = 2**N
    total_states = N * states_for_each_stock

    # Per-stock level-transition probabilities, laid out as [LL, LH, HL, HH].
    pi = []
    for i in range(N):
        if i < N/2:
            pi.append([0.9, 0.1, 0.1, 0.9])   # pi_HL = pi_LH = 0.1
        else:
            pi.append([0.5, 0.5, 0.5, 0.5])   # pi_HL = pi_LH = 0.5

    # Per-stock rewards (r_L, r_H), drawn once so that a reward is a property of
    # the stock and its level rather than of an individual transition.
    stock_rewards = [
        (random.uniform(-0.02, 0.1), random.uniform(-0.02, 0.1)) for _ in range(N)
    ]

    # Level bits of every price state, computed once instead of per transition.
    level_bits = [decimal_to_binary_array(price_state, N)
                  for price_state in range(states_for_each_stock)]

    def joint_probability(current_bits, target_bits):
        # Product of every stock's own L/H transition probability.
        # Index 2*current + target maps (L,L),(L,H),(H,L),(H,H) -> 0,1,2,3.
        probability = 1
        for k in range(N):
            probability = probability * pi[k][2 * current_bits[k] + target_bits[k]]
        return probability

    other_stocks = N - 1   # number of possible switch destinations

    P = {}
    for i in tqdm(range(total_states)):
        stock = i // states_for_each_stock                            # stock currently held (ids start at 0)
        current_bits = level_bits[i - stock * states_for_each_stock]  # current L/H level of every stock

        # The price move is independent of the action: compute it once per state.
        move_probabilities = [joint_probability(current_bits, target_bits)
                              for target_bits in level_bits]

        #__Keep Stock_________________________________________________________
        action_Keep = []
        for target_state, target_bits in enumerate(level_bits):
            nextState = stock * states_for_each_stock + target_state
            reward = stock_rewards[stock][target_bits[stock]]
            action_Keep.append((move_probabilities[target_state], nextState, reward))

        #__Switch Stock_______________________________________________________
        action_Switch = []
        for trans_stock in range(N):
            if trans_stock == stock:   # staying on the same stock is the keep action
                continue
            for target_state, target_bits in enumerate(level_bits):
                nextState = trans_stock * states_for_each_stock + target_state
                reward = stock_rewards[trans_stock][target_bits[trans_stock]] - fee
                # Spread the probability mass evenly over the N-1 destination stocks
                # so the switch distribution sums to 1 (a no-op for N == 2).
                transitionProb = move_probabilities[target_state] / other_stocks
                action_Switch.append((transitionProb, nextState, reward))

        P[i] = {action_keep: action_Keep, action_switch: action_Switch}

    return P



## Phase 1, Policy Evaluation/Iteration

In [101]:

def policy_evaluation(pi, P, gamma = 1.0, epsilon = 1e-10):
    """Iteratively compute the value function of a fixed policy.

    Parameters
    ----------
    pi : callable
        Policy to evaluate; ``pi(s)`` returns the action taken in state ``s``.
    P : dict
        Model of the environment: ``P[s][a]`` is a list of
        ``(probability, next_state, reward)`` tuples. States are 0 .. len(P)-1.
    gamma : float
        Discount factor (default 1.0).
    epsilon : float
        Convergence threshold on the largest change of V between two sweeps.

    Returns
    -------
    numpy.ndarray
        The converged value function, one float entry per state.

    Notes
    -----
    The loop only stops when two consecutive sweeps differ by less than
    ``epsilon``. With ``gamma = 1.0`` on a model without terminal states the
    values may keep growing and the loop may never terminate.
    """
    prev_V = np.zeros(len(P))   # "cost-to-go", i.e. the estimate used for V(s')
    while True:
        V = np.zeros(len(P))    # value function computed in this sweep
        for s in range(len(P)):
            # One Bellman backup: expectation over the transitions of the
            # action dictated by the (fixed) policy.
            for prob, next_state, reward in P[s][pi(s)]:
                # Accumulate as float: casting to an integer here would truncate
                # fractional returns and freeze V at 0.
                V[s] += prob * (reward + gamma * prev_V[next_state])
        if np.max(np.abs(prev_V - V)) < epsilon:   # new estimate close enough to the previous one
            break
        prev_V = V.copy()       # freeze the new values (used as the next V(s'))
    return V


def policy_improvement(V, P, gamma=1.0):
    """Build the greedy policy with respect to a value function.

    Parameters
    ----------
    V : sequence of float
        Value function used as cost-to-go, indexed by state.
    P : dict
        Model of the environment: ``P[s][a]`` is a list of
        ``(probability, next_state, reward)`` tuples. Actions are expected to
        be keyed 0 .. len(P[s])-1.
    gamma : float
        Discount factor (default 1.0).

    Returns
    -------
    (callable, numpy.ndarray)
        ``new_pi`` with ``new_pi(s)`` the action of highest Q value in state
        ``s``, and the Q table of shape ``(len(P), len(P[0]))``.
    """
    Q = np.zeros((len(P), len(P[0])), dtype=np.float64)
    for s in range(len(P)):           # for every state in the model
        for a in range(len(P[s])):    # and for every action in that state
            for prob, next_state, reward in P[s][a]:
                Q[s][a] += prob * (reward + gamma * V[next_state])

    # Build the state -> greedy action table once. Rebuilding it inside the
    # policy function would cost O(number of states) on every single lookup.
    greedy_actions = dict(enumerate(np.argmax(Q, axis=1)))
    new_pi = lambda s: greedy_actions[s]

    return new_pi,Q

# policy iteration is simple, it will call alternatively policy evaluation then policy improvement, till the policy converges.

def policy_iteration(P, gamma = 1.0, epsilon = 1e-10):
    """Alternate policy evaluation and policy improvement until the policy is stable.

    Parameters
    ----------
    P : dict
        Model of the environment: ``P[s][a]`` is a list of
        ``(probability, next_state, reward)`` tuples.
    gamma : float
        Discount factor forwarded to evaluation and improvement.
    epsilon : float
        Convergence threshold forwarded to ``policy_evaluation``.

    Returns
    -------
    (V, pi, Q_table)
        Value function of the last evaluated policy, the final policy as a
        callable ``pi(s) -> action``, and the Q table of the last improvement.
    """
    n_states = len(P)
    t = 0   # number of outer (evaluate + improve) iterations

    # Start from a random action in every state. Policies are passed around as
    # functions, so wrap a lookup table that is built once, not on every call.
    random_actions = np.random.choice(tuple(P[0].keys()), len(P))
    initial_actions = dict(enumerate(random_actions))
    pi = lambda s: initial_actions[s]

    while True:
        old_pi = {s: pi(s) for s in range(n_states)}        # snapshot to compare with the improved policy
        V = policy_evaluation(pi,P,gamma,epsilon)           # converged value function of the current policy
        pi,Q_table = policy_improvement(V,P,gamma)          # greedy policy w.r.t. that value function

        t += 1
        # Converged when the "improved" policy equals the previous one in every state.
        if old_pi == {s:pi(s) for s in range(n_states)}:
            break
    print('Converged after %d Policy Iterations' %t)
    return V,pi,Q_table


# Function to print policy
def print_policy(policy, num_states=8):
    """Print, one line per state, the action ``policy`` selects for states 0 .. num_states-1."""
    chosen_actions = map(policy, range(num_states))   # lazy: policy(s) is called right before its line is printed
    for state, action in enumerate(chosen_actions):
        print(f"State {state}: Action {action}")
     

Useful Functions for Tabular Q-Learning and DQL

In [102]:
def calculate_difference_and_mse(q1, q2):
    """Return the element-wise difference ``q1 - q2`` and its mean squared error.

    Both arguments are 2-D tables (sequences of rows) with identical dimensions.

    Returns
    -------
    (list of list, number)
        The table of differences and the mean of the squared differences.

    Raises
    ------
    ValueError
        If the tables do not have the same number of rows, or a pair of
        corresponding rows differs in length.
    """
    same_row_count = len(q1) == len(q2)
    if not same_row_count or not all(len(left) == len(right) for left, right in zip(q1, q2)):
        raise ValueError("Both tables must have the same dimensions.")

    differences = [
        [a - b for a, b in zip(left, right)]
        for left, right in zip(q1, q2)
    ]

    # Accumulate in row-major order with plain additions.
    squared_error_sum = 0
    cell_count = 0
    for diff_row in differences:
        for delta in diff_row:
            squared_error_sum += delta ** 2
            cell_count += 1

    return differences, squared_error_sum / cell_count


def check_q_table_convergence(prev_Q, current_Q, epsilon=0.001):
    """
    Tell whether two successive Q-tables are close enough to count as converged.

    Parameters:
    - prev_Q (np.ndarray or None): Q-table before the update.
    - current_Q (np.ndarray): Q-table after the update.
    - epsilon (float): Convergence threshold.

    Returns:
    - bool: True when the largest absolute element-wise change is strictly
      below epsilon; False otherwise, including when prev_Q is None.
    """
    if prev_Q is None:
        # Nothing to compare against yet.
        return False

    largest_change = np.max(np.abs(prev_Q - current_Q))
    return bool(largest_change < epsilon)


# This function is used to simulate the environment's response.
# It takes as input the environment, the current state and the action we have selected,
# and it returns the next state and the reward.
def get_response(environment, state, action):
    """Sample one transition of ``environment`` for taking ``action`` in ``state``.

    ``environment[state][action]`` is a list of
    ``(probability, next_state, reward)`` tuples. One of them is drawn with
    ``random.choices`` using the probabilities as weights.

    Returns
    -------
    (next_state, reward) of the drawn transition.
    """
    outcomes = environment[state][action]
    weights = [outcome[0] for outcome in outcomes]

    # k=1 asks for a single draw; [0] unwraps it from the returned list.
    picked = random.choices(range(len(outcomes)), weights=weights, k=1)[0]

    chosen = outcomes[picked]
    return chosen[1], chosen[2]


# Phase 2
 Implementing Tabular Q-Learning

In [103]:
#==============================================================================================================================
#################### Q-Learning ################
#===== Hyperparameters ===================
# alpha -> Learning rate
# gamma -> Discount factor
# epsilon -> Exploration rate
# epsilon_decay -> Decay rate for epsilon
# alpha_decay -> Decay rate for alpha
# min_epsilon -> Minimum epsilon value
# num_of_episodes -> Number of episodes

def implement_Q_learning(environment, num_of_episodes, alpha, gamma, epsilon_decay, alpha_decay, finding_parameters):
    """Tabular Q-learning with epsilon-greedy exploration.

    Parameters
    ----------
    environment : dict
        ``environment[s][a]`` is a list of ``(probability, next_state, reward)``
        tuples; it is only sampled through ``get_response``.
    num_of_episodes : int
        Maximum number of episodes. Each episode starts in a random state and
        runs for 100 steps.
    alpha : float
        Initial learning rate. After each episode it becomes
        ``max(0.001, alpha * exp(-alpha_decay * episode))``.
    gamma : float
        Discount factor.
    epsilon_decay : float
        Factor applied to the exploration rate after each episode. The rate
        starts at 1.0 and never drops below 0.1.
    alpha_decay : float
        Exponential decay rate of the learning rate.
    finding_parameters : bool
        When equal to True, training stops early once the Q-table has changed
        by less than 0.00002 over an episode on three episodes (counted over
        the whole run, not necessarily consecutive).

    Returns
    -------
    (Q, convergence_episode)
        The learned Q-table and the episode at which early stopping fired
        (``float('inf')`` if it never did).

    NOTE(review): exploration draws from the hard-coded action list [0, 1],
    whatever the number of columns of Q.
    """
    state_count = len(environment)
    Q = np.zeros((state_count, len(environment[0])))

    exploration_rate = 1.0
    exploration_floor = 0.1
    starting_alpha = alpha
    alpha_floor = 0.001
    convergence_episode = float('inf')   # stays infinite unless early stopping fires
    converged_episodes_seen = 0

    for episode in tqdm(range(num_of_episodes)):
        Q_before_episode = np.copy(Q)
        state = random.randint(0, state_count - 1)   # random starting state

        for _ in range(100):   # fixed-length episode
            # Explore with probability exploration_rate, otherwise act greedily.
            exploring = random.uniform(0,1) < exploration_rate
            action = random.choice([0,1]) if exploring else np.argmax(Q[state])

            next_state, reward = get_response(environment, state, action)

            # Temporal-difference update towards reward + discounted best next value.
            td_target = reward + gamma * np.max(Q[next_state])
            Q[state, action] += alpha * (td_target - Q[state, action])

            state = next_state

        # Decay the exploration and learning rates once per episode.
        exploration_rate = max(exploration_floor, exploration_rate * epsilon_decay)
        alpha = max(alpha_floor, starting_alpha * np.exp(-alpha_decay * episode))

        if not (finding_parameters == True and check_q_table_convergence(Q_before_episode, Q, epsilon=0.00002)):
            continue

        converged_episodes_seen += 1
        if converged_episodes_seen > 2:
            convergence_episode = episode
            print("convergence_episode = ",convergence_episode)
            converged_episodes_seen = 0
            break

    return Q, convergence_episode



# environment = P2
# alpha = 0.5
# gamma = 0
# V_opt1,P_opt1,Q_opt = policy_iteration(environment,gamma)


# Define objective function for Optuna
# Optuna tries to minimise the output of the objective function by modifying the hyperparameters of the tabular Q-learning algorithm.
# The output of the function combines the MSE between the learned Q-table and the optimal one (Q_opt) with the number of episodes it took to converge.
# Because finding a correct policy is more important than the number of episodes, the MSE is multiplied by a large weight (10^16 in the code below,
# further scaled by one plus the number of states whose greedy action differs from the optimal one) so that it has the greater impact on the output.
def objective(trial):
    """Optuna objective: score one set of Q-learning hyperparameters on P2 (lower is better).

    Samples ``alpha`` (log scale, 0.5-0.9), ``epsilon_decay`` (0.95-0.999) and
    ``alpha_decay`` (0.001-0.01) from ``trial``, trains with gamma = 0 and
    early stopping enabled, then returns

        mse * 10000000000000000 * (mismatching_states + 1) + convergence_episode / 10

    where ``mse`` compares the learned Q-table with ``Q_opt``,
    ``mismatching_states`` counts states whose greedy action differs from the
    one in ``Q_opt``, and a run that never converged counts as 10000 episodes.

    NOTE(review): ``Q_opt`` is read as a module-level name and must be defined
    before this function is called.
    """
    environment = P2  # Define your environment here
    num_of_episodes = 10000  # Adjust as needed
    gamma = 0
    finding_parameters = True

    # Hyperparameters proposed by Optuna for this trial.
    alpha = trial.suggest_float('alpha', 0.5, 0.9, log=True)
    epsilon_decay = trial.suggest_float('epsilon_decay', 0.95, 0.999)
    alpha_decay = trial.suggest_float('alpha_decay', 0.001, 0.01)

    Q, convergence_episode = implement_Q_learning(environment, num_of_episodes, alpha, gamma, epsilon_decay, alpha_decay, finding_parameters)
    learned_policy = np.argmax(Q,axis=1)
    print(learned_policy)

    # A run that never triggered early stopping is charged the full episode budget.
    if convergence_episode == float('inf'):
        convergence_episode = 10000

    _, mse = calculate_difference_and_mse(Q_opt, Q)

    difference_count = 0
    for optimal_action, learned_action in zip(np.argmax(Q_opt,axis=1), learned_policy):
        if optimal_action != learned_action:
            difference_count += 1

    # Accuracy dominates the score; convergence speed acts as a tie-breaker.
    result = mse * 10000000000000000 * (difference_count+1) + convergence_episode/10
    print("mse: ",mse," result: ", result)
    return result


# def count_tables_differences(table1, table2):
#     if len(table1) != len(table2):
#         raise ValueError("Both tables must have the same length.")
    
#     difference_count = sum(1 for x, y in zip(table1, table2) if x != y)
#     return difference_count
    


# Implementing a Deep Q-Learning Neural Network 

In [104]:

####################____TASK3____########################################

# Define memory for Experience Replay
class ReplayMemory():
    """Fixed-capacity buffer of transitions for experience replay.

    Backed by a ``deque`` with ``maxlen``: once full, appending a new
    transition discards the oldest one.
    """

    def __init__(self, maxlen):
        # Start empty; capacity is enforced by the deque itself.
        self.memory = deque(maxlen=maxlen)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def append(self, transition):
        """Store one transition at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return ``sample_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, sample_size)

# Define model
class DQN(nn.Module):
    """Two-layer fully connected Q-network: input -> hidden (ReLU) -> one output per action."""

    def __init__(self, in_states, h1_nodes, out_actions):
        super().__init__()

        # Layers are created in this order (fc1, then out).
        self.fc1 = nn.Linear(in_features=in_states, out_features=h1_nodes)    # hidden layer
        self.out = nn.Linear(in_features=h1_nodes, out_features=out_actions)  # output layer

    def forward(self, x):
        """Return the output-layer activations for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        return self.out(hidden)



# Class That Implements Our Deep Q-Network
class stock_market_trading_DQN():
    """Deep Q-Network agent for the stock-trading environment.

    The environment is a mapping indexed as ``P[state][action]``; ``len(P)`` is
    taken as the number of states and ``len(P[0])`` as the number of actions.
    States are presented to the network as one-hot vectors. Training uses a
    policy network, a periodically synchronised target network and an
    experience-replay buffer.
    """

    # HyperParameters
    alpha = 0.001               # Learning rate (overwritten by train_DQN's `lr`)
    gamma = 0                   # Discount factor (overwritten by train_DQN's `gamma`)
    synching_period = 100       # Environment-step threshold for copying policy -> target weights
    replay_buffer_size = 10000  # Capacity of the replay buffer
    min_batch_size = 64         # Number of transitions sampled per optimisation step

    # Mean-squared error between predicted and target Q-values.
    # (nn.SmoothL1Loss / Huber was tried as an alternative.)
    loss_func = nn.MSELoss()
    optimizer = None            # Created in train_DQN once the policy network exists
    ACTIONS = [0,1]
    num_actions = 2

    # File the trained policy weights are written to by train_DQN and read
    # back from by test_DQN.
    model_path = "frozen_lake_dql.pt"

    def state_to_dqn_input(self, state:int, num_states:int)->torch.Tensor:
        """Return a one-hot vector of length `num_states` with a 1 at index `state`."""
        input_tensor = torch.zeros(num_states)
        input_tensor[state] = 1
        return input_tensor

    def train_DQN(self, episodes, environment, gamma, lr, steps_per_episode=100):
        """Train a policy network on `environment` and return it.

        Args:
            episodes: number of training episodes; also sets the linear
                epsilon decay (epsilon drops by 1/episodes per optimised episode).
            environment: transition table indexed as ``P[state][action]``.
            gamma: discount factor, stored on ``self.gamma``.
            lr: learning rate for Adam, stored on ``self.alpha``.
            steps_per_episode: environment steps collected per episode
                (defaults to the previously hard-coded 100).

        Returns:
            The trained policy network. Its state dict is also saved to
            ``self.model_path``.

        Each episode starts from a random state, collects `steps_per_episode`
        epsilon-greedy transitions into the replay buffer and then performs a
        single minibatch update. The target network is refreshed when the step
        counter exceeds ``synching_period``; that check runs once per episode,
        right after the update.
        """
        P = environment
        num_of_states = len(P)
        num_of_actions = len(P[0])

        epsilon = 1  # Exploration rate: start fully random
        self.gamma = gamma
        self.alpha = lr
        memory_buffer = ReplayMemory(self.replay_buffer_size)

        # Policy and target networks share one architecture: one-hot state in,
        # one Q-value per action out. The hidden layer width equals the number
        # of states (adjustable).
        policy_dqn = DQN(in_states=num_of_states, h1_nodes=num_of_states, out_actions=num_of_actions)
        target_dqn = DQN(in_states=num_of_states, h1_nodes=num_of_states, out_actions=num_of_actions)

        # Start with identical weights in both networks.
        target_dqn.load_state_dict(policy_dqn.state_dict())

        self.optimizer = torch.optim.Adam(policy_dqn.parameters(), lr=self.alpha)

        synch_counter = 0  # environment steps since the last target sync

        for _episode in tqdm(range(episodes)):
            current_state = random.randint(0, num_of_states - 1)  # random starting state

            for _ in range(steps_per_episode):
                if random.random() < epsilon:
                    # Explore: uniform over every action the environment offers.
                    action = random.randrange(num_of_actions)
                else:
                    # Exploit: action with the largest predicted Q-value.
                    with torch.no_grad():
                        action = policy_dqn(self.state_to_dqn_input(current_state, num_of_states)).argmax().item()

                # Ask the environment for the outcome and remember the transition.
                next_state, reward = get_response(P, current_state, action)
                memory_buffer.append((current_state, action, next_state, reward))

                current_state = next_state
                synch_counter += 1

            # One optimisation step per episode, once enough samples exist.
            if len(memory_buffer) > self.min_batch_size:
                mini_batch = memory_buffer.sample(self.min_batch_size)
                self.optimize(mini_batch, policy_dqn, target_dqn)

                # Linear epsilon decay down to 0.
                epsilon = max(epsilon - 1/episodes, 0)

                # Copy policy weights into the target network once enough
                # environment steps have passed since the last copy.
                if synch_counter > self.synching_period:
                    target_dqn.load_state_dict(policy_dqn.state_dict())
                    synch_counter = 0

        torch.save(policy_dqn.state_dict(), self.model_path)
        return policy_dqn

    def optimize(self, mini_batch, policy_dqn, target_dqn):
        """Run one gradient step of the policy network on `mini_batch`.

        Args:
            mini_batch: iterable of ``(state, action, new_state, reward)`` tuples.
            policy_dqn: network being trained (its parameters must be the ones
                registered in ``self.optimizer``).
            target_dqn: network used to build the regression targets.

        For every transition the target row is the target network's Q-values
        for `state`, with the taken action's entry replaced by
        ``reward + gamma * max(Q_target[new_state])``. All target-side values
        are computed without gradient tracking, so the backward pass only
        touches the policy network. An empty batch is a no-op.
        """
        mini_batch = list(mini_batch)
        if not mini_batch:
            return

        # Number of input nodes determines the one-hot width.
        num_states = policy_dqn.fc1.in_features

        states, actions, new_states, rewards = zip(*mini_batch)
        state_batch = torch.stack([self.state_to_dqn_input(s, num_states) for s in states])
        new_state_batch = torch.stack([self.state_to_dqn_input(s, num_states) for s in new_states])
        action_index = torch.tensor([int(a) for a in actions], dtype=torch.long)
        reward_batch = torch.tensor([float(r) for r in rewards], dtype=torch.float32)
        row_index = torch.arange(len(mini_batch))

        # Targets are constants for this step: no autograd graph is needed.
        with torch.no_grad():
            target_q = target_dqn(state_batch)
            best_next_q = target_dqn(new_state_batch).max(dim=1).values
            # Q[state, action] = reward + gamma * max(Q[next_state])
            target_q[row_index, action_index] = reward_batch + self.gamma * best_next_q

        # Current predictions of the policy network (gradient tracked).
        current_q = policy_dqn(state_batch)

        # Loss over the whole batch.
        loss = self.loss_func(current_q, target_q)

        # Back-propagate and update the policy network.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def test_DQN(self, episodes, environment, steps_per_episode=100):
        """Roll out the saved greedy policy on `environment`.

        Loads the weights stored at ``self.model_path`` into a fresh network
        and, for each episode, follows the greedy action for
        `steps_per_episode` steps from a random start state. Rewards are not
        collected and nothing is returned.
        """
        P = environment
        num_of_states = len(P)
        num_of_actions = len(P[0])

        # Rebuild the architecture used in training and load the learned weights.
        policy_dqn = DQN(in_states=num_of_states, h1_nodes=num_of_states, out_actions=num_of_actions)
        policy_dqn.load_state_dict(torch.load(self.model_path))
        policy_dqn.eval()    # switch model to evaluation mode

        for _episode in range(episodes):
            current_state = random.randint(0, num_of_states - 1)

            for _ in range(steps_per_episode):
                # Greedy action from the learned Q-values.
                with torch.no_grad():
                    action = policy_dqn(self.state_to_dqn_input(current_state, num_of_states)).argmax().item()
                # Execute action
                current_state, reward = get_response(P, current_state, action)

    def print_dqn(self, dqn):
        """Print the greedy action and Q-values per state and return the Q-table.

        Returns:
            A NumPy array with one row per state and one column per network
            output (action), filled with the network's Q-value predictions.
        """
        # Table dimensions come from the network itself.
        num_states = dqn.fc1.in_features
        num_outputs = dqn.out.out_features
        Q_table = np.zeros((num_states, num_outputs))

        for s in range(num_states):
            # Single forward pass per state; no gradients needed for reporting.
            with torch.no_grad():
                q_row = dqn(self.state_to_dqn_input(s, num_states))

            q_list = q_row.tolist()
            Q_table[s] = q_list

            # Q-values formatted to 2 decimals, space separated.
            q_values = ' '.join("{:+.2f}".format(q) for q in q_list)

            # Action with the highest Q-value.
            best_action = q_row.argmax().item()

            # Print in the format: state, action, q values
            print(f'{s:02},{best_action},[{q_values}]', end='\n')
            if (s+1)%4==0:
                print() # Print a newline every 4 states

        return Q_table

# Choose Environment

In [105]:
# Environment used by the cells below (passed to policy iteration and to
# tabular Q-learning). P2 is defined elsewhere in the notebook.
environment = P2
#environment = generate_environment(3,0.001)
# Discount factor handed to policy_iteration and implement_Q_learning below.
gamma = 0
#NN_learning_rate = 0.01

Find the Optimal Policy (Policy Iteration -> Ground Truth)

In [106]:
# Ground-truth solution from policy iteration. Q_opt is the reference Q-table
# that `objective` compares learned Q-values against (MSE and greedy-policy
# mismatches). V_opt1 / P_opt1 are presumably the optimal values and policy —
# TODO confirm against policy_iteration.
V_opt1,P_opt1,Q_opt = policy_iteration(environment,gamma)

Converged after 2 Policy Iterations


Run Tabular Q-Learning

In [108]:

# Hyperparameter search: Optuna minimises the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=40)

# Report the winning configuration.
print('Best hyperparameters: ', study.best_params)

optimal_alpha, optimal_epsilon_decay, optimal_alpha_decay = (
    study.best_params[key] for key in ('alpha', 'epsilon_decay', 'alpha_decay')
)

# Settings for the final runs. They are assigned only after the study has
# finished, as in the original cell (NOTE(review): `objective` may read some
# of these names as globals — confirm before moving them earlier).
num_of_episodes = 20000
alpha = optimal_alpha
#gamma = 0
epsilon_decay = optimal_epsilon_decay
alpha_decay = optimal_alpha_decay
finding_parameters = False

# Repeat the run with the tuned hyperparameters and print each greedy policy.
for i in range(10):
    Q_tubular, _ = implement_Q_learning(
        environment,
        num_of_episodes,
        alpha,
        gamma,
        epsilon_decay,
        alpha_decay,
        finding_parameters,
    )
    print(f"\n {i} FINAL OPTIMAL POLICY {np.argmax(Q_tubular,axis=1)}")


#Q_tubular = implement_Q_learning(environment, num_of_episodes, alpha, gamma)

[I 2024-07-18 16:48:34,242] A new study created in memory with name: no-name-00ec19ad-3a91-4ac9-8f2d-68ab1d0da562
100%|██████████| 10000/10000 [00:08<00:00, 1122.29it/s]
[I 2024-07-18 16:48:43,159] Trial 0 finished with value: 5474045722.588112 and parameters: {'alpha': 0.6755170782687087, 'epsilon_decay': 0.9616651564373581, 'alpha_decay': 0.009343148154535276}. Best is trial 0 with value: 5474045722.588112.


[0 0 0 0 1 0 1 0]
mse:  5.474044722588112e-07  result:  5474045722.588112


100%|██████████| 10000/10000 [00:09<00:00, 1098.88it/s]
[I 2024-07-18 16:48:52,263] Trial 1 finished with value: 6209106436.590572 and parameters: {'alpha': 0.8015357949880814, 'epsilon_decay': 0.9870713210486525, 'alpha_decay': 0.0018496796489528445}. Best is trial 0 with value: 5474045722.588112.


[0 0 0 0 1 0 1 0]
mse:  6.209105436590573e-07  result:  6209106436.590572


100%|██████████| 10000/10000 [00:08<00:00, 1112.46it/s]
[I 2024-07-18 16:49:01,255] Trial 2 finished with value: 13682800902.582363 and parameters: {'alpha': 0.7317239780978271, 'epsilon_decay': 0.9911889429572313, 'alpha_decay': 0.007120597364075478}. Best is trial 0 with value: 5474045722.588112.


[0 0 0 0 1 0 1 0]
mse:  1.3682799902582363e-06  result:  13682800902.582363


100%|██████████| 10000/10000 [00:09<00:00, 1105.78it/s]
[I 2024-07-18 16:49:10,301] Trial 3 finished with value: 2271287290.887163 and parameters: {'alpha': 0.7324725546064613, 'epsilon_decay': 0.9870552260528945, 'alpha_decay': 0.0036062750683418495}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  2.271286290887163e-07  result:  2271287290.887163


100%|██████████| 10000/10000 [00:09<00:00, 1107.12it/s]
[I 2024-07-18 16:49:19,338] Trial 4 finished with value: 14249358952.733189 and parameters: {'alpha': 0.5209812701387704, 'epsilon_decay': 0.9839775274305202, 'alpha_decay': 0.0013792505837090865}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.424935795273319e-06  result:  14249358952.733189


100%|██████████| 10000/10000 [00:09<00:00, 1102.43it/s]
[I 2024-07-18 16:49:28,413] Trial 5 finished with value: 13776144304.15639 and parameters: {'alpha': 0.8651003363049525, 'epsilon_decay': 0.9594625070099438, 'alpha_decay': 0.009060346555406135}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.377614330415639e-06  result:  13776144304.15639


100%|██████████| 10000/10000 [00:09<00:00, 1100.40it/s]
[I 2024-07-18 16:49:37,504] Trial 6 finished with value: 8271835848.88615 and parameters: {'alpha': 0.6541050153996902, 'epsilon_decay': 0.9742443537996618, 'alpha_decay': 0.006527965054017043}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  8.271834848886151e-07  result:  8271835848.88615


100%|██████████| 10000/10000 [00:09<00:00, 1092.22it/s]
[I 2024-07-18 16:49:46,663] Trial 7 finished with value: 6909343682.073338 and parameters: {'alpha': 0.7854652494354548, 'epsilon_decay': 0.982427826053449, 'alpha_decay': 0.007194468341179152}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  6.909342682073338e-07  result:  6909343682.073338


100%|██████████| 10000/10000 [00:09<00:00, 1094.12it/s]
[I 2024-07-18 16:49:55,805] Trial 8 finished with value: 2591215280.046504 and parameters: {'alpha': 0.7002435775526317, 'epsilon_decay': 0.9836375673375694, 'alpha_decay': 0.0026869466380097552}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  2.591214280046504e-07  result:  2591215280.046504


100%|██████████| 10000/10000 [00:09<00:00, 1090.97it/s]
[I 2024-07-18 16:50:04,976] Trial 9 finished with value: 36921071491.19977 and parameters: {'alpha': 0.6738687075781138, 'epsilon_decay': 0.9823214788832735, 'alpha_decay': 0.0011783938500589822}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 1 1 0 1 0]
mse:  1.8460535245599883e-06  result:  36921071491.19977


100%|██████████| 10000/10000 [00:08<00:00, 1112.51it/s]
[I 2024-07-18 16:50:13,980] Trial 10 finished with value: 4792587910.266373 and parameters: {'alpha': 0.5506299086898168, 'epsilon_decay': 0.9983503079197527, 'alpha_decay': 0.0035964603883481506}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  4.792586910266373e-07  result:  4792587910.266373


100%|██████████| 10000/10000 [00:09<00:00, 1076.16it/s]
[I 2024-07-18 16:50:23,287] Trial 11 finished with value: 4725998214.099149 and parameters: {'alpha': 0.6210685300912411, 'epsilon_decay': 0.9732424976653498, 'alpha_decay': 0.004004355153826618}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  4.725997214099149e-07  result:  4725998214.099149


100%|██████████| 10000/10000 [00:08<00:00, 1120.74it/s]
[I 2024-07-18 16:50:32,224] Trial 12 finished with value: 7760766502.398536 and parameters: {'alpha': 0.5915144412869034, 'epsilon_decay': 0.9988466001573317, 'alpha_decay': 0.003620195692479408}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  7.760765502398536e-07  result:  7760766502.398536


100%|██████████| 10000/10000 [00:08<00:00, 1114.02it/s]
[I 2024-07-18 16:50:41,216] Trial 13 finished with value: 11514666719.259212 and parameters: {'alpha': 0.7279106369111319, 'epsilon_decay': 0.9691041053230696, 'alpha_decay': 0.004757680697272431}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.1514665719259212e-06  result:  11514666719.259212


100%|██████████| 10000/10000 [00:09<00:00, 1106.30it/s]
[I 2024-07-18 16:50:50,269] Trial 14 finished with value: 5354217467.473397 and parameters: {'alpha': 0.7349033145405838, 'epsilon_decay': 0.99098695163233, 'alpha_decay': 0.002788207238842538}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  5.354216467473398e-07  result:  5354217467.473397


100%|██████████| 10000/10000 [00:09<00:00, 1108.76it/s]
[I 2024-07-18 16:50:59,304] Trial 15 finished with value: 6901144268.511627 and parameters: {'alpha': 0.8865570417820525, 'epsilon_decay': 0.9777492070831925, 'alpha_decay': 0.005517396114096545}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  6.901143268511627e-07  result:  6901144268.511627


100%|██████████| 10000/10000 [00:09<00:00, 1110.34it/s]
[I 2024-07-18 16:51:08,325] Trial 16 finished with value: 11964312344.620453 and parameters: {'alpha': 0.7975221225928371, 'epsilon_decay': 0.9500208930604233, 'alpha_decay': 0.005511134695113453}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.1964311344620452e-06  result:  11964312344.620453


100%|██████████| 10000/10000 [00:09<00:00, 1088.72it/s]
[I 2024-07-18 16:51:17,527] Trial 17 finished with value: 6495474579.876577 and parameters: {'alpha': 0.6120776271541767, 'epsilon_decay': 0.9918147913755082, 'alpha_decay': 0.00249547344883495}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  6.495473579876577e-07  result:  6495474579.876577


100%|██████████| 10000/10000 [00:09<00:00, 1081.97it/s]
[I 2024-07-18 16:51:26,785] Trial 18 finished with value: 10684529911.333252 and parameters: {'alpha': 0.7098165656021567, 'epsilon_decay': 0.97864791349699, 'alpha_decay': 0.00275462464326396}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.0684528911333252e-06  result:  10684529911.333252


100%|██████████| 10000/10000 [00:09<00:00, 1094.07it/s]
[I 2024-07-18 16:51:35,940] Trial 19 finished with value: 7520992097.103741 and parameters: {'alpha': 0.8314788014470336, 'epsilon_decay': 0.967403579481462, 'alpha_decay': 0.004328942961505674}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  7.52099109710374e-07  result:  7520992097.103741


100%|██████████| 10000/10000 [00:09<00:00, 1082.64it/s]
[I 2024-07-18 16:51:45,193] Trial 20 finished with value: 40544939270.09443 and parameters: {'alpha': 0.6488601154062383, 'epsilon_decay': 0.9885668205901488, 'alpha_decay': 0.002104195178814751}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 1 1 0 1 0]
mse:  2.0272469135047216e-06  result:  40544939270.09443


100%|██████████| 10000/10000 [00:09<00:00, 1099.12it/s]
[I 2024-07-18 16:51:54,305] Trial 21 finished with value: 8171144845.2990265 and parameters: {'alpha': 0.6200038928242115, 'epsilon_decay': 0.971573951401323, 'alpha_decay': 0.0036346942076376587}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  8.171143845299026e-07  result:  8171144845.2990265


100%|██████████| 10000/10000 [00:09<00:00, 1100.52it/s]
[I 2024-07-18 16:52:03,410] Trial 22 finished with value: 8365662665.1353035 and parameters: {'alpha': 0.5676458254833232, 'epsilon_decay': 0.9771071008875569, 'alpha_decay': 0.004575915094612373}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  8.365661665135304e-07  result:  8365662665.1353035


100%|██████████| 10000/10000 [00:09<00:00, 1099.44it/s]
[I 2024-07-18 16:52:12,520] Trial 23 finished with value: 11750804890.401188 and parameters: {'alpha': 0.6905064708416484, 'epsilon_decay': 0.9943541302152051, 'alpha_decay': 0.003618259495071537}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.1750803890401188e-06  result:  11750804890.401188


100%|██████████| 10000/10000 [00:09<00:00, 1097.07it/s]
[I 2024-07-18 16:52:21,650] Trial 24 finished with value: 4278320087.372349 and parameters: {'alpha': 0.7549091728490224, 'epsilon_decay': 0.9851339484103627, 'alpha_decay': 0.003029443778680305}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  4.278319087372349e-07  result:  4278320087.372349


100%|██████████| 10000/10000 [00:09<00:00, 1106.17it/s]
[I 2024-07-18 16:52:30,708] Trial 25 finished with value: 10358225179.336845 and parameters: {'alpha': 0.7711750094725481, 'epsilon_decay': 0.9851785933981958, 'alpha_decay': 0.00296301088556353}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.0358224179336846e-06  result:  10358225179.336845


100%|██████████| 10000/10000 [00:09<00:00, 1082.83it/s]
[I 2024-07-18 16:52:39,957] Trial 26 finished with value: 7760258970.558503 and parameters: {'alpha': 0.7617200141961702, 'epsilon_decay': 0.9806479546049621, 'alpha_decay': 0.005055599117309975}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  7.760257970558503e-07  result:  7760258970.558503


100%|██████████| 10000/10000 [00:09<00:00, 1063.50it/s]
[I 2024-07-18 16:52:49,376] Trial 27 finished with value: 4480338410.711069 and parameters: {'alpha': 0.8382340164687628, 'epsilon_decay': 0.994426008941339, 'alpha_decay': 0.0018197860428761192}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  4.4803374107110687e-07  result:  4480338410.711069


100%|██████████| 10000/10000 [00:09<00:00, 1085.60it/s]
[I 2024-07-18 16:52:58,603] Trial 28 finished with value: 4996982597.3471365 and parameters: {'alpha': 0.7005927343598028, 'epsilon_decay': 0.9874638557012154, 'alpha_decay': 0.006210542498180434}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  4.996981597347137e-07  result:  4996982597.3471365


100%|██████████| 10000/10000 [00:09<00:00, 1091.23it/s]
[I 2024-07-18 16:53:07,784] Trial 29 finished with value: 4058221733.098794 and parameters: {'alpha': 0.653715426268994, 'epsilon_decay': 0.9855950320489446, 'alpha_decay': 0.0029905104043766923}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  4.058220733098794e-07  result:  4058221733.098794


100%|██████████| 10000/10000 [00:09<00:00, 1028.87it/s]
[I 2024-07-18 16:53:17,519] Trial 30 finished with value: 8407646647.2655 and parameters: {'alpha': 0.6620522583944497, 'epsilon_decay': 0.9798969290622858, 'alpha_decay': 0.008702056980949276}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  8.4076456472655e-07  result:  8407646647.2655


100%|██████████| 10000/10000 [00:10<00:00, 945.86it/s]
[I 2024-07-18 16:53:28,107] Trial 31 finished with value: 12916623701.98352 and parameters: {'alpha': 0.7404108557662823, 'epsilon_decay': 0.9849540673234835, 'alpha_decay': 0.0031375421253280167}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.291662270198352e-06  result:  12916623701.98352


100%|██████████| 10000/10000 [00:10<00:00, 940.28it/s]
[I 2024-07-18 16:53:38,761] Trial 32 finished with value: 14167408952.834766 and parameters: {'alpha': 0.6875715352543957, 'epsilon_decay': 0.988537590685324, 'alpha_decay': 0.002269273543364112}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.4167407952834767e-06  result:  14167408952.834766


100%|██████████| 10000/10000 [00:09<00:00, 1013.34it/s]
[I 2024-07-18 16:53:48,647] Trial 33 finished with value: 2867788773.655798 and parameters: {'alpha': 0.7589556373854621, 'epsilon_decay': 0.9941981214009106, 'alpha_decay': 0.0018194628515569894}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  2.867787773655798e-07  result:  2867788773.655798


100%|██████████| 10000/10000 [00:11<00:00, 907.20it/s]
[I 2024-07-18 16:53:59,687] Trial 34 finished with value: 9325784069.164402 and parameters: {'alpha': 0.7152398727003062, 'epsilon_decay': 0.991044903734177, 'alpha_decay': 0.0015620553400962373}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  9.325783069164401e-07  result:  9325784069.164402


100%|██████████| 10000/10000 [00:11<00:00, 906.00it/s]
[I 2024-07-18 16:54:10,745] Trial 35 finished with value: 12596938402.937798 and parameters: {'alpha': 0.6370461448420656, 'epsilon_decay': 0.9962743708970175, 'alpha_decay': 0.0019478714528914744}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.2596937402937798e-06  result:  12596938402.937798


100%|██████████| 10000/10000 [00:10<00:00, 912.45it/s]
[I 2024-07-18 16:54:21,720] Trial 36 finished with value: 13044283001.97704 and parameters: {'alpha': 0.8127845950711262, 'epsilon_decay': 0.9939632473911367, 'alpha_decay': 0.0010376071124557995}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 0 1 0 1 0]
mse:  1.304428200197704e-06  result:  13044283001.97704


100%|██████████| 10000/10000 [00:11<00:00, 907.60it/s]
[I 2024-07-18 16:54:32,756] Trial 37 finished with value: 25635803665.62742 and parameters: {'alpha': 0.6755060456229536, 'epsilon_decay': 0.9885411315488872, 'alpha_decay': 0.0015881920370942133}. Best is trial 3 with value: 2271287290.887163.


[0 0 0 1 1 0 1 0]
mse:  1.2817901332813708e-06  result:  25635803665.62742


100%|██████████| 10000/10000 [00:11<00:00, 885.01it/s]
[I 2024-07-18 16:54:44,073] Trial 38 finished with value: 2267748228.5912256 and parameters: {'alpha': 0.7797582921883174, 'epsilon_decay': 0.9826151637519465, 'alpha_decay': 0.0024448201546440284}. Best is trial 38 with value: 2267748228.5912256.


[0 0 0 0 1 0 1 0]
mse:  2.2677472285912258e-07  result:  2267748228.5912256


100%|██████████| 10000/10000 [00:09<00:00, 1052.75it/s]
[I 2024-07-18 16:54:53,588] Trial 39 finished with value: 11731860064.788298 and parameters: {'alpha': 0.7803786484675476, 'epsilon_decay': 0.9820510645601895, 'alpha_decay': 0.0024243322826659197}. Best is trial 38 with value: 2267748228.5912256.


[0 0 0 0 1 0 1 0]
mse:  1.1731859064788298e-06  result:  11731860064.788298


100%|██████████| 10000/10000 [00:09<00:00, 1037.66it/s]
[I 2024-07-18 16:55:03,241] Trial 40 finished with value: 8647048122.058626 and parameters: {'alpha': 0.8519362336762333, 'epsilon_decay': 0.9756506191538832, 'alpha_decay': 0.004122385391079426}. Best is trial 38 with value: 2267748228.5912256.


[0 0 0 0 1 0 1 0]
mse:  8.647047122058627e-07  result:  8647048122.058626


100%|██████████| 10000/10000 [00:09<00:00, 1063.52it/s]
[I 2024-07-18 16:55:12,663] Trial 41 finished with value: 9826156082.27533 and parameters: {'alpha': 0.8164947552350101, 'epsilon_decay': 0.9833247885359823, 'alpha_decay': 0.003388268335062188}. Best is trial 38 with value: 2267748228.5912256.


[0 0 0 0 1 0 1 0]
mse:  9.82615508227533e-07  result:  9826156082.27533


100%|██████████| 10000/10000 [00:09<00:00, 1046.41it/s]
[I 2024-07-18 16:55:22,235] Trial 42 finished with value: 2003475618.3572237 and parameters: {'alpha': 0.7476087889120938, 'epsilon_decay': 0.9868703659399009, 'alpha_decay': 0.0024299698986052446}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  2.0034746183572238e-07  result:  2003475618.3572237


100%|██████████| 10000/10000 [00:09<00:00, 1064.39it/s]
[I 2024-07-18 16:55:31,645] Trial 43 finished with value: 9908398757.748167 and parameters: {'alpha': 0.7467360927705545, 'epsilon_decay': 0.9903438119025159, 'alpha_decay': 0.0013478393818409753}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  9.908397757748167e-07  result:  9908398757.748167


100%|██████████| 10000/10000 [00:09<00:00, 1035.15it/s]
[I 2024-07-18 16:55:41,322] Trial 44 finished with value: 10545501162.638216 and parameters: {'alpha': 0.7197341939008113, 'epsilon_decay': 0.9934040204373626, 'alpha_decay': 0.0024381520610650335}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  1.0545500162638215e-06  result:  10545501162.638216


100%|██████████| 10000/10000 [00:09<00:00, 1063.82it/s]
[I 2024-07-18 16:55:50,738] Trial 45 finished with value: 7996334674.0407505 and parameters: {'alpha': 0.7822684982816102, 'epsilon_decay': 0.9868136164447219, 'alpha_decay': 0.0019916432355377854}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  7.99633367404075e-07  result:  7996334674.0407505


100%|██████████| 10000/10000 [00:09<00:00, 1068.96it/s]
[I 2024-07-18 16:56:00,108] Trial 46 finished with value: 8675607377.012733 and parameters: {'alpha': 0.7597830926045082, 'epsilon_decay': 0.9807538396611302, 'alpha_decay': 0.0038547280543320623}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  8.675606377012733e-07  result:  8675607377.012733


100%|██████████| 10000/10000 [00:09<00:00, 1074.00it/s]
[I 2024-07-18 16:56:09,436] Trial 47 finished with value: 10954643947.170244 and parameters: {'alpha': 0.7972151310259208, 'epsilon_decay': 0.9966163950932045, 'alpha_decay': 0.0026695797857088865}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  1.0954642947170244e-06  result:  10954643947.170244


100%|██████████| 10000/10000 [00:09<00:00, 1072.55it/s]
[I 2024-07-18 16:56:18,780] Trial 48 finished with value: 15564805025.77534 and parameters: {'alpha': 0.7355382995004838, 'epsilon_decay': 0.9831227246473869, 'alpha_decay': 0.009663692653706079}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  1.5564804025775339e-06  result:  15564805025.77534


100%|██████████| 10000/10000 [00:09<00:00, 1048.39it/s]
[I 2024-07-18 16:56:28,335] Trial 49 finished with value: 8439252149.193464 and parameters: {'alpha': 0.8749467166295075, 'epsilon_decay': 0.9592512522197403, 'alpha_decay': 0.001689613097823854}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  8.439251149193464e-07  result:  8439252149.193464


100%|██████████| 10000/10000 [00:09<00:00, 1049.09it/s]
[I 2024-07-18 16:56:37,882] Trial 50 finished with value: 9531178154.346401 and parameters: {'alpha': 0.6998388210742243, 'epsilon_decay': 0.9894683554329893, 'alpha_decay': 0.008272175428650958}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  9.5311771543464e-07  result:  9531178154.346401


100%|██████████| 10000/10000 [00:09<00:00, 1050.74it/s]
[I 2024-07-18 16:56:47,418] Trial 51 finished with value: 13428116161.24931 and parameters: {'alpha': 0.6854556759225529, 'epsilon_decay': 0.9867032306858086, 'alpha_decay': 0.0032630231048743335}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  1.342811516124931e-06  result:  13428116161.24931


100%|██████████| 10000/10000 [00:09<00:00, 1055.55it/s]
[I 2024-07-18 16:56:56,908] Trial 52 finished with value: 12011074706.702408 and parameters: {'alpha': 0.639725341073817, 'epsilon_decay': 0.9855678635535496, 'alpha_decay': 0.0022634201552738254}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  1.2011073706702407e-06  result:  12011074706.702408


100%|██████████| 10000/10000 [00:09<00:00, 1062.39it/s]
[I 2024-07-18 16:57:06,337] Trial 53 finished with value: 5235548631.447423 and parameters: {'alpha': 0.7357050267289268, 'epsilon_decay': 0.9925741571508485, 'alpha_decay': 0.002845879504012068}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  5.235547631447423e-07  result:  5235548631.447423


100%|██████████| 10000/10000 [00:09<00:00, 1046.43it/s]
[I 2024-07-18 16:57:15,913] Trial 54 finished with value: 8306377304.107523 and parameters: {'alpha': 0.6031618410611292, 'epsilon_decay': 0.978755961390728, 'alpha_decay': 0.0010008507796159158}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  8.306376304107523e-07  result:  8306377304.107523


100%|██████████| 10000/10000 [00:09<00:00, 1055.80it/s]
[I 2024-07-18 16:57:25,401] Trial 55 finished with value: 6600820398.689061 and parameters: {'alpha': 0.7170918109408346, 'epsilon_decay': 0.9817880460088574, 'alpha_decay': 0.0033683776519774925}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  6.600819398689062e-07  result:  6600820398.689061


100%|██████████| 10000/10000 [00:09<00:00, 1054.31it/s]
[I 2024-07-18 16:57:34,903] Trial 56 finished with value: 5592311071.204709 and parameters: {'alpha': 0.670824334288777, 'epsilon_decay': 0.9834491074860131, 'alpha_decay': 0.0051071299493256036}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  5.592310071204709e-07  result:  5592311071.204709


100%|██████████| 10000/10000 [00:09<00:00, 1057.93it/s]
[I 2024-07-18 16:57:44,372] Trial 57 finished with value: 6447646868.246962 and parameters: {'alpha': 0.5153472411287797, 'epsilon_decay': 0.9757806847979248, 'alpha_decay': 0.0026666555908850306}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  6.447645868246962e-07  result:  6447646868.246962


100%|██████████| 10000/10000 [00:09<00:00, 1057.13it/s]
[I 2024-07-18 16:57:53,851] Trial 58 finished with value: 36314549815.73017 and parameters: {'alpha': 0.7607319546798421, 'epsilon_decay': 0.9968057077576956, 'alpha_decay': 0.004217926262379087}. Best is trial 42 with value: 2003475618.3572237.


[0 1 0 0 1 0 1 0]
mse:  1.8157274407865087e-06  result:  36314549815.73017


100%|██████████| 10000/10000 [00:09<00:00, 1048.51it/s]
[I 2024-07-18 16:58:03,406] Trial 59 finished with value: 8730224985.20589 and parameters: {'alpha': 0.7923214846192019, 'epsilon_decay': 0.9869580462394187, 'alpha_decay': 0.0020325548311026036}. Best is trial 42 with value: 2003475618.3572237.


[0 0 0 0 1 0 1 0]
mse:  8.730223985205891e-07  result:  8730224985.20589
Best hyperparameters:  {'alpha': 0.7476087889120938, 'epsilon_decay': 0.9868703659399009, 'alpha_decay': 0.0024299698986052446}


100%|██████████| 20000/20000 [00:18<00:00, 1074.93it/s]



 0 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1069.85it/s]



 1 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1079.88it/s]



 2 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1070.92it/s]



 3 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1076.71it/s]



 4 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1088.42it/s]



 5 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1080.76it/s]



 6 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1077.47it/s]



 7 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:19<00:00, 1045.34it/s]



 8 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 20000/20000 [00:18<00:00, 1064.22it/s]


 9 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]





 Run the DQN for the environment 

In [None]:
# Train a DQN agent on `environment`, then run its test routine.
# `stock_market_trading_DQN`, `environment` and `gamma` are not defined in this cell;
# they must already exist in the kernel (run the earlier cells first).
# NOTE(review): no random seed is set in this cell, so results may differ between runs — confirm
# whether seeding happens elsewhere in the notebook.
num_of_episodes = 10000  # first argument to train_DQN (episode count, going by the name)
NN_learning_rate = 0.01  # last argument to train_DQN; presumably the network's optimiser step size — TODO confirm
dql = stock_market_trading_DQN()
# The returned object is stored because the comparison cell passes it to dql.print_dqn.
optimal_network = dql.train_DQN(num_of_episodes,environment,gamma,NN_learning_rate)
dql.test_DQN(10,environment)  # NOTE(review): 10 is presumably the number of test episodes/steps — verify against test_DQN

100%|██████████| 10000/10000 [03:01<00:00, 55.20it/s]


# Comparing Optimal Policies Generated from Different Algorithms

In [None]:
# Compare the Phase 1 results (P_opt1 / Q_opt) with the Q-values produced by the trained DQN.
# Relies on names created by earlier cells: P_opt1, Q_opt, environment, dql, optimal_network,
# print_policy and calculate_difference_and_mse.

# Phase 1 Optimal Policy
# Prints the policy via print_policy (called with the number of states, len(environment)),
# then the Q_opt table.
print("Phase 1 Optimal Policy")
print_policy(P_opt1,len(environment))
print("\nOptimal Q = ",Q_opt)
print("================================================================")
# Phase 2 - Tabular Q-Learning Optimal Policy
# (disabled: the lines below reference Q_tubular, which this cell does not define)
# print("Phase 2 - Tubular Q-Learning Optimal Policy")
# print(np.argmax(Q_tubular,axis=1))
# print("================================================================")

# Phase 2 - DQN Optimal Policy
# print_dqn is given the trained network; its return value is kept as Q_NN for the comparison below.
print("Phase 2 - DQN Optimal Policy")
Q_NN = dql.print_dqn(optimal_network)
print("================================================================")

# Output difference between Q_opt and the tabular Q-values
# (disabled together with the tabular section above)
# print("\nDifference With Tabular")
# difference,total_error = calculate_difference_and_mse(Q_opt,Q_tubular)
# print(f"difference {difference}\nTotal Error: {total_error}")

# Output difference between Q_opt and the DQN Q-values
# Q_NN itself is printed first, then the two values returned by calculate_difference_and_mse.
# NOTE(review): total_error is presumably a mean-squared error (per the helper's name) — confirm.
print("\nDifference With NN")
print(Q_NN)
difference,total_error = calculate_difference_and_mse(Q_opt,Q_NN)
print(f"difference {difference}\nTotal Error: {total_error}")



Phase 1 Optimal Policy
State 0: Action 0
State 1: Action 1
State 2: Action 1
State 3: Action 1
State 4: Action 1
State 5: Action 0
State 6: Action 1
State 7: Action 1
State 8: Action 0
State 9: Action 1
State 10: Action 0
State 11: Action 1
State 12: Action 1
State 13: Action 1
State 14: Action 1
State 15: Action 0
State 16: Action 1
State 17: Action 1
State 18: Action 1
State 19: Action 1
State 20: Action 1
State 21: Action 1
State 22: Action 1
State 23: Action 1

Optimal Q =  [[0.03031047 0.02815876]
 [0.02638227 0.13133163]
 [0.04941365 0.05844365]
 [0.01009096 0.02041519]
 [0.01933133 0.10841806]
 [0.08369228 0.0101682 ]
 [0.02019804 0.0914925 ]
 [0.06360484 0.13300494]
 [0.06827976 0.03619543]
 [0.04675142 0.10075909]
 [0.01271144 0.01238558]
 [0.05064312 0.10624941]
 [0.00558377 0.09212666]
 [0.07747172 0.12413358]
 [0.00161899 0.12226172]
 [0.07114383 0.03561536]
 [0.05897908 0.07234668]
 [0.04094774 0.10078331]
 [0.02838915 0.076867  ]
 [0.00857494 0.04552349]
 [0.03768553 0.06