In [62]:
# # **Project 2: Stock Portfolio Optimization - Assignment 3**
# Athanasakis Evangelos 2019030118 // Fragkogiannis Yiorgos 2019030039


# Importing libraries


import numpy as np
import tkinter as tk #loads standard python GUI libraries
import random
from tkinter import *
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
from collections import deque
from tqdm import tqdm
import optuna




# Environments

In [63]:
#-------------------------------__________________Environments___________________-------------------------------------------------------------------------
# Generating environments


# Create the three different environments
# We are modeling this environment using 8 states in the format: {stock_currently_holding,state_of_stock_1,state_of_stock_2}

action_keep = 0     # keep the same stock
action_switch = 1   # switch to the other stock


def _build_two_stock_environment(fee):
    """Build the 8-state, two-stock MDP shared by P1 and P2.

    State ids are ``4 * held + 2 * s1 + s2`` where ``held`` is 0 when stock 1 is
    held and 1 when stock 2 is held, and ``s1`` / ``s2`` are the price levels of
    stock 1 / stock 2 (0 = L, 1 = H). So 0 = {1,L,L}, 1 = {1,L,H}, ...,
    7 = {2,H,H}.

    Parameters:
    - fee: value ADDED to the reward of every switch transition (pass a
      negative number to penalize switching).

    Returns:
    - dict: ``{state: {action: [(probability, next_state, reward), ...]}}``
      with the next states listed in ascending order.
    """
    # Probability of each next price configuration [LL, LH, HL, HH], selected by
    # the CURRENT level of stock 2: stock 1 moves to L or H with probability 1/2
    # each, stock 2 keeps its level with probability 9/10.
    next_price_probs = (
        (9/20, 1/20, 9/20, 1/20),   # stock 2 currently L
        (1/20, 9/20, 1/20, 9/20),   # stock 2 currently H
    )
    # Reward paid by the stock held AFTER the action, per next price
    # configuration [LL, LH, HL, HH].
    # r1_H = 0.1, r1_L = -0.02, r2_H = 0.05, r2_L = 0.01 (so r1_H = 2 * r2_H).
    rewards_by_stock = (
        (-0.02, -0.02, +0.1, +0.1),     # stock 1 depends on its own level only
        (+0.01, +0.05, +0.01, +0.05),   # stock 2 depends on its own level only
    )

    environment = {}
    for state in range(8):
        held = state // 4
        probs = next_price_probs[state % 2]
        transitions = {}
        for action in (action_keep, action_switch):
            switching = action == action_switch
            target = 1 - held if switching else held
            outcomes = []
            for config in range(4):
                reward = rewards_by_stock[target][config]
                if switching:
                    reward = reward + fee
                outcomes.append((probs[config], 4 * target + config, reward))
            transitions[action] = outcomes
        environment[state] = transitions
    return environment


# Environment for question 1: the transaction fee is so large that the optimal
# policy is always to stay with the stock we have already invested in.
fee = -0.9
P1 = _build_two_stock_environment(fee)


# Environment for question 2 (the stocks environment from the midterm): with a
# small fee the optimal policy switches in some states and stays in others.
fee = -0.01
P2 = _build_two_stock_environment(fee)


# This environment implements the generic scenario of question 3 where for every stock
# ri_H,ri_L are chosen uniformly in [-0.02, 0.1] and transition probabilities pi_HL, pi_LH
# are equal to 0.1 for half the stocks and 0.5 for the other half.

# Since every stock can have two price states, the number of total states in the MDP
# we are creating will be numOfStocks * 2^numOfStocks


def decimal_to_binary_array(decimal, length):
    """Return the binary digits of ``decimal`` as a list of ints, MSB first.

    The digit string is left-padded with zeros up to ``length`` digits. It is
    never truncated: a value that needs more than ``length`` digits keeps all
    of them.

    Parameters:
    - decimal (int): value to convert.
    - length (int): minimum number of digits in the result.

    Returns:
    - list[int]: the 0/1 digits, most significant first.
    """
    digits = bin(decimal)[2:]              # drop the '0b' prefix
    digits = digits.rjust(length, '0')     # pad on the left only when too short
    return list(map(int, digits))


# Function that generates the environment of N stocks dynamically, with a transaction fee
def generate_environment(N,fee):
    """Generate the MDP of the generic N-stock scenario.

    There are ``N * 2**N`` states. State ``i`` means that stock
    ``i // 2**N`` is held and that the price levels of all stocks are the
    binary digits of ``i % 2**N`` (most significant digit = stock 0,
    0 = L, 1 = H).

    Model:
    - Every stock moves between L and H independently of the others and of
      the stock being held. The first half of the stocks (``k < N/2``) use
      pi_LH = pi_HL = 0.1, the rest use 0.5.
    - Every stock ``k`` gets one pair ``(r_L, r_H)`` drawn uniformly from
      [-0.02, 0.1] when the environment is built. A transition pays the
      reward of the stock held AFTER the action, at that stock's next level.
    - ``action_keep`` keeps the held stock.
    - ``action_switch`` lists the states of every OTHER stock. Each target
      stock is weighted 1/(N-1), so the switch distribution sums to 1; this
      treats "switch" as moving to a uniformly chosen other stock.

    Parameters:
    - N (int): number of stocks.
    - fee: transaction cost SUBTRACTED from the reward of every switch
      transition, so a positive value penalizes switching. (The hand-written
      P1/P2 tables use the opposite convention: a negative fee that is added.)

    Returns:
    - dict: ``{state: {action_keep: [...], action_switch: [...]}}`` where
      every entry is ``(probability, next_state, reward)`` and next states
      are in ascending order. With N == 1 the switch list is empty.
    """
    states_for_each_stock = 2**N
    total_states = N * states_for_each_stock

    # Price transition probabilities of EACH stock, laid out as [LL, LH, HL, HH]
    pi = []
    for i in range(0,N):
        if(i < N/2):
            # pi_HL = pi_LH = 0.1 | pi_HH = pi_LL = 0.9
            row = [0.9,0.1,0.1,0.9]
        else:
            # pi_HL = pi_LH = 0.5 | pi_HH = pi_LL = 0.5
            row = [0.5,0.5,0.5,0.5]
        pi.append(row)

    # One (r_L, r_H) pair per stock, drawn once so the same stock/level always
    # pays the same reward regardless of the transition that leads to it.
    stock_rewards = [
        (random.uniform(-0.02, 0.1), random.uniform(-0.02, 0.1))
        for _ in range(N)
    ]

    # Price levels of every stock for each of the 2**N price configurations
    price_bits = [
        decimal_to_binary_array(config, N)
        for config in range(states_for_each_stock)
    ]

    P = {}
    progress_bar = tqdm(range(0, total_states))
    for i in progress_bar:
        # Stock ids start from 0
        stock = i // states_for_each_stock
        curr_state_array = price_bits[i - stock * states_for_each_stock]

        # Probability of reaching each next price configuration. It is the
        # product of the per-stock transition probabilities, using each
        # stock's OWN matrix pi[k]; it does not depend on the action taken.
        next_config_prob = []
        for trans_state_array in price_bits:
            transitionProb = 1
            for k in range(N):
                # index 2*current + next maps (L,L),(L,H),(H,L),(H,H) to 0..3
                transitionProb = transitionProb * pi[k][2 * curr_state_array[k] + trans_state_array[k]]
            next_config_prob.append(transitionProb)

        #__Keep Stock__: stay inside the block of states of the held stock
        action_Keep = []
        for config in range(states_for_each_stock):
            nextState = stock * states_for_each_stock + config
            reward = stock_rewards[stock][price_bits[config][stock]]
            action_Keep.append((next_config_prob[config], nextState, reward))

        #__Switch Stock__: move to the block of states of any other stock
        action_Switch = []
        for trans_stock in range(N):
            if(trans_stock == stock):
                continue    # handled by the keep action above
            for config in range(states_for_each_stock):
                nextState = trans_stock * states_for_each_stock + config
                reward = stock_rewards[trans_stock][price_bits[config][trans_stock]] - fee
                # N - 1 >= 1 here because another stock exists
                action_Switch.append((next_config_prob[config] / (N - 1), nextState, reward))

        SubDictionary = {}
        SubDictionary[action_keep] = action_Keep
        SubDictionary[action_switch] = action_Switch
        P[i] = SubDictionary

    return P



## Phase 1, Policy Evaluation/Iteration

In [64]:

def policy_evaluation(pi, P, gamma = 1.0, epsilon = 1e-10):
    """Iteratively compute the value function of a fixed policy.

    Parameters:
    - pi (callable): policy, maps a state id to the action taken there.
    - P (dict): model, ``P[s][a]`` is a list of
      ``(probability, next_state, reward)`` tuples; states are ``0..len(P)-1``.
    - gamma (float): discount factor.
    - epsilon (float): stop when the largest change of any state value
      between two sweeps drops below this threshold.

    Returns:
    - np.ndarray: float array with the estimated value of each state.

    Note: the loop ends only when successive sweeps agree to within
    ``epsilon``; with ``gamma = 1.0`` and rewards that keep accumulating this
    may never happen.
    """
    prev_V = np.zeros(len(P))   # value estimates of the previous sweep, i.e. V(s')
    while True:
        V = np.zeros(len(P))    # value estimates being computed in this sweep
        for s in range(len(P)):
            # One Bellman expectation backup for the action the policy picks in s.
            # The sum must stay a float: casting it to an integer would truncate
            # every sub-unit reward to 0.
            for prob, next_state, reward in P[s][pi(s)]:
                V[s] += prob * (reward + gamma * prev_V[next_state])
        if np.max(np.abs(prev_V - V)) < epsilon:
            break
        prev_V = V.copy()       # freeze the new values for the next sweep
    return V


def policy_improvement(V, P, gamma=1.0):
    """Build the greedy policy with respect to a value function.

    Parameters:
    - V: value of every state, indexable by state id (used as V(s')).
    - P (dict): model, ``P[s][a]`` is a list of
      ``(probability, next_state, reward)`` tuples. States are
      ``0..len(P)-1`` and actions are ``0..len(P[s])-1``.
    - gamma (float): discount factor.

    Returns:
    - new_pi (callable): maps a state id to the action with the highest Q
      value in that state (first one on ties, per ``np.argmax``).
    - Q (np.ndarray): array of shape ``(len(P), len(P[0]))`` with the action
      values.
    """
    Q = np.zeros((len(P), len(P[0])), dtype=np.float64)
    for s in range(len(P)):
        for a in range(len(P[s])):
            # expected one-step return of taking action a in state s
            for prob, next_state, reward in P[s][a]:
                Q[s][a] += prob * (reward + gamma * V[next_state])

    # Compute the greedy actions once. Building this mapping inside the policy
    # would redo an O(states) argmax on every single lookup, and the policy
    # would silently change if the caller later modified Q.
    greedy_action = dict(enumerate(np.argmax(Q, axis=1)))

    def new_pi(s):
        return greedy_action[s]

    return new_pi,Q

# Policy iteration is simple: it alternates policy evaluation and policy improvement until the policy converges.

def policy_iteration(P, gamma = 1.0, epsilon = 1e-10):
    """Find a policy by alternating policy evaluation and policy improvement.

    Starts from a random action in every state and stops when an improvement
    step returns exactly the policy it started from.

    Parameters:
    - P (dict): model, ``P[s][a]`` is a list of
      ``(probability, next_state, reward)`` tuples; states are ``0..len(P)-1``.
    - gamma (float): discount factor, forwarded to both steps.
    - epsilon (float): convergence threshold forwarded to policy_evaluation.

    Returns:
    - V: value function returned by the last policy evaluation.
    - pi (callable): final policy, maps a state id to an action.
    - Q_table: action values returned by the last policy improvement.

    Prints the number of outer iterations that were needed.
    """
    t = 0
    random_actions = np.random.choice(tuple(P[0].keys()), len(P))     # random starting action for each state
    # Build the state -> action table once. Policies are passed around as
    # functions, and this one is called for every state in every evaluation
    # sweep, so it must be a plain lookup.
    initial_actions = dict(enumerate(random_actions))
    pi = lambda s: initial_actions[s]
    while True:
        old_pi = {s: pi(s) for s in range(len(P))}          # snapshot to compare with the improved policy
        V = policy_evaluation(pi,P,gamma,epsilon)           # value function of the latest policy
        pi,Q_table = policy_improvement(V,P,gamma)          # greedy policy w.r.t. that value function

        t += 1
        if old_pi == {s:pi(s) for s in range(len(P))}:      # unchanged policy => converged
            break
    print('Converged after %d Policy Iterations' %t)
    return V,pi,Q_table


# Function to print policy
def print_policy(policy, num_states=8):
    """Print the action that ``policy`` selects in states 0..num_states-1.

    Parameters:
    - policy (callable): maps a state id to an action.
    - num_states (int): number of states to print, one line per state.
    """
    s = 0
    while s < num_states:
        print(f"State {s}: Action {policy(s)}")
        s += 1
     

Useful Functions for Tabular Q-Learning and DQL

In [65]:
def calculate_difference_and_mse(q1, q2):
    """Compare two tables element by element.

    Parameters:
    - q1, q2: two-dimensional tables (sequences of rows) of equal dimensions.

    Returns:
    - list[list]: ``q1 - q2`` computed element by element, row by row.
    - mse: mean of the squared differences over all elements.

    Raises:
    - ValueError: if the number of rows or the length of any row differs.
    """
    same_row_count = len(q1) == len(q2)
    if not same_row_count or any(len(a) != len(b) for a, b in zip(q1, q2)):
        raise ValueError("Both tables must have the same dimensions.")

    differences = [
        [left - right for left, right in zip(row_a, row_b)]
        for row_a, row_b in zip(q1, q2)
    ]

    # accumulate in row-major order, one element at a time
    squared_error_sum = 0
    element_count = 0
    for row in differences:
        for delta in row:
            squared_error_sum += delta ** 2
            element_count += 1

    return differences, squared_error_sum / element_count


def check_q_table_convergence(prev_Q, current_Q, epsilon=0.001):
    """
    Tell whether two successive Q-tables are close enough to call it converged.

    Parameters:
    - prev_Q (np.ndarray): Q-table before the update, or None if there is none.
    - current_Q (np.ndarray): Q-table after the update.
    - epsilon (float): threshold on the largest absolute element-wise change.

    Returns:
    - bool: True when the largest absolute difference is below epsilon;
      False otherwise, including when prev_Q is None.
    """
    # Without a previous table there is nothing to compare against
    if prev_Q is None:
        return False

    largest_change = np.max(np.abs(prev_Q - current_Q))
    return True if largest_change < epsilon else False


# This function is used to simulate the environments response
# It gets as input the environment, the current state and the action that we have selected
# and it returns the next state and the reward
def get_response(environment, state, action):
    """Simulate one step of the environment.

    Parameters:
    - environment (dict): model, ``environment[state][action]`` is a list of
      ``(probability, next_state, reward)`` tuples.
    - state: current state id.
    - action: action taken in that state.

    Returns:
    - next_state, reward: taken from one of the listed transitions, drawn at
      random with the transition probabilities as weights.
    """
    transitions = environment[state][action]

    # random.choices draws an index according to the given weights;
    # k=1 asks for a single draw and [0] unwraps it from the returned list
    weights = [entry[0] for entry in transitions]
    picked = random.choices(range(len(transitions)), weights=weights, k=1)[0]

    outcome = transitions[picked]
    return outcome[1], outcome[2]


# Phase 2
 Implementing Tabular Q-Learning

In [66]:
#==============================================================================================================================
#################### Q-Learning ################
#===== Hyperparameters ===================
# alpha -> Learning rate
# gamma -> Discount factor
# epsilon -> Exploration rate
# epsilon_decay -> Decay rate for epsilon
# min_epsilon -> Minimum epsilon value
# num_episodes -> Number of episodes

def implement_Q_learning(environment, num_of_episodes, alpha, gamma, epsilon_decay, alpha_decay, finding_parameters):
    """Tabular Q-learning with epsilon-greedy exploration.

    Parameters:
    - environment (dict): model queried through get_response; states are
      ``0..len(environment)-1``.
    - num_of_episodes (int): maximum number of episodes to run.
    - alpha (float): initial learning rate; after every episode it becomes
      ``alpha * exp(-alpha_decay * episode)``, never below 0.001.
    - gamma (float): discount factor.
    - epsilon_decay (float): the exploration rate starts at 1.0 and is
      multiplied by this after every episode, never below 0.1.
    - alpha_decay (float): decay rate of the learning rate (see alpha).
    - finding_parameters: when equal to True, training stops early at the
      third episode (counted over the whole run, not necessarily
      consecutive) that changes no Q value by 1e-6 or more.

    Returns:
    - Q (np.ndarray): learned action values, one row per state.
    - convergence_episode: episode index at which training stopped early,
      or ``float('inf')`` if it never did.
    """
    steps_per_episode = 100     # steps taken in the environment per episode
    epsilon_floor = 0.1         # minimum exploration rate
    alpha_floor = 0.001         # minimum learning rate

    num_states = len(environment)
    Q = np.zeros((num_states, len(environment[0])))
    starting_alpha = alpha
    epsilon = 1.0
    convergence_episode = float('inf')
    converged_episodes_seen = 0

    for episode in tqdm(range(num_of_episodes)):
        Q_before_episode = np.copy(Q)
        state = random.randint(0, num_states - 1)   # random starting state

        step = 0
        while step < steps_per_episode:
            # explore with probability epsilon, otherwise exploit the current estimates
            explore = random.uniform(0,1) < epsilon
            action = random.choice([0,1]) if explore else np.argmax(Q[state])

            next_state, reward = get_response(environment, state, action)

            # temporal-difference update towards reward + discounted best next value
            td_error = reward + gamma * np.max(Q[next_state]) - Q[state, action]
            Q[state, action] = Q[state, action] + alpha * td_error

            state = next_state
            step += 1

        # decay the hyperparameters once per episode
        epsilon = max(epsilon_floor, epsilon * epsilon_decay)
        alpha = max(alpha_floor, starting_alpha * np.exp(-alpha_decay * episode))

        stable_episode = finding_parameters == True and check_q_table_convergence(Q_before_episode, Q, epsilon=0.000001)
        if not stable_episode:
            continue

        converged_episodes_seen += 1
        if converged_episodes_seen > 2:
            convergence_episode = episode
            print("convergence_episode = ",convergence_episode)
            break

    return Q, convergence_episode



# environment = P2
# alpha = 0.5
# gamma = 0
# V_opt1,P_opt1,Q_opt = policy_iteration(environment,gamma)


# Define the objective function for Optuna
# Optuna tries to minimise the output of the objective function by modifying the hyperparameters of the tabular Q-learning algorithm
# The output of the function is the MSE of the Q-table found at convergence plus the number of episodes it took to converge (divided by 10)
# Because finding a correct policy is more important than converging quickly, the MSE is weighted (multiplied by 10^17) so that it has
# the greater impact on the output of the objective function


def objective(trial):
    """Optuna objective: score one hyperparameter sample of Q-learning on P2.

    Samples ``alpha``, ``epsilon_decay`` and ``alpha_decay`` from ``trial``,
    trains with gamma = 0 for at most 10000 episodes with early stopping
    enabled, and prints the greedy policy and the score.

    Reads the module-level ``Q_opt`` as the reference Q-table; it is not
    defined in this function and must exist before the study is run.

    Returns:
    - float: ``mse * 10**17 + convergence_episode / 10`` (lower is better),
      where a run that never stopped early counts as 10000 episodes.
    """
    episode_budget = 10000
    mse_weight = 100000000000000000     # makes the accuracy term dominate the speed term

    sampled_alpha = trial.suggest_float('alpha', 0.5, 0.9, log=True)
    sampled_epsilon_decay = trial.suggest_float('epsilon_decay', 0.95, 0.999)
    sampled_alpha_decay = trial.suggest_float('alpha_decay', 0.001, 0.01)

    Q, convergence_episode = implement_Q_learning(
        P2, episode_budget, sampled_alpha, 0, sampled_epsilon_decay, sampled_alpha_decay, True
    )
    print(np.argmax(Q,axis=1))

    # a run that never converged is charged the full episode budget
    if convergence_episode == float('inf'):
        convergence_episode = 10000
    _, mse = calculate_difference_and_mse(Q_opt, Q)

    result = mse * mse_weight + convergence_episode/10
    print("mse: ",mse," result: ", result)
    return result
    




# # Create Optuna study
# convergence_episode = float('inf')  # Initialize with a large number
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

# # Print the best hyperparameters
# print('Best hyperparameters: ', study.best_params)

# optimal_alpha = study.best_params['alpha']
# optimal_epsilon_decay = study.best_params['epsilon_decay']
# optimal_alpha_decay = study.best_params['alpha_decay']


# for i in range(20):
#     environment = P2
#     num_of_episodes = 10000
#     alpha = optimal_alpha
#     gamma = 0
#     epsilon_decay = optimal_epsilon_decay
#     alpha_decay = optimal_alpha_decay
#     finding_parameters =  False
#     Q,_ =implement_Q_learning(environment, num_of_episodes, alpha, gamma, epsilon_decay, alpha_decay, finding_parameters)
#     print(f"\n {i} FINAL OPTIMAL POLICY {np.argmax(Q,axis=1)}")

# Implementing a Deep Q-Learning Neural Network 

In [67]:

####################____TASK3____########################################

# Define memory for Experience Replay
class ReplayMemory:
    """Fixed-capacity buffer of transitions for experience replay.

    Backed by a ``deque`` with ``maxlen``, so appending to a full buffer
    discards the oldest entry.
    """

    def __init__(self, maxlen):
        # maxlen: maximum number of transitions kept at any time
        self.memory = deque(maxlen=maxlen)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def append(self, transition):
        """Store one transition at the newest end of the buffer."""
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return ``sample_size`` stored transitions drawn at random without replacement."""
        return random.sample(self.memory, sample_size)

# Define model
class DQN(nn.Module):
    """Fully connected network with one hidden layer.

    ``in_states`` inputs -> ``h1_nodes`` hidden units with ReLU ->
    ``out_actions`` linear outputs.
    """

    def __init__(self, in_states, h1_nodes, out_actions):
        super().__init__()

        # hidden layer first, output layer second
        self.fc1 = nn.Linear(in_features=in_states, out_features=h1_nodes)
        self.out = nn.Linear(in_features=h1_nodes, out_features=out_actions)

    def forward(self, x):
        """Return the output-layer activations for input ``x``."""
        hidden = F.relu(self.fc1(x))    # ReLU on the hidden layer only
        return self.out(hidden)         # no activation on the output layer



# Class That Implements Our Deep Q-Network
class stock_market_trading_DQN():
    """Deep Q-Network agent for the tabular stock-switching environment.

    The agent one-hot encodes the integer state, trains a policy network
    against a periodically synchronised target network using experience
    replay, and can print/export the learned Q-values as a table.
    """

    # HyperParameters
    alpha = 0.001              # Learning rate (overwritten by the `lr` argument of train_DQN)
    gamma = 0                  # Discount factor (overwritten by the `gamma` argument of train_DQN)
    # The target network is re-synchronised with the policy network once more than
    # this many environment steps have accumulated since the last synchronisation
    # (the check happens once per episode, right after the optimisation step).
    synching_period = 100
    replay_buffer_size = 10000 # Maximum number of transitions kept in the replay buffer
    min_batch_size = 64        # Mini-batch size; training starts once the buffer holds more than this
    steps_per_episode = 100    # Environment steps performed in every training / test episode
    model_path = "frozen_lake_dql.pt"  # File the trained policy weights are saved to / loaded from

    # Mean-squared error between predicted and target Q-values
    # (nn.SmoothL1Loss, i.e. Huber loss, is a drop-in alternative).
    loss_func = nn.MSELoss()
    optimizer = None           # Created in train_DQN
    ACTIONS = [0,1]
    num_actions = 2

    def state_to_dqn_input(self, state:int, num_states:int)->torch.Tensor:
        """Return the one-hot encoding of ``state`` as a tensor of length ``num_states``."""
        input_tensor = torch.zeros(num_states)
        input_tensor[state] = 1
        return input_tensor

    def train_DQN(self, episodes,environment,gamma,lr):
        """Train a policy network on ``environment`` and return it.

        Args:
            episodes: number of training episodes; each one starts in a random
                state and performs ``steps_per_episode`` environment steps.
            environment: transition table; ``len(environment)`` is used as the
                number of states and ``len(environment[0])`` as the number of
                actions. It is passed unchanged to ``get_response``.
            gamma: discount factor, stored on ``self.gamma``.
            lr: learning rate of the Adam optimizer, stored on ``self.alpha``.

        Returns:
            The trained policy network. Its ``state_dict`` is also written to
            ``self.model_path``.
        """
        P = environment
        num_of_states = len(P)
        num_of_actions = len(P[0])

        epsilon = 1 # Exploration rate
        self.gamma = gamma
        self.alpha = lr
        memory_buffer = ReplayMemory(self.replay_buffer_size)

        # Policy and target networks share the same architecture:
        # one input node per state, one output node per action and a hidden
        # layer with as many nodes as there are states (adjustable).
        policy_dqn = DQN(in_states=num_of_states, h1_nodes=num_of_states, out_actions=num_of_actions)
        target_dqn = DQN(in_states=num_of_states, h1_nodes=num_of_states, out_actions=num_of_actions)

        # Start with both networks holding identical weights
        target_dqn.load_state_dict(policy_dqn.state_dict())

        self.optimizer = torch.optim.Adam(policy_dqn.parameters(), lr=self.alpha)

        synch_counter = 0 # environment steps taken since the last target-network synchronisation

        for _ in tqdm(range(episodes)):
            current_state = random.randint(0, num_of_states-1) # select a random starting state

            for _ in range(self.steps_per_episode):
                # Epsilon-greedy action selection
                if random.random() < epsilon:
                    # Explore: any of the actions the network can output
                    action = random.randrange(num_of_actions)
                else:
                    # Exploit: output node (action) with the maximum value
                    with torch.no_grad():
                        action = policy_dqn(self.state_to_dqn_input(current_state, num_of_states)).argmax().item()

                # get the response from the environment
                next_state,reward = get_response(P, current_state, action)

                # Store the transition in the replay buffer
                memory_buffer.append((current_state, action, next_state, reward))

                current_state = next_state
                synch_counter += 1

            # One optimisation step per episode, once enough transitions were collected
            if len(memory_buffer) > self.min_batch_size:
                mini_batch = memory_buffer.sample(self.min_batch_size)
                self.optimize(mini_batch, policy_dqn, target_dqn)

                # Linear epsilon decay, floored at 0
                epsilon = max(epsilon - 1/episodes, 0)

                # Copy the policy network into the target network
                if synch_counter > self.synching_period:
                    target_dqn.load_state_dict(policy_dqn.state_dict())
                    synch_counter = 0

        # Persist the learned weights so test_DQN can reload them
        torch.save(policy_dqn.state_dict(), self.model_path)
        return policy_dqn

    def optimize(self,mini_batch, policy_dqn, target_dqn):
        """Run one gradient step of ``policy_dqn`` on ``mini_batch``.

        Args:
            mini_batch: sequence of ``(state, action, new_state, reward)`` tuples.
            policy_dqn: network being trained; ``self.optimizer`` must have been
                built over its parameters.
            target_dqn: network providing the regression targets. It is only
                evaluated, never updated, and receives no gradients.

        For every sample the target vector equals the target network's output
        for ``state``, with the entry of the taken action replaced by
        ``reward + gamma * max(target_dqn(new_state))``. An empty batch is a no-op.
        """
        if len(mini_batch) == 0:
            return

        # Get number of input nodes
        num_states = policy_dqn.fc1.in_features

        states, actions, new_states, rewards = zip(*mini_batch)
        state_batch = torch.stack([self.state_to_dqn_input(s, num_states) for s in states])
        new_state_batch = torch.stack([self.state_to_dqn_input(s, num_states) for s in new_states])
        action_idx = torch.tensor(actions, dtype=torch.long)
        reward_batch = torch.tensor(rewards, dtype=torch.float32)

        # Targets are constants for the regression: build them without autograd
        # so no gradient is ever accumulated on the target network.
        with torch.no_grad():
            best_next_q = target_dqn(new_state_batch).max(dim=1).values
            td_target = reward_batch + self.gamma * best_next_q
            target_q = target_dqn(state_batch)
            target_q[torch.arange(len(mini_batch)), action_idx] = td_target

        # Current Q-value estimates of the network being trained
        current_q = policy_dqn(state_batch)

        # Loss over the whole batch
        loss = self.loss_func(current_q, target_q)

        # Optimize the model by running back-propagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def test_DQN(self, episodes,environment):
        """Roll out the saved greedy policy on ``environment``.

        Loads the weights stored at ``self.model_path`` into a fresh network and
        runs ``episodes`` episodes of ``steps_per_episode`` greedy steps each,
        starting from a random state. The rewards are discarded and nothing is
        returned.
        """
        P = environment
        num_of_states = len(P)
        num_of_actions = len(P[0])

        # Load learned policy
        policy_dqn = DQN(in_states=num_of_states, h1_nodes=num_of_states, out_actions=num_of_actions)
        policy_dqn.load_state_dict(torch.load(self.model_path))
        policy_dqn.eval()    # switch model to evaluation mode

        for _ in range(episodes):
            current_state = random.randint(0, num_of_states-1)

            for _ in range(self.steps_per_episode):
                # Select best action
                with torch.no_grad():
                    action = policy_dqn(self.state_to_dqn_input(current_state, num_of_states)).argmax().item()
                # Execute action
                current_state, _reward = get_response(P, current_state, action)

    def print_dqn(self, dqn):
        """Print the greedy action and Q-values of every state and return them.

        Args:
            dqn: network exposing ``fc1.in_features`` (number of states) and
                ``out.out_features`` (number of actions).

        Returns:
            ``numpy`` array of shape ``(num_states, num_actions)`` whose row ``s``
            holds the network output for state ``s``.
        """
        # Table dimensions come from the network itself, not from class constants
        num_states = dqn.fc1.in_features
        num_actions = dqn.out.out_features
        Q_table = np.zeros((num_states, num_actions))

        for s in range(num_states):
            # Single forward pass per state; no gradients are needed for reporting
            with torch.no_grad():
                q_out = dqn(self.state_to_dqn_input(s, num_states))

            q_row = q_out.tolist()
            Q_table[s] = q_row

            # Q-values formatted with sign and two decimals, space separated
            q_values = ' '.join("{:+.2f}".format(q) for q in q_row)

            # Greedy action for this state
            best_action = q_out.argmax().item()

            # Print policy in the format of: state, action, q values
            print(f'{s:02},{best_action},[{q_values}]', end='\n')
            if (s+1)%4==0:
                print() # Print a newline every 4 states

        return Q_table

# Choose Environment

In [68]:
# Discount factor handed to the policy-iteration and Q-learning cells that follow.
gamma = 0
#NN_learning_rate = 0.01

# Environment (transition table) under study.
environment = P2
#environment = generate_environment(3,0.001)

Find the Optimal Policy (Policy Iteration -> Ground Truth)

In [69]:
# Policy iteration on the chosen environment; its three results are kept as V_opt1, P_opt1 and Q_opt.
V_opt1, P_opt1, Q_opt = policy_iteration(environment, gamma)

Converged after 2 Policy Iterations


Run Tabular Q-Learning

In [70]:

# Hyperparameter search: Optuna minimises the value returned by `objective`.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Report the best trial and unpack its hyperparameters.
print('Best hyperparameters: ', study.best_params)

optimal_alpha = study.best_params['alpha']
optimal_epsilon_decay = study.best_params['epsilon_decay']
optimal_alpha_decay = study.best_params['alpha_decay']

# Settings shared by all final runs (identical for every repetition, so set once).
num_of_episodes = 10000
alpha = optimal_alpha
epsilon_decay = optimal_epsilon_decay
alpha_decay = optimal_alpha_decay
finding_parameters = False

# Repeat Q-learning with the tuned hyperparameters and print each greedy policy.
for i in range(10):
    Q_tubular, _ = implement_Q_learning(
        environment,
        num_of_episodes,
        alpha,
        gamma,
        epsilon_decay,
        alpha_decay,
        finding_parameters,
    )
    print(f"\n {i} FINAL OPTIMAL POLICY {np.argmax(Q_tubular,axis=1)}")


#Q_tubular = implement_Q_learning(environment, num_of_episodes, alpha, gamma)

[I 2024-07-18 13:22:06,318] A new study created in memory with name: no-name-4c4a7091-8df9-4bf4-a778-0af45dc2f20e
100%|██████████| 10000/10000 [00:06<00:00, 1561.11it/s]
[I 2024-07-18 13:22:12,727] Trial 0 finished with value: 71766273491.32463 and parameters: {'alpha': 0.5126233501925496, 'epsilon_decay': 0.9512042629408085, 'alpha_decay': 0.006213586036433987}. Best is trial 0 with value: 71766273491.32463.


[0 0 0 0 1 0 1 0]
mse:  7.176627249132464e-07  result:  71766273491.32463


100%|██████████| 10000/10000 [00:06<00:00, 1569.10it/s]
[I 2024-07-18 13:22:19,103] Trial 1 finished with value: 61625052612.13699 and parameters: {'alpha': 0.8619832005503137, 'epsilon_decay': 0.9711746254268878, 'alpha_decay': 0.005338052289157511}. Best is trial 1 with value: 61625052612.13699.


[0 0 0 0 1 0 1 0]
mse:  6.1625051612137e-07  result:  61625052612.13699


100%|██████████| 10000/10000 [00:06<00:00, 1480.81it/s]
[I 2024-07-18 13:22:25,859] Trial 2 finished with value: 87764473474.98055 and parameters: {'alpha': 0.6138232588285959, 'epsilon_decay': 0.9680773064595417, 'alpha_decay': 0.006914715358737683}. Best is trial 1 with value: 61625052612.13699.


[0 0 0 0 1 0 1 0]
mse:  8.776447247498054e-07  result:  87764473474.98055


100%|██████████| 10000/10000 [00:06<00:00, 1563.91it/s]
[I 2024-07-18 13:22:32,257] Trial 3 finished with value: 50667607873.658554 and parameters: {'alpha': 0.5463650717785404, 'epsilon_decay': 0.9830000333038463, 'alpha_decay': 0.009287394485765328}. Best is trial 3 with value: 50667607873.658554.


[0 0 0 0 1 0 1 0]
mse:  5.066760687365855e-07  result:  50667607873.658554


100%|██████████| 10000/10000 [00:06<00:00, 1575.25it/s]
[I 2024-07-18 13:22:38,607] Trial 4 finished with value: 139418852596.72775 and parameters: {'alpha': 0.5721511608866156, 'epsilon_decay': 0.988409308876485, 'alpha_decay': 0.0016520427886216038}. Best is trial 3 with value: 50667607873.658554.


[0 0 0 0 1 0 1 0]
mse:  1.3941885159672776e-06  result:  139418852596.72775


 31%|███       | 3096/10000 [00:01<00:04, 1624.14it/s]
[I 2024-07-18 13:22:40,517] Trial 5 finished with value: 1040350910667.4906 and parameters: {'alpha': 0.5447641364015612, 'epsilon_decay': 0.99148479962724, 'alpha_decay': 0.007829141728807092}. Best is trial 3 with value: 50667607873.658554.


convergence_episode =  3096
[0 1 0 1 1 0 1 0]
mse:  1.0403509103578906e-05  result:  1040350910667.4906


100%|██████████| 10000/10000 [00:06<00:00, 1479.55it/s]
[I 2024-07-18 13:22:47,277] Trial 6 finished with value: 48733958719.93359 and parameters: {'alpha': 0.6210208452257615, 'epsilon_decay': 0.9970255699095553, 'alpha_decay': 0.009915885267660247}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  4.873395771993359e-07  result:  48733958719.93359


100%|██████████| 10000/10000 [00:06<00:00, 1454.76it/s]
[I 2024-07-18 13:22:54,153] Trial 7 finished with value: 81097362517.42604 and parameters: {'alpha': 0.7004248706802763, 'epsilon_decay': 0.98670740614807, 'alpha_decay': 0.0035991769796037795}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  8.109736151742605e-07  result:  81097362517.42604


100%|██████████| 10000/10000 [00:06<00:00, 1518.31it/s]
[I 2024-07-18 13:23:00,743] Trial 8 finished with value: 60087125677.70764 and parameters: {'alpha': 0.6276865016552656, 'epsilon_decay': 0.9851943441505724, 'alpha_decay': 0.008298257727652834}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  6.008712467770764e-07  result:  60087125677.70764


 80%|████████  | 8013/10000 [00:05<00:01, 1537.45it/s]
[I 2024-07-18 13:23:05,958] Trial 9 finished with value: 172262481617.18936 and parameters: {'alpha': 0.8062676130040286, 'epsilon_decay': 0.9785383517106426, 'alpha_decay': 0.00503097404776898}. Best is trial 6 with value: 48733958719.93359.


convergence_episode =  8013
[0 0 0 0 1 0 1 0]
mse:  1.7226248081588937e-06  result:  172262481617.18936


100%|██████████| 10000/10000 [00:07<00:00, 1359.59it/s]
[I 2024-07-18 13:23:13,322] Trial 10 finished with value: 95570493598.06956 and parameters: {'alpha': 0.7128101891341083, 'epsilon_decay': 0.9956813119828645, 'alpha_decay': 0.009589087975883648}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  9.557049259806956e-07  result:  95570493598.06956


100%|██████████| 10000/10000 [00:06<00:00, 1463.51it/s]
[I 2024-07-18 13:23:20,164] Trial 11 finished with value: 55543253956.105354 and parameters: {'alpha': 0.5950936896874165, 'epsilon_decay': 0.9979535596462593, 'alpha_decay': 0.009822512807621898}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  5.554325295610535e-07  result:  55543253956.105354


100%|██████████| 10000/10000 [00:06<00:00, 1465.63it/s]
[I 2024-07-18 13:23:26,997] Trial 12 finished with value: 190797228232.52768 and parameters: {'alpha': 0.5008458581714456, 'epsilon_decay': 0.9781832114155827, 'alpha_decay': 0.008579126717297448}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  1.9079722723252767e-06  result:  190797228232.52768


100%|██████████| 10000/10000 [00:06<00:00, 1490.16it/s]
[I 2024-07-18 13:23:33,720] Trial 13 finished with value: 107954228210.29771 and parameters: {'alpha': 0.6627717098690339, 'epsilon_decay': 0.9623021101680932, 'alpha_decay': 0.009965525543335066}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  1.0795422721029772e-06  result:  107954228210.29771


 60%|██████    | 6034/10000 [00:03<00:02, 1557.03it/s]
[I 2024-07-18 13:23:37,605] Trial 14 finished with value: 78307467810.75363 and parameters: {'alpha': 0.5482022131491958, 'epsilon_decay': 0.9986957760842841, 'alpha_decay': 0.00724445521000156}. Best is trial 6 with value: 48733958719.93359.


convergence_episode =  6034
[0 0 0 0 1 0 1 0]
mse:  7.830746720735364e-07  result:  78307467810.75363


100%|██████████| 10000/10000 [00:06<00:00, 1555.60it/s]
[I 2024-07-18 13:23:44,042] Trial 15 finished with value: 258326072695.8418 and parameters: {'alpha': 0.7493692669713407, 'epsilon_decay': 0.9826005035842156, 'alpha_decay': 0.008655674159439516}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  2.583260716958418e-06  result:  258326072695.8418


 39%|███▉      | 3892/10000 [00:02<00:03, 1561.79it/s]
[I 2024-07-18 13:23:46,545] Trial 16 finished with value: 337540546463.0858 and parameters: {'alpha': 0.6612008747822472, 'epsilon_decay': 0.9920295353499909, 'alpha_decay': 0.0037514602630109657}. Best is trial 6 with value: 48733958719.93359.


convergence_episode =  3892
[0 1 0 1 1 0 1 0]
mse:  3.375405460738858e-06  result:  337540546463.0858


100%|██████████| 10000/10000 [00:07<00:00, 1338.19it/s]
[I 2024-07-18 13:23:54,028] Trial 17 finished with value: 87063324959.37524 and parameters: {'alpha': 0.5430596909288536, 'epsilon_decay': 0.9598677349585057, 'alpha_decay': 0.008995325690391628}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  8.706332395937524e-07  result:  87063324959.37524


100%|██████████| 10000/10000 [00:07<00:00, 1399.00it/s]
[I 2024-07-18 13:24:01,189] Trial 18 finished with value: 71300210342.9939 and parameters: {'alpha': 0.5869904006019935, 'epsilon_decay': 0.9802883643296367, 'alpha_decay': 0.007410229744498734}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  7.13002093429939e-07  result:  71300210342.9939


100%|██████████| 10000/10000 [00:06<00:00, 1590.93it/s]
[I 2024-07-18 13:24:07,485] Trial 19 finished with value: 112288236308.22891 and parameters: {'alpha': 0.6458410948893909, 'epsilon_decay': 0.9735862785133004, 'alpha_decay': 0.0017187645007909276}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  1.1228823530822892e-06  result:  112288236308.22891


100%|██████████| 10000/10000 [00:06<00:00, 1572.66it/s]
[I 2024-07-18 13:24:13,853] Trial 20 finished with value: 174722814039.0866 and parameters: {'alpha': 0.5690637720259859, 'epsilon_decay': 0.9919855591552037, 'alpha_decay': 0.006240311707149849}. Best is trial 6 with value: 48733958719.93359.


[0 1 0 0 1 0 1 0]
mse:  1.747228130390866e-06  result:  174722814039.0866


 46%|████▌     | 4600/10000 [00:03<00:03, 1387.45it/s]
[I 2024-07-18 13:24:17,187] Trial 21 finished with value: 62813402383.515236 and parameters: {'alpha': 0.6026622205294042, 'epsilon_decay': 0.9987597825716227, 'alpha_decay': 0.009961251066291992}. Best is trial 6 with value: 48733958719.93359.


convergence_episode =  4600
[0 0 0 0 1 0 1 0]
mse:  6.281340192351524e-07  result:  62813402383.515236


100%|██████████| 10000/10000 [00:07<00:00, 1418.87it/s]
[I 2024-07-18 13:24:24,246] Trial 22 finished with value: 67767067404.03467 and parameters: {'alpha': 0.5268787156676752, 'epsilon_decay': 0.9952071043907049, 'alpha_decay': 0.009212349464833396}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  6.776706640403467e-07  result:  67767067404.03467


 56%|█████▌    | 5621/10000 [00:04<00:03, 1339.25it/s]
[I 2024-07-18 13:24:28,459] Trial 23 finished with value: 314227978987.7215 and parameters: {'alpha': 0.5926989027463264, 'epsilon_decay': 0.989034091101865, 'alpha_decay': 0.00809115860339419}. Best is trial 6 with value: 48733958719.93359.


convergence_episode =  5621
[0 1 0 1 1 0 1 0]
mse:  3.1422797842562155e-06  result:  314227978987.7215


100%|██████████| 10000/10000 [00:06<00:00, 1541.14it/s]
[I 2024-07-18 13:24:34,957] Trial 24 finished with value: 220864885477.8533 and parameters: {'alpha': 0.6281097224683035, 'epsilon_decay': 0.9950866933204835, 'alpha_decay': 0.009264283265076105}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  2.208648844778533e-06  result:  220864885477.8533


100%|██████████| 10000/10000 [00:06<00:00, 1583.07it/s]
[I 2024-07-18 13:24:41,284] Trial 25 finished with value: 108224716490.13747 and parameters: {'alpha': 0.6996832329730948, 'epsilon_decay': 0.983659734406876, 'alpha_decay': 0.009183168100018936}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  1.0822471549013747e-06  result:  108224716490.13747


100%|██████████| 10000/10000 [00:06<00:00, 1512.20it/s]
[I 2024-07-18 13:24:47,906] Trial 26 finished with value: 88292032566.07265 and parameters: {'alpha': 0.5666059118363159, 'epsilon_decay': 0.9950690871056158, 'alpha_decay': 0.009890417623791684}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  8.829203156607264e-07  result:  88292032566.07265


 52%|█████▏    | 5202/10000 [00:03<00:02, 1633.54it/s]
[I 2024-07-18 13:24:51,102] Trial 27 finished with value: 257112189062.056 and parameters: {'alpha': 0.5234356510390632, 'epsilon_decay': 0.9664676561046498, 'alpha_decay': 0.007844029458850963}. Best is trial 6 with value: 48733958719.93359.


convergence_episode =  5202
[0 1 0 0 1 0 1 0]
mse:  2.57112188541856e-06  result:  257112189062.056


100%|██████████| 10000/10000 [00:06<00:00, 1543.79it/s]
[I 2024-07-18 13:24:57,588] Trial 28 finished with value: 61449523402.33128 and parameters: {'alpha': 0.6185918252324001, 'epsilon_decay': 0.975661850906743, 'alpha_decay': 0.004402785922631543}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  6.144952240233128e-07  result:  61449523402.33128


100%|██████████| 10000/10000 [00:06<00:00, 1500.44it/s]
[I 2024-07-18 13:25:04,270] Trial 29 finished with value: 222222197939.12976 and parameters: {'alpha': 0.5836109074077409, 'epsilon_decay': 0.9500872120808432, 'alpha_decay': 0.006496299512724289}. Best is trial 6 with value: 48733958719.93359.


[0 1 0 0 1 0 1 0]
mse:  2.2222219693912975e-06  result:  222222197939.12976


100%|██████████| 10000/10000 [00:06<00:00, 1628.11it/s]
[I 2024-07-18 13:25:10,421] Trial 30 finished with value: 67069006839.00229 and parameters: {'alpha': 0.7325949331785953, 'epsilon_decay': 0.9894495016861444, 'alpha_decay': 0.008751455565499867}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  6.706900583900229e-07  result:  67069006839.00229


100%|██████████| 10000/10000 [00:06<00:00, 1578.93it/s]
[I 2024-07-18 13:25:16,763] Trial 31 finished with value: 214623361007.6849 and parameters: {'alpha': 0.6364845482519836, 'epsilon_decay': 0.9848879352122459, 'alpha_decay': 0.008407482752995176}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 1 1 0 1 0]
mse:  2.146233600076849e-06  result:  214623361007.6849


100%|██████████| 10000/10000 [00:06<00:00, 1604.97it/s]
[I 2024-07-18 13:25:23,004] Trial 32 finished with value: 82317866897.54214 and parameters: {'alpha': 0.6775384082378101, 'epsilon_decay': 0.9820444200617546, 'alpha_decay': 0.009569828496741654}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  8.231786589754214e-07  result:  82317866897.54214


100%|██████████| 10000/10000 [00:06<00:00, 1632.04it/s]
[I 2024-07-18 13:25:29,143] Trial 33 finished with value: 95956374167.89621 and parameters: {'alpha': 0.6068056172927916, 'epsilon_decay': 0.9857847572145959, 'alpha_decay': 0.008117103931577729}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  9.59563731678962e-07  result:  95956374167.89621


100%|██████████| 10000/10000 [00:06<00:00, 1609.62it/s]
[I 2024-07-18 13:25:35,365] Trial 34 finished with value: 57647008999.64504 and parameters: {'alpha': 0.8872023038386873, 'epsilon_decay': 0.9923335350683508, 'alpha_decay': 0.00563035270684473}. Best is trial 6 with value: 48733958719.93359.


[0 0 0 0 1 0 1 0]
mse:  5.764700799964504e-07  result:  57647008999.64504


100%|██████████| 10000/10000 [00:06<00:00, 1662.72it/s]
[I 2024-07-18 13:25:41,389] Trial 35 finished with value: 23113262655.96678 and parameters: {'alpha': 0.8957711478356251, 'epsilon_decay': 0.9985025614789602, 'alpha_decay': 0.0026566966493291226}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  2.3113261655966783e-07  result:  23113262655.96678


 38%|███▊      | 3752/10000 [00:02<00:03, 1668.39it/s]
[I 2024-07-18 13:25:43,647] Trial 36 finished with value: 740961691295.5127 and parameters: {'alpha': 0.8420695800766033, 'epsilon_decay': 0.998698912446709, 'alpha_decay': 0.002250622184446817}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  3752
[0 1 0 1 1 0 1 0]
mse:  7.409616909203127e-06  result:  740961691295.5127


100%|██████████| 10000/10000 [00:06<00:00, 1605.35it/s]
[I 2024-07-18 13:25:49,888] Trial 37 finished with value: 38695908368.90979 and parameters: {'alpha': 0.560770479917866, 'epsilon_decay': 0.9570710517810763, 'alpha_decay': 0.002837473804273222}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  3.869590736890979e-07  result:  38695908368.90979


 48%|████▊     | 4829/10000 [00:03<00:03, 1549.03it/s]
[I 2024-07-18 13:25:53,014] Trial 38 finished with value: 75600572145.26419 and parameters: {'alpha': 0.8097558475976304, 'epsilon_decay': 0.9562186900140176, 'alpha_decay': 0.0028992010148653517}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  4829
[0 0 0 0 1 0 1 0]
mse:  7.56005716623642e-07  result:  75600572145.26419


100%|██████████| 10000/10000 [00:06<00:00, 1626.05it/s]
[I 2024-07-18 13:25:59,174] Trial 39 finished with value: 63336318636.165985 and parameters: {'alpha': 0.5642279040766915, 'epsilon_decay': 0.9695154251235576, 'alpha_decay': 0.001267907450094002}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  6.333631763616599e-07  result:  63336318636.165985


100%|██████████| 10000/10000 [00:06<00:00, 1517.29it/s]
[I 2024-07-18 13:26:05,774] Trial 40 finished with value: 68677544705.95722 and parameters: {'alpha': 0.5488113670362164, 'epsilon_decay': 0.9557846944637785, 'alpha_decay': 0.002859029666621893}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  6.867754370595722e-07  result:  68677544705.95722


100%|██████████| 10000/10000 [00:06<00:00, 1569.74it/s]
[I 2024-07-18 13:26:12,156] Trial 41 finished with value: 123569087928.04735 and parameters: {'alpha': 0.5287728153958718, 'epsilon_decay': 0.9961329348498515, 'alpha_decay': 0.002541181530345026}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.2356908692804735e-06  result:  123569087928.04735


100%|██████████| 10000/10000 [00:06<00:00, 1611.32it/s]
[I 2024-07-18 13:26:18,373] Trial 42 finished with value: 161639358908.17075 and parameters: {'alpha': 0.5038848034296975, 'epsilon_decay': 0.9651334980348374, 'alpha_decay': 0.0036327199541192237}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.6163935790817073e-06  result:  161639358908.17075


100%|██████████| 10000/10000 [00:06<00:00, 1585.46it/s]
[I 2024-07-18 13:26:24,689] Trial 43 finished with value: 55507622036.17486 and parameters: {'alpha': 0.575379882174602, 'epsilon_decay': 0.9883694963725064, 'alpha_decay': 0.004228701144808023}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  5.550762103617486e-07  result:  55507622036.17486


 47%|████▋     | 4675/10000 [00:03<00:03, 1548.86it/s]
[I 2024-07-18 13:26:27,721] Trial 44 finished with value: 395814291070.6846 and parameters: {'alpha': 0.5510753589688862, 'epsilon_decay': 0.9878161219937159, 'alpha_decay': 0.004336785172787358}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  4675
[0 1 0 0 1 0 1 0]
mse:  3.958142906031846e-06  result:  395814291070.6846


 35%|███▌      | 3545/10000 [00:02<00:04, 1601.03it/s]
[I 2024-07-18 13:26:29,946] Trial 45 finished with value: 729550720253.1141 and parameters: {'alpha': 0.7753474807621726, 'epsilon_decay': 0.9736821695729284, 'alpha_decay': 0.004249935859167447}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  3545
[0 1 0 1 1 0 1 0]
mse:  7.295507198986141e-06  result:  729550720253.1141


100%|██████████| 10000/10000 [00:06<00:00, 1612.97it/s]
[I 2024-07-18 13:26:36,158] Trial 46 finished with value: 65634230007.57737 and parameters: {'alpha': 0.5793735629422273, 'epsilon_decay': 0.9933749307548173, 'alpha_decay': 0.005151397503205957}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  6.563422900757737e-07  result:  65634230007.57737


100%|██████████| 10000/10000 [00:06<00:00, 1625.73it/s]
[I 2024-07-18 13:26:42,321] Trial 47 finished with value: 140390141634.01935 and parameters: {'alpha': 0.5306937653406868, 'epsilon_decay': 0.9903919680740595, 'alpha_decay': 0.0019754557776420698}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.4039014063401935e-06  result:  140390141634.01935


100%|██████████| 10000/10000 [00:06<00:00, 1531.18it/s]
[I 2024-07-18 13:26:48,863] Trial 48 finished with value: 63966252219.62301 and parameters: {'alpha': 0.558086690332744, 'epsilon_decay': 0.9777407089111205, 'alpha_decay': 0.003435344533565424}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  6.396625121962301e-07  result:  63966252219.62301


100%|██████████| 10000/10000 [00:05<00:00, 1683.91it/s]
[I 2024-07-18 13:26:54,811] Trial 49 finished with value: 92243824385.04852 and parameters: {'alpha': 0.5116254915358197, 'epsilon_decay': 0.9862549490293738, 'alpha_decay': 0.00305288330493381}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  9.224382338504853e-07  result:  92243824385.04852


100%|██████████| 10000/10000 [00:06<00:00, 1664.45it/s]
[I 2024-07-18 13:27:00,831] Trial 50 finished with value: 176204617115.2209 and parameters: {'alpha': 0.6490807008805384, 'epsilon_decay': 0.9807932874805776, 'alpha_decay': 0.0057140516388591404}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.7620461611522087e-06  result:  176204617115.2209


100%|██████████| 10000/10000 [00:05<00:00, 1689.68it/s]
[I 2024-07-18 13:27:06,760] Trial 51 finished with value: 81501506473.70497 and parameters: {'alpha': 0.6083588853685159, 'epsilon_decay': 0.9973429396880765, 'alpha_decay': 0.00471576644163873}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.150150547370496e-07  result:  81501506473.70497


100%|██████████| 10000/10000 [00:06<00:00, 1565.22it/s]
[I 2024-07-18 13:27:13,159] Trial 52 finished with value: 48635921242.07278 and parameters: {'alpha': 0.5939049841585018, 'epsilon_decay': 0.9937581451489939, 'alpha_decay': 0.0011810016595722875}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  4.863592024207278e-07  result:  48635921242.07278


100%|██████████| 10000/10000 [00:06<00:00, 1570.16it/s]
[I 2024-07-18 13:27:19,539] Trial 53 finished with value: 120697609533.26239 and parameters: {'alpha': 0.57631520331579, 'epsilon_decay': 0.9936174017975946, 'alpha_decay': 0.0011409707194958386}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.206976085332624e-06  result:  120697609533.26239


100%|██████████| 10000/10000 [00:06<00:00, 1660.23it/s]
[I 2024-07-18 13:27:25,573] Trial 54 finished with value: 85966677100.19577 and parameters: {'alpha': 0.5386359698260684, 'epsilon_decay': 0.996688463413406, 'alpha_decay': 0.0022880320409593316}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.596667610019577e-07  result:  85966677100.19577


 56%|█████▌    | 5619/10000 [00:03<00:02, 1621.94it/s]
[I 2024-07-18 13:27:29,047] Trial 55 finished with value: 225961706544.2684 and parameters: {'alpha': 0.6210540783009998, 'epsilon_decay': 0.9879273144531306, 'alpha_decay': 0.0015127041096689002}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  5619
[0 1 0 0 1 0 1 0]
mse:  2.259617059823684e-06  result:  225961706544.2684


100%|██████████| 10000/10000 [00:06<00:00, 1642.96it/s]
[I 2024-07-18 13:27:35,142] Trial 56 finished with value: 91406876284.70242 and parameters: {'alpha': 0.6773169108469013, 'epsilon_decay': 0.9909933976003674, 'alpha_decay': 0.0033365815437228804}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  9.140687528470243e-07  result:  91406876284.70242


100%|██████████| 10000/10000 [00:06<00:00, 1647.08it/s]
[I 2024-07-18 13:27:41,226] Trial 57 finished with value: 46345396897.65654 and parameters: {'alpha': 0.594452628523375, 'epsilon_decay': 0.9931574076996488, 'alpha_decay': 0.0038946475211538655}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  4.634539589765654e-07  result:  46345396897.65654


100%|██████████| 10000/10000 [00:06<00:00, 1634.71it/s]
[I 2024-07-18 13:27:47,353] Trial 58 finished with value: 51627293627.973 and parameters: {'alpha': 0.5947118912193743, 'epsilon_decay': 0.9936169101890947, 'alpha_decay': 0.0018917340007024224}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  5.1627292627973e-07  result:  51627293627.973


100%|██████████| 10000/10000 [00:06<00:00, 1522.63it/s]
[I 2024-07-18 13:27:53,929] Trial 59 finished with value: 186492022512.9329 and parameters: {'alpha': 0.6365031433746543, 'epsilon_decay': 0.9714466482626135, 'alpha_decay': 0.002405044485476925}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.8649202151293289e-06  result:  186492022512.9329


100%|██████████| 10000/10000 [00:06<00:00, 1556.52it/s]
[I 2024-07-18 13:28:00,365] Trial 60 finished with value: 86281583934.99544 and parameters: {'alpha': 0.5941419055561518, 'epsilon_decay': 0.997152738174477, 'alpha_decay': 0.0010374986153972405}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.628158293499544e-07  result:  86281583934.99544


100%|██████████| 10000/10000 [00:06<00:00, 1646.35it/s]
[I 2024-07-18 13:28:06,450] Trial 61 finished with value: 127466597730.34042 and parameters: {'alpha': 0.5940096035144836, 'epsilon_decay': 0.9936536435813411, 'alpha_decay': 0.0018850712129119779}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.2746659673034042e-06  result:  127466597730.34042


100%|██████████| 10000/10000 [00:06<00:00, 1568.09it/s]
[I 2024-07-18 13:28:12,837] Trial 62 finished with value: 88469038913.31113 and parameters: {'alpha': 0.5573097170867494, 'epsilon_decay': 0.9948306461641442, 'alpha_decay': 0.0014197354314822081}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.846903791331112e-07  result:  88469038913.31113


 45%|████▍     | 4454/10000 [00:02<00:03, 1511.04it/s]
[I 2024-07-18 13:28:15,798] Trial 63 finished with value: 356621774043.0741 and parameters: {'alpha': 0.6056387872864979, 'epsilon_decay': 0.953770741979257, 'alpha_decay': 0.0026867817863364236}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  4454
[0 1 0 1 1 0 1 0]
mse:  3.5662177359767407e-06  result:  356621774043.0741


100%|██████████| 10000/10000 [00:06<00:00, 1573.36it/s]
[I 2024-07-18 13:28:22,163] Trial 64 finished with value: 52917534232.378624 and parameters: {'alpha': 0.6560081989786973, 'epsilon_decay': 0.9904216917273684, 'alpha_decay': 0.0018030529052872977}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  5.291753323237862e-07  result:  52917534232.378624


 85%|████████▍ | 8481/10000 [00:05<00:01, 1485.26it/s]
[I 2024-07-18 13:28:27,885] Trial 65 finished with value: 146959052476.72815 and parameters: {'alpha': 0.5865196941743359, 'epsilon_decay': 0.9924466283080486, 'alpha_decay': 0.0038453787899173303}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  8481
[0 0 0 0 1 0 1 0]
mse:  1.4695905162862815e-06  result:  146959052476.72815


100%|██████████| 10000/10000 [00:06<00:00, 1455.40it/s]
[I 2024-07-18 13:28:34,774] Trial 66 finished with value: 132685034029.79555 and parameters: {'alpha': 0.6942707249135087, 'epsilon_decay': 0.9627789346689682, 'alpha_decay': 0.0021585910009188575}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.3268503302979555e-06  result:  132685034029.79555


100%|██████████| 10000/10000 [00:06<00:00, 1540.87it/s]
[I 2024-07-18 13:28:41,275] Trial 67 finished with value: 119809130095.78494 and parameters: {'alpha': 0.6182706875099933, 'epsilon_decay': 0.994097652736674, 'alpha_decay': 0.0096045282097852}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.1980912909578494e-06  result:  119809130095.78494


100%|██████████| 10000/10000 [00:06<00:00, 1520.15it/s]
[I 2024-07-18 13:28:47,862] Trial 68 finished with value: 73548486725.53357 and parameters: {'alpha': 0.5699542967075354, 'epsilon_decay': 0.9960082049815143, 'alpha_decay': 0.0030654061218375155}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  7.354848572553357e-07  result:  73548486725.53357


100%|██████████| 10000/10000 [00:06<00:00, 1490.75it/s]
[I 2024-07-18 13:28:54,582] Trial 69 finished with value: 81088579270.0207 and parameters: {'alpha': 0.6320114825644854, 'epsilon_decay': 0.9985490451714029, 'alpha_decay': 0.003920446690033919}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.108857827002071e-07  result:  81088579270.0207


100%|██████████| 10000/10000 [00:06<00:00, 1494.28it/s]
[I 2024-07-18 13:29:01,285] Trial 70 finished with value: 28958637744.39061 and parameters: {'alpha': 0.541532620399185, 'epsilon_decay': 0.9834855869717274, 'alpha_decay': 0.006789779892418453}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  2.895863674439061e-07  result:  28958637744.39061


 33%|███▎      | 3295/10000 [00:02<00:04, 1634.42it/s]
[I 2024-07-18 13:29:03,312] Trial 71 finished with value: 410159607632.33234 and parameters: {'alpha': 0.5375872862896147, 'epsilon_decay': 0.9835984170499809, 'alpha_decay': 0.008963135160979917}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  3295
[0 1 0 1 1 0 1 0]
mse:  4.101596073028323e-06  result:  410159607632.33234


100%|██████████| 10000/10000 [00:06<00:00, 1596.85it/s]
[I 2024-07-18 13:29:09,585] Trial 72 finished with value: 31328077890.181633 and parameters: {'alpha': 0.5592446111860365, 'epsilon_decay': 0.9795935324020161, 'alpha_decay': 0.006370838888358018}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  3.1328076890181633e-07  result:  31328077890.181633


100%|██████████| 10000/10000 [00:06<00:00, 1593.03it/s]
[I 2024-07-18 13:29:15,874] Trial 73 finished with value: 108845796352.17747 and parameters: {'alpha': 0.5599964253351037, 'epsilon_decay': 0.9795884892665325, 'alpha_decay': 0.0069118397114940255}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.0884579535217747e-06  result:  108845796352.17747


 18%|█▊        | 1831/10000 [00:01<00:05, 1576.42it/s]
[I 2024-07-18 13:29:17,046] Trial 74 finished with value: 390496987054.2065 and parameters: {'alpha': 0.5148056048882804, 'epsilon_decay': 0.9762726231313575, 'alpha_decay': 0.005871901530730365}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  1831
[0 1 0 1 1 0 1 0]
mse:  3.904969868711065e-06  result:  390496987054.2065


 50%|█████     | 5038/10000 [00:03<00:03, 1544.70it/s]
[I 2024-07-18 13:29:20,319] Trial 75 finished with value: 440895929835.6616 and parameters: {'alpha': 0.5479326541388344, 'epsilon_decay': 0.9819283627677714, 'alpha_decay': 0.00741969135301492}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  5038
[0 1 0 1 1 0 1 0]
mse:  4.408959293318617e-06  result:  440895929835.6616


100%|██████████| 10000/10000 [00:06<00:00, 1513.54it/s]
[I 2024-07-18 13:29:26,937] Trial 76 finished with value: 134320643861.54564 and parameters: {'alpha': 0.5195341688017907, 'epsilon_decay': 0.9788974659563661, 'alpha_decay': 0.006478602253734449}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.3432064286154564e-06  result:  134320643861.54564


 63%|██████▎   | 6339/10000 [00:04<00:02, 1435.64it/s]
[I 2024-07-18 13:29:31,364] Trial 77 finished with value: 80443570267.68042 and parameters: {'alpha': 0.5353587572941881, 'epsilon_decay': 0.98431436748295, 'alpha_decay': 0.006866215376629345}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  6339
[0 0 0 0 1 0 1 0]
mse:  8.044356963378043e-07  result:  80443570267.68042


100%|██████████| 10000/10000 [00:06<00:00, 1602.23it/s]
[I 2024-07-18 13:29:37,616] Trial 78 finished with value: 27544475216.428143 and parameters: {'alpha': 0.5678177373088867, 'epsilon_decay': 0.9717146450172249, 'alpha_decay': 0.0060498797301029444}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  2.7544474216428144e-07  result:  27544475216.428143


100%|██████████| 10000/10000 [00:06<00:00, 1589.79it/s]
[I 2024-07-18 13:29:43,918] Trial 79 finished with value: 93864138956.80475 and parameters: {'alpha': 0.5829065322041174, 'epsilon_decay': 0.9709755546408233, 'alpha_decay': 0.005355465870278731}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  9.386413795680475e-07  result:  93864138956.80475


100%|██████████| 10000/10000 [00:06<00:00, 1596.10it/s]
[I 2024-07-18 13:29:50,194] Trial 80 finished with value: 81327625130.4133 and parameters: {'alpha': 0.5681109956022031, 'epsilon_decay': 0.960691977277281, 'alpha_decay': 0.006049278059210076}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.13276241304133e-07  result:  81327625130.4133


100%|██████████| 10000/10000 [00:06<00:00, 1631.86it/s]
[I 2024-07-18 13:29:56,334] Trial 81 finished with value: 229946580446.82022 and parameters: {'alpha': 0.5541139480725881, 'epsilon_decay': 0.9772310814911285, 'alpha_decay': 0.0064042325787594485}. Best is trial 35 with value: 23113262655.96678.


[0 1 0 0 1 0 1 0]
mse:  2.299465794468202e-06  result:  229946580446.82022


100%|██████████| 10000/10000 [00:06<00:00, 1527.21it/s]
[I 2024-07-18 13:30:02,892] Trial 82 finished with value: 94729387027.6471 and parameters: {'alpha': 0.5765200614319567, 'epsilon_decay': 0.9808795123339827, 'alpha_decay': 0.009467368828203607}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  9.472938602764709e-07  result:  94729387027.6471


 63%|██████▎   | 6339/10000 [00:04<00:02, 1486.87it/s]
[I 2024-07-18 13:30:07,166] Trial 83 finished with value: 75702218405.7168 and parameters: {'alpha': 0.5634668825902492, 'epsilon_decay': 0.9574434616314862, 'alpha_decay': 0.004874835395983691}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  6339
[0 0 0 0 1 0 1 0]
mse:  7.570221777181681e-07  result:  75702218405.7168


100%|██████████| 10000/10000 [00:06<00:00, 1568.23it/s]
[I 2024-07-18 13:30:13,555] Trial 84 finished with value: 89618700771.47359 and parameters: {'alpha': 0.542884361101735, 'epsilon_decay': 0.9673806303348493, 'alpha_decay': 0.0066500022256758065}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  8.961869977147359e-07  result:  89618700771.47359


100%|██████████| 10000/10000 [00:06<00:00, 1475.72it/s]
[I 2024-07-18 13:30:20,344] Trial 85 finished with value: 69366794107.60602 and parameters: {'alpha': 0.6012543119579533, 'epsilon_decay': 0.9745895710516833, 'alpha_decay': 0.007357679455164906}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  6.936679310760602e-07  result:  69366794107.60602


100%|██████████| 10000/10000 [00:06<00:00, 1558.47it/s]
[I 2024-07-18 13:30:26,774] Trial 86 finished with value: 111999612220.67674 and parameters: {'alpha': 0.7551148519603886, 'epsilon_decay': 0.9866449912759622, 'alpha_decay': 0.007758799145927569}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.1199961122067675e-06  result:  111999612220.67674


 82%|████████▏ | 8171/10000 [00:05<00:01, 1497.83it/s]
[I 2024-07-18 13:30:32,240] Trial 87 finished with value: 41013650147.85954 and parameters: {'alpha': 0.8357904004873968, 'epsilon_decay': 0.989628874957166, 'alpha_decay': 0.004600870856494077}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  8171
[0 0 0 0 1 0 1 0]
mse:  4.1013649330759544e-07  result:  41013650147.85954


100%|██████████| 10000/10000 [00:06<00:00, 1556.91it/s]
[I 2024-07-18 13:30:38,674] Trial 88 finished with value: 198691818363.8619 and parameters: {'alpha': 0.8890940173727784, 'epsilon_decay': 0.9921069139822493, 'alpha_decay': 0.004574010198345523}. Best is trial 35 with value: 23113262655.96678.


[0 1 0 0 1 0 1 0]
mse:  1.986918173638619e-06  result:  198691818363.8619


100%|██████████| 10000/10000 [00:06<00:00, 1524.29it/s]
[I 2024-07-18 13:30:45,245] Trial 89 finished with value: 140677009331.93518 and parameters: {'alpha': 0.8616298908477621, 'epsilon_decay': 0.9893659607124681, 'alpha_decay': 0.005177375564865009}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.4067700833193518e-06  result:  140677009331.93518


100%|██████████| 10000/10000 [00:07<00:00, 1405.85it/s]
[I 2024-07-18 13:30:52,369] Trial 90 finished with value: 72320751575.6759 and parameters: {'alpha': 0.8372208148374902, 'epsilon_decay': 0.9521581078375675, 'alpha_decay': 0.004029183655313341}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  7.23207505756759e-07  result:  72320751575.6759


 51%|█████     | 5120/10000 [00:03<00:03, 1492.45it/s]
[I 2024-07-18 13:30:55,810] Trial 91 finished with value: 111999245048.8892 and parameters: {'alpha': 0.8087159194676009, 'epsilon_decay': 0.9974408000135913, 'alpha_decay': 0.0061351223805102195}. Best is trial 35 with value: 23113262655.96678.


convergence_episode =  5120
[0 0 0 0 1 0 1 0]
mse:  1.119992445368892e-06  result:  111999245048.8892


100%|██████████| 10000/10000 [00:06<00:00, 1497.28it/s]
[I 2024-07-18 13:31:02,499] Trial 92 finished with value: 117590154540.40385 and parameters: {'alpha': 0.8794723020719584, 'epsilon_decay': 0.9854456301336342, 'alpha_decay': 0.005500288252789185}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.1759015354040385e-06  result:  117590154540.40385


100%|██████████| 10000/10000 [00:06<00:00, 1550.77it/s]
[I 2024-07-18 13:31:08,958] Trial 93 finished with value: 109824639136.66026 and parameters: {'alpha': 0.7836482575958218, 'epsilon_decay': 0.9873161856894659, 'alpha_decay': 0.0033545179758667915}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  1.0982463813666026e-06  result:  109824639136.66026


100%|██████████| 10000/10000 [00:07<00:00, 1372.42it/s]
[I 2024-07-18 13:31:16,265] Trial 94 finished with value: 225328776905.80374 and parameters: {'alpha': 0.8978604697815196, 'epsilon_decay': 0.9829692613377473, 'alpha_decay': 0.0071695687795045905}. Best is trial 35 with value: 23113262655.96678.


[0 1 0 0 1 0 1 0]
mse:  2.2532877590580373e-06  result:  225328776905.80374


100%|██████████| 10000/10000 [00:06<00:00, 1503.15it/s]
[I 2024-07-18 13:31:22,929] Trial 95 finished with value: 74271109663.85039 and parameters: {'alpha': 0.8381467791307168, 'epsilon_decay': 0.9951867877071312, 'alpha_decay': 0.004877427293982632}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  7.427110866385039e-07  result:  74271109663.85039


100%|██████████| 10000/10000 [00:06<00:00, 1467.65it/s]
[I 2024-07-18 13:31:29,755] Trial 96 finished with value: 42336764918.32807 and parameters: {'alpha': 0.6120686716111784, 'epsilon_decay': 0.9900326082431875, 'alpha_decay': 0.0031503464356771585}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  4.233676391832807e-07  result:  42336764918.32807


100%|██████████| 10000/10000 [00:06<00:00, 1468.72it/s]
[I 2024-07-18 13:31:36,574] Trial 97 finished with value: 91137955620.43338 and parameters: {'alpha': 0.6128526233907652, 'epsilon_decay': 0.9910379472162553, 'alpha_decay': 0.0031709981120409615}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  9.113795462043338e-07  result:  91137955620.43338


100%|██████████| 10000/10000 [00:06<00:00, 1502.20it/s]
[I 2024-07-18 13:31:43,242] Trial 98 finished with value: 247900081840.07642 and parameters: {'alpha': 0.6417223212310632, 'epsilon_decay': 0.9890754631235331, 'alpha_decay': 0.002773395421131439}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 1 1 0 1 0]
mse:  2.479000808400764e-06  result:  247900081840.07642


100%|██████████| 10000/10000 [00:06<00:00, 1433.35it/s]
[I 2024-07-18 13:31:50,228] Trial 99 finished with value: 33876085629.591873 and parameters: {'alpha': 0.6203003820116183, 'epsilon_decay': 0.9927548960917832, 'alpha_decay': 0.0026010696445330455}. Best is trial 35 with value: 23113262655.96678.


[0 0 0 0 1 0 1 0]
mse:  3.3876084629591874e-07  result:  33876085629.591873
Best hyperparameters:  {'alpha': 0.8957711478356251, 'epsilon_decay': 0.9985025614789602, 'alpha_decay': 0.0026566966493291226}


100%|██████████| 10000/10000 [00:06<00:00, 1480.05it/s]



 0 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1546.11it/s]



 1 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1430.27it/s]



 2 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1464.04it/s]



 3 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1457.56it/s]



 4 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1547.62it/s]



 5 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1505.81it/s]



 6 FINAL OPTIMAL POLICY [0 1 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1487.05it/s]



 7 FINAL OPTIMAL POLICY [0 1 0 0 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1550.90it/s]



 8 FINAL OPTIMAL POLICY [0 0 0 1 1 0 1 0]


100%|██████████| 10000/10000 [00:06<00:00, 1446.79it/s]


 9 FINAL OPTIMAL POLICY [0 0 0 0 1 0 1 0]





# Run the DQN for the environment

In [None]:
# Train the Deep Q-Network agent on `environment`, then run the trained agent
# through the class's test routine.
# `environment` and `gamma` are not defined in this cell; they must already have
# been set by earlier cells.
# The recorded run of this cell took roughly three minutes for 10000 episodes.
num_of_episodes = 10000   # number of training episodes passed to train_DQN
NN_learning_rate = 0.01   # passed to train_DQN; presumably the network optimizer's step size — confirm in the class
dql = stock_market_trading_DQN()
# The value returned by train_DQN is kept as `optimal_network`; the
# policy-comparison cell later passes it to dql.print_dqn().
optimal_network = dql.train_DQN(num_of_episodes,environment,gamma,NN_learning_rate)
# NOTE(review): the literal 10 is presumably the number of test episodes —
# confirm against test_DQN's signature.
dql.test_DQN(10,environment)  

100%|██████████| 10000/10000 [03:01<00:00, 55.20it/s]


# Comparing Optimal Policies Generated by Different Algorithms

In [None]:
# Report the Phase 1 policy and Q-values next to the Phase 2 DQN results.
# Relies on P_opt1, Q_opt, environment, dql and optimal_network from earlier cells.
section_separator = "================================================================"

# --- Phase 1: policy and Q-values ---
print("Phase 1 Optimal Policy")
print_policy(P_opt1, len(environment))
print("\nOptimal Q = ", Q_opt)
print(section_separator)

# --- Phase 2: Tabular Q-Learning policy (currently disabled) ---
# print("Phase 2 - Tubular Q-Learning Optimal Policy")
# print(np.argmax(Q_tubular,axis=1))
# print("================================================================")

# --- Phase 2: DQN policy; the value returned by print_dqn is kept as Q_NN ---
print("Phase 2 - DQN Optimal Policy")
Q_NN = dql.print_dqn(optimal_network)
print(section_separator)

# --- Difference against the tabular result (currently disabled) ---
# print("\nDifference With Tabular")
# difference,total_error = calculate_difference_and_mse(Q_opt,Q_tubular)
# print(f"difference {difference}\nTotal Error: {total_error}")

# --- Difference against the DQN result: show Q_NN, then compare it with Q_opt ---
print("\nDifference With NN", Q_NN, sep="\n")
difference, total_error = calculate_difference_and_mse(Q_opt, Q_NN)
print(f"difference {difference}\nTotal Error: {total_error}")



Phase 1 Optimal Policy
State 0: Action 0
State 1: Action 1
State 2: Action 1
State 3: Action 1
State 4: Action 1
State 5: Action 0
State 6: Action 1
State 7: Action 1
State 8: Action 0
State 9: Action 1
State 10: Action 0
State 11: Action 1
State 12: Action 1
State 13: Action 1
State 14: Action 1
State 15: Action 0
State 16: Action 1
State 17: Action 1
State 18: Action 1
State 19: Action 1
State 20: Action 1
State 21: Action 1
State 22: Action 1
State 23: Action 1

Optimal Q =  [[0.03031047 0.02815876]
 [0.02638227 0.13133163]
 [0.04941365 0.05844365]
 [0.01009096 0.02041519]
 [0.01933133 0.10841806]
 [0.08369228 0.0101682 ]
 [0.02019804 0.0914925 ]
 [0.06360484 0.13300494]
 [0.06827976 0.03619543]
 [0.04675142 0.10075909]
 [0.01271144 0.01238558]
 [0.05064312 0.10624941]
 [0.00558377 0.09212666]
 [0.07747172 0.12413358]
 [0.00161899 0.12226172]
 [0.07114383 0.03561536]
 [0.05897908 0.07234668]
 [0.04094774 0.10078331]
 [0.02838915 0.076867  ]
 [0.00857494 0.04552349]
 [0.03768553 0.06