In [2]:
import gymnasium as gym 
import os
import numpy as np
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statistics 

In [5]:
class Qlearning:
    def __init__(self, environment_name, episodes, epsilon = 0.2, alpha=0.2, gamma=0.95):
        self.env = gym.make(environment_name)
        self.episodes = episodes
        self.episode_data = 500
        self.param = alpha
        self.ep_rewards_table = {'ep': [], 'avg_'+str(self.param): [], 'min_'+str(self.param): [], 'max_'+str(self.param): [], 'std_'+str(self.param): [], 'mid_'+str(self.param): []}
        
        # Initialize Q
        self.space_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.upper_bounds = [self.env.observation_space.high[0], 0.5, self.env.observation_space.high[2], math.radians(50) / 1.]
        self.lower_bounds = [self.env.observation_space.low[0], -0.5, self.env.observation_space.low[2], -math.radians(50) / 1.]
        self.number_bins = 50
        self.Q = np.random.randn(self.number_bins, self.number_bins, self.number_bins, self.number_bins, self.action_size)
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma

    def select_e_greedy(self, state, epsilon):
        if np.random.rand() < epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def discretize_state(self, observation):
        pos_index = np.argmin(np.abs(np.linspace(self.lower_bounds[0], self.upper_bounds[0], num=self.number_bins).tolist() - observation[0]))
        vel_index = np.argmin(np.abs(np.linspace(self.lower_bounds[1], self.upper_bounds[1], num=self.number_bins).tolist() - observation[1]))
        ang_index = np.argmin(np.abs(np.linspace(self.lower_bounds[2], self.upper_bounds[2], num=self.number_bins).tolist() - observation[2]))
        ang_vel_index = np.argmin(np.abs(np.linspace(self.lower_bounds[3], self.upper_bounds[3], num=self.number_bins).tolist() - observation[3]))
        return pos_index, vel_index, ang_index, ang_vel_index

    def test_q(self):
        state, info = self.env.reset()
        score = 0 
        epsilon = 0.0
        d_state = self.discretize_state(state)
        action = self.select_e_greedy(d_state, epsilon)
        terminated = False
        truncated = False
        while not (terminated or truncated or score > 500):
            state_prime, reward, terminated, truncated, info = self.env.step(action)
            d_state_prime = self.discretize_state(state_prime)
            action_prime = self.select_e_greedy(d_state_prime, epsilon)
            d_state = d_state_prime
            action = action_prime
            score += reward
        self.env.close()
        return score

    def train(self):
        ep_rewards = []
        ep_rewards_t = []

        for episode in range(1, self.episodes + 1):
            state, info = self.env.reset()
            score = 0 
            d_state = self.discretize_state(state)
            #action = self.select_e_greedy(d_state, self.epsilon)
            terminated = False
            truncated = False

            while not (terminated or truncated or score > 500):
                
                action = self.select_e_greedy(d_state, self.epsilon)

                state_prime, reward, terminated, truncated, info = self.env.step(action)
                d_state_prime = self.discretize_state(state_prime)        
            
                self.Q[d_state+(action,)] += self.alpha * (reward + self.gamma * np.max(self.Q[d_state_prime]) - self.Q[d_state+(action,)])
                
                d_state = d_state_prime
                score += reward

            t_score = self.test_q()
            ep_rewards.append(score)
            ep_rewards_t.append(t_score)

            if not episode % self.episode_data:
                avg_reward = sum(ep_rewards_t[-self.episode_data:]) / len(ep_rewards_t[-self.episode_data:])
                self.ep_rewards_table['ep'].append(episode)
                self.ep_rewards_table['avg_'+str(self.param)].append(avg_reward)
                self.ep_rewards_table['min_'+str(self.param)].append(min(ep_rewards_t[-self.episode_data:]))
                self.ep_rewards_table['max_'+str(self.param)].append(max(ep_rewards_t[-self.episode_data:]))
                self.ep_rewards_table['std_'+str(self.param)].append(np.std(ep_rewards_t[-self.episode_data:]))
                self.ep_rewards_table['mid_'+str(self.param)].append(statistics.median(ep_rewards_t[-self.episode_data:]))
                
                print(f"Episode:{episode} avg:{avg_reward} min:{min(ep_rewards_t[-self.episode_data:])} max:{max(ep_rewards_t[-self.episode_data:])} std:{np.std(ep_rewards_t[-self.episode_data:])}")

        self.env.close()

In [6]:
alphas = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4]
#alphas = [0.9, 0.8]
episodes = 50001
episode_data = 500
environment_name = "CartPole-v1"
agents_ql_alpha = []

for alpha in alphas:
    print(f"Alpha = {alpha}")
    agent = Qlearning(environment_name, episodes, epsilon = 0.25, alpha= alpha, gamma= 0.95)
    agent.train()
    agents_ql_alpha.append(agent)

Alpha = 1
Episode:500 avg:21.456 min:8.0 max:172.0 std:15.058952951649726
Episode:1000 avg:19.434 min:8.0 max:108.0 std:11.322086556814517
Episode:1500 avg:18.26 min:8.0 max:114.0 std:10.851193482746494
Episode:2000 avg:20.814 min:8.0 max:147.0 std:14.574203374455841
Episode:2500 avg:26.778 min:8.0 max:465.0 std:30.59837766941248
Episode:3000 avg:40.644 min:8.0 max:500.0 std:53.19328213223922
Episode:3500 avg:56.624 min:8.0 max:500.0 std:80.35784108598239
Episode:4000 avg:46.324 min:8.0 max:377.0 std:51.969943467354284
Episode:4500 avg:55.178 min:9.0 max:500.0 std:75.07024920699278
Episode:5000 avg:89.12 min:8.0 max:500.0 std:121.91451759327107
Episode:5500 avg:67.96 min:8.0 max:500.0 std:92.9232930970486
Episode:6000 avg:73.866 min:9.0 max:500.0 std:90.11308475465702
Episode:6500 avg:76.942 min:9.0 max:500.0 std:94.38901756030731
Episode:7000 avg:92.072 min:9.0 max:500.0 std:101.72963587863666
Episode:7500 avg:102.512 min:10.0 max:500.0 std:110.3827788017678
Episode:8000 avg:91.05 min

In [None]:
plt.figure(figsize=(10, 6))


for agent in agents_ql_alpha:
    #print(agent)
    if agent.param > 0.3:
        df = pd.DataFrame(agent.ep_rewards_table)


        # Plot the mean rewards as a main line
        sns.lineplot(data=df, x='ep', y='avg_'+str(agent.param), label='Alpha '+str(agent.param))

        # Fill the area between (mean - std) and (mean + std) as the interval
        #plt.fill_between(df['ep'], df['avg_'+str(agent.param)] - df['std_'+str(agent.param)], df['avg_'+str(agent.param)] + df['std_'+str(agent.param)], alpha=0.3)

# Customize the plot
plt.title('Mean Reward with Confidence Interval (Window Size = 500 Episodes)')
plt.xlabel('Episode')
plt.ylabel('Mean Reward')
plt.legend()

# Show the plot
plt.show()