In [2]:
!pip install gymnasium
!pip install swig     #This solves the errori in the installation of gymnasium[box2d]
!pip install gymnasium[box2d]
!pip install gym-notebook-wrapper   #This installs Gym-Notebook-Wrapper, that provides small wrappers for running and rendering OpenAI Gym

#To solve the xvfb missing file problem
!sudo apt-get install xvfb
!pip install xvfbwrapper

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting swig
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.1.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (

In [7]:
# Fetch the project repository and make it the working directory.
# NOTE(review): presumably this is what makes the local `config` and `model`
# modules imported below resolvable - confirm against the repository layout.
# NOTE(review): not idempotent - running this cell a second time clones again
# relative to the new working directory.
!git clone https://github.com/ebisunti/Prova          #It clones my github repository
%cd Prova


import os
import random
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch import Tensor

import config
from model import Model


# ---- Global training state shared by both the Q-table and the NN branch ----
episode_reward = 0  # running reward of the current episode (info / debugging)
# Replay memory: a bounded queue, so the oldest transition is evicted
# automatically once BUFFER_SIZE entries are stored.
buffer = deque([], maxlen=config.BUFFER_SIZE)
epsilon = config.MAX_EPSILON   # current exploration rate
alpha = config.ALPHA           # step size used by the tabular update
decay = config.EPSILON_DECAY   # multiplicative epsilon decay used by the NN agent
# Per-episode cumulative rewards, kept for plotting
cum_reward_table = np.zeros(config.NUM_EPISODES)
cum_reward_nn = np.zeros(config.NUM_EPISODES)

# Online network: the one that is trained and used to act
model = Model().to(config.DEVICE)
#print(model)

# Target network: used to evaluate the bootstrap target.
target_model = Model().to(config.DEVICE)
# Start the target network from the same weights as the online network.
# Without this it holds unrelated random weights until the first periodic
# target update, so the earliest training targets are meaningless.
target_model.load_state_dict(model.state_dict())

optimizer = optim.Adam(model.parameters(), lr=config.LR)
# Only handed to config.load_model when the target network is refreshed
optimizer_target = optim.Adam(target_model.parameters(), lr=config.LR)

huber_loss = nn.HuberLoss(delta=1.0)

def select_action(state, epsilon):
    """Epsilon-greedy action choice backed by the Q table.

    With probability ``epsilon`` a random action is sampled from the
    environment's action space; otherwise the action with the largest
    Q value in row ``state`` of the global table ``Q`` is returned.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    greedy_action = np.argmax(Q[state])
    return greedy_action

def select_action_nn(state, epsilon):
    """Epsilon-greedy action choice backed by the online Q-network.

    Args:
        state: current observation (array-like of floats).
        epsilon: probability of taking a random exploratory action.

    Returns:
        A random action sampled from the environment when exploring,
        otherwise the index (``int``) of the largest Q-value predicted by
        the global ``model`` for ``state``.
    """
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    # The model lives on config.DEVICE, so the input must be placed there too.
    state_tensor = torch.as_tensor(state, dtype=torch.float32, device=config.DEVICE)
    # Action selection needs no gradients.
    with torch.no_grad():
        q_values = model(state_tensor)
    # The previous code passed the bound method `.numpy` (missing call
    # parentheses) to np.argmax, which always produced action 0.
    return int(torch.argmax(q_values).item())


def update_epsilon(epsilon):
    """Shrink epsilon by one hundredth of its value, floored at MIN_EPSILON.

    Returns ``config.MIN_EPSILON`` as soon as the reduced value reaches or
    drops below it, otherwise the reduced value.
    """
    reduced = epsilon - epsilon/100
    return config.MIN_EPSILON if reduced <= config.MIN_EPSILON else reduced

def update_epsilon_nn(epsilon):
    """Multiply epsilon by the global ``decay`` factor, floored at MIN_EPSILON.

    Returns ``config.MIN_EPSILON`` once the decayed value reaches or drops
    below it, otherwise the decayed value.
    """
    decayed = epsilon * decay
    if decayed > config.MIN_EPSILON:
        return decayed
    return config.MIN_EPSILON


# Create the LunarLander environment used by both training branches
env = gym.make("LunarLander-v2", render_mode="human")

# The Q table is only allocated when the tabular agent is selected:
# one row per discretized state, one column per action.
if config.use_qtable:
    #Q = np.zeros((27684, env.action_space.n))  # little discretization
    Q = np.zeros((19051200, env.action_space.n))  # big discretization

### Inspect the bounds of the Box observation space
#print(env.observation_space.high)
#print(env.observation_space.low)

### Inspect the action space and the observation space in more detail
#print(env.action_space)
#print(env.observation_space)


# Main training loop. Depending on config.use_qtable the agent learns either
# with tabular Q-learning or with a double DQN. Each episode is capped at 500
# steps, and the per-episode cumulative reward is recorded for plotting.
if config.use_qtable:  # use a Q table to reach the goal
    for i in range(config.NUM_EPISODES):
        observation, info = env.reset()  # pass a seed here to get the same initial state
        #state = config.discretize(observation)
        state = config.big_discretize(observation)
        for j in range(500):
            action = select_action(state, epsilon)
            obv, reward, done, truncated, info = env.step(action)
            #next_state = config.discretize(obv)
            next_state = config.big_discretize(obv)

            next_max = np.max(Q[next_state])

            # Q-learning update towards reward + GAMMA * max_a' Q(s', a')
            Q[state, action] += alpha*(reward + config.GAMMA*next_max - Q[state, action])
            state = next_state

            episode_reward += reward

            if done or truncated:
                break

        print("episode: ", i)
        print("episode cumulative reward : ", episode_reward)
        print("epsilon: ",epsilon)
        epsilon = update_epsilon(epsilon)
        cum_reward_table[i] = episode_reward
        episode_reward = 0  # reset the total reward each episode

    # Save the Q table for testing
    #np.savetxt('q_table_little_discretization2000.csv', Q, delimiter=',') # full precision
    np.savetxt('q_table_big_discretization1000.csv', Q, delimiter=',')  # full precision

else:  # use a neural network to approximate the Q function
    for i in range(config.NUM_EPISODES):
        # reset() returns (observation, info) and step() returns five values,
        # exactly as the Q-table branch above already unpacks them. The
        # previous 1-value / 4-value unpacking raised on the first step.
        state, info = env.reset()
        for j in range(500):
            action = select_action_nn(state, epsilon)
            next_state, reward, done, truncated, info = env.step(action)

            episode_reward += reward

            # `buffer` is a bounded deque, so appending to a full buffer drops
            # the oldest transition automatically; no manual popleft is needed.
            # Row layout: [state(8), action, reward, next_state(8), done]
            buffer.append([*state, action, reward, *next_state, done])

            state = next_state  # update current state

            if done or truncated:

                # Train the network every 4 episodes, once the buffer holds at
                # least BATCH_SIZE transitions
                if len(buffer) >= config.BATCH_SIZE and ((i+1) % 4 == 0):
                    batch = random.sample(buffer, config.BATCH_SIZE)
                    dataset = np.array(batch)
                    # Slice the flat rows back into their components and move
                    # them to the device the networks live on.
                    states = torch.from_numpy(dataset[:, :8].astype('float32')).to(config.DEVICE)
                    actions = torch.from_numpy(dataset[:, 8:9].astype('int64')).to(config.DEVICE)
                    rewards = torch.from_numpy(dataset[:, 9:10].astype('float32')).to(config.DEVICE)
                    next_states = torch.from_numpy(dataset[:, 10:18].astype('float32')).to(config.DEVICE)
                    dones = torch.from_numpy(dataset[:, 18:19].astype('float32')).to(config.DEVICE)

                    #-------vanilla dqn (kept for reference)------------#
                    # with torch.no_grad():
                    #     next_rewards = target_model(next_states).max(1)[0].unsqueeze(1)

                    #---------double dqn-------------#
                    # The target is a constant for the optimisation step, so it
                    # is computed without building an autograd graph.
                    with torch.no_grad():
                        # Choose the next action with the online network ...
                        next_actions = model(next_states).argmax(dim=1, keepdim=True)
                        # ... and evaluate Q(s', a') with the target network
                        next_rewards = target_model(next_states).gather(1, next_actions)
                    #-------------------------------#

                    # No bootstrapping from terminal transitions (done == 1)
                    targets = rewards + config.GAMMA_NN*next_rewards*(1-dones)

                    # Q-values predicted by the model for the actions actually taken
                    output = model(states).gather(1, actions)
                    # Compute the Huber loss
                    loss = huber_loss(output, targets)
                    # Train the network
                    optimizer.zero_grad()  # clear existing gradients
                    loss.backward()        # backpropagate the error
                    optimizer.step()       # update weights
                    # Save the weights of the network
                    config.save_model(model, optimizer, i+1)
                    print("Save weigths in: "+ config.CHECKPOINT)
                    # Epsilon decays only after a training step
                    epsilon = update_epsilon_nn(epsilon)

                # Refresh the target network from the saved checkpoint every
                # TARGET_FREQ_UPDATE episodes
                if (i+1) % config.TARGET_FREQ_UPDATE == 0:
                    print("Target network updated")
                    config.load_model(config.CHECKPOINT, target_model, optimizer_target)

                print("episode ", i)
                print("episode cumulative reward: ", episode_reward)
                print("current epsilon: ", epsilon)
                print("#---------------------------------------------#")
                break

        cum_reward_nn[i] = episode_reward
        episode_reward = 0



env.close()

###### Plot the cumulative reward per episode collected during training
# x axis: episode indices 0 .. NUM_EPISODES-1
ep = np.arange(config.NUM_EPISODES)

# Plot the curve of the agent that was actually trained. Previously the
# Q-table array was always plotted, so an NN run produced an all-zero curve
# labelled "Q table policy".
if config.use_qtable:
    reward_curve = cum_reward_table
    curve_label = "Q table policy"
    plot_path = 'plots/Q_table_big1000.png'
else:
    reward_curve = cum_reward_nn
    curve_label = "Neural network policy"
    plot_path = 'plots/DQN_training.png'

fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111)
ax.plot(ep, reward_curve, label=curve_label)  # Plot the reward curve on the axes.
ax.set_xlabel('EPISODES', fontsize=14)  # Add an x-label to the axes.
ax.set_ylabel('CUMULATIVE REWARD', fontsize=14)  # Add a y-label to the axes.
ax.set_title("TRAINING AGENT", fontsize=18)  # Add a title to the axes.
ax.legend(loc=(0.3, -0.1))  # Add a legend.

# Axis tick step: about 20 ticks over the real number of episodes
# (a step of 50 for 1000 episodes) instead of a hard-coded 0..1000 range.
tick_step = max(1, config.NUM_EPISODES // 20)
ax.set_xticks(np.arange(0, config.NUM_EPISODES, tick_step))

#-----save image of the plot--------#
# savefig does not create missing directories, so make sure the folder exists.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)

Cloning into 'Prova'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects:   3% (1/26)[Kremote: Counting objects:   7% (2/26)[Kremote: Counting objects:  11% (3/26)[Kremote: Counting objects:  15% (4/26)[Kremote: Counting objects:  19% (5/26)[Kremote: Counting objects:  23% (6/26)[Kremote: Counting objects:  26% (7/26)[Kremote: Counting objects:  30% (8/26)[Kremote: Counting objects:  34% (9/26)[Kremote: Counting objects:  38% (10/26)[Kremote: Counting objects:  42% (11/26)[Kremote: Counting objects:  46% (12/26)[Kremote: Counting objects:  50% (13/26)[Kremote: Counting objects:  53% (14/26)[Kremote: Counting objects:  57% (15/26)[Kremote: Counting objects:  61% (16/26)[Kremote: Counting objects:  65% (17/26)[Kremote: Counting objects:  69% (18/26)[Kremote: Counting objects:  73% (19/26)[Kremote: Counting objects:  76% (20/26)[Kremote: Counting objects:  80% (21/26)[Kremote: Counting objects:  84% (22/26)[Kremote: Counting ob

RuntimeError: ignored