## Install libraries

In [1]:
# The code below uses the gym 0.21 API (4-tuple `env.step`, `env.render(mode='ansi')`),
# so gym is installed once, already pinned, instead of installing an unpinned
# release first and downgrading it afterwards.
# `%pip` targets the running kernel's environment; extras are quoted so the shell
# never treats the square brackets as a glob pattern.
%pip install cmake scipy
%pip install 'autorom[accept-rom-license]'
%pip install 'gym[atari,accept-rom-license]==0.21.0'

Collecting cmake
  Downloading cmake-3.24.1-py2.py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.3/23.3 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting ale-py~=0.7.5
  Downloading ale_py-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cmake, ale-py
Successfully installed ale-py-0.7.5 cmake-3.24.1
[0mCollecting autorom[accept-rom-license]
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l- \ | / - \ | / - \ | / - \ | / - done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) 

## Import the libraries

In [2]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
from time import sleep
import gym
import numpy as np
import random

## Setup the game environment

In [3]:
def get_env(env_name):
  """Create a gym environment by name and hand it back already reset.

  input: env_name -> string, the identifier given to ``gym.make``
  return: the environment object, after one call to its ``reset()``
  """
  environment = gym.make(env_name)
  # Start from a fresh initial state before anyone steps the environment.
  environment.reset()
  return environment

## Build the frames of the game until it is done

In [4]:
def frame_builder(env):
  """Play one episode with randomly sampled actions and record every step.

  The environment is not reset here: the episode continues from whatever state
  ``env`` is in when it is passed (``get_env`` returns it already reset).

  Input:
      env -> environment object providing ``action_space.sample()``, ``step()``
             and ``render()``
  Output:
      frames -> list of dictionaries, one per step taken, each with the keys
                'frame' (the ``render(mode='ansi')`` result), 'state', 'action'
                and 'reward'
  """
  # Show the starting position once, using the environment's default render mode.
  env.render()

  frames = []
  done = False
  while not done:
    # No policy is involved: every action is drawn from the action space.
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)

    # Store the rendering taken after the step together with the transition data.
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
    })
  return frames

In [5]:
# Taxi-v3: print the action and state spaces, then play one episode of random actions.
env_name = 'Taxi-v3'
env = get_env(env_name)
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))
frames = frame_builder(env)
# Show the first recorded step as the cell's output.
frames[0]

Action Space Discrete(6)
State Space Discrete(500)
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[34;1mB[0m: |
+---------+



{'frame': '+---------+\n|\x1b[35mR\x1b[0m: | : :G|\n| : | : : |\n| : : : : |\n| | : | : |\n|Y| :\x1b[43m \x1b[0m|\x1b[34;1mB\x1b[0m: |\n+---------+\n  (Dropoff)\n',
 'state': 452,
 'action': 5,
 'reward': -10}

In [6]:
# FrozenLake-v1: same inspection as for Taxi-v3, followed by one random episode.
env_name = 'FrozenLake-v1'
env = get_env(env_name)
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))
frames = frame_builder(env)

Action Space Discrete(4)
State Space Discrete(16)

[41mS[0mFFF
FHFH
FFFH
HFFG


## Frame visualization function
Visualize the recorded frames of the game, one step at a time.

In [7]:
def print_frames(frames):
  """Print each recorded frame followed by the details of its step.

  Input:
      frames -> iterable of dictionaries with the keys 'frame', 'state',
                'action' and 'reward'

  Prints, for every frame:
      the rendering, a 1-based timestep counter, the state, the action and
      the reward, then waits a tenth of a second before the next frame.
  """
  for index, record in enumerate(frames):
    timestep = index + 1
    rendering = record['frame']
    state, action, reward = record['state'], record['action'], record['reward']

    print(rendering)
    print(f"Timestep: {timestep}")
    print(f"State: {state}")
    print(f"Action: {action}")
    print(f"Reward: {reward}")
    # Short pause so the frames can be followed as they scroll by.
    sleep(.1)

## Implement Q-Learning

### Build the q-table

In [8]:
def q_table_train(env,alpha = 0.1,gamma = 0.6,epsilon = 0.1, decay_over=False, decay_factor=.1):
  """
  This function is for building the  q-table with trained weights and use the decay over 

  Input :
      alpha (float)-> the learning rate -> scaler
      gamma (float) -> the discount factor -> scaler
      epsilon (float) ->the epsilon-greedy action selection -> scaler

      decay_over -> Boolen varible
      decay_factor -> float to manage the speed of decaying

  Output : 
      q-table (list)
  """
  q_table = np.zeros([env.observation_space.n, env.action_space.n])

  for i in range(1, 100001): #100001
      if decay_over and (i %5000==0):
        alpha, gamma, epsilon = alpha*(1-alpha*decay_factor), gamma*(1-gamma*decay_factor), epsilon*(1-epsilon*decay_factor)
         
      state = env.reset()
      epochs, penalties, reward, = 0, 0, 0
      done = False
      # decay over episode
      while not done:
          if random.uniform(0, 1) < epsilon:
              action = env.action_space.sample() # Explore action space
          else:
              action = np.argmax(q_table[state]) # Exploit learned values

          next_state, reward, done, info = env.step(action) 
          
          old_value = q_table[state, action]
          next_max = np.max(q_table[next_state])
          
          new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
          q_table[state, action] = new_value

          if reward == -10:
              penalties += 1

          state = next_state
          epochs += 1
          
      if i % 100 == 0:
          clear_output(wait=True)
          print(f"Episode: {i}")
  return q_table

In [9]:
def model_evaluate(env, q_table, episodes=100, penalty_reward=-10):
    """
    Run the greedy policy of a Q-table and report average episode length and penalties.

    Each episode starts with ``env.reset()`` and continues until ``env.step``
    reports ``done``; this function imposes no step limit of its own.
    NOTE(review): a greedy policy that never reaches a terminal state only stops
    if the environment itself ends the episode - confirm the environments used
    here enforce a step limit.

    Input:
        env (object type) -> environment with ``reset()``, ``step()`` and ``render(mode='ansi')``
        q_table -> table indexed as ``q_table[state]``, giving one value per action
        episodes (int) -> number of evaluation episodes (default: 100)
        penalty_reward -> a step whose reward equals this value counts as one penalty
                          (default: -10)

    Output:
        frames (list) -> one dictionary per step over all episodes, with the keys
                         'frame', 'state', 'action' and 'reward'
        AVG_timesteps (float) -> the average number of steps per episode
        AVG_penalities (float) -> the average number of penalties per episode

    Raises:
        ValueError -> if ``episodes`` is not a positive number
    """
    if episodes <= 0:
        raise ValueError("episodes must be a positive integer")

    frames = []
    total_epochs, total_penalties = 0, 0
    for _ in range(episodes):
        state = env.reset()
        epochs, penalties = 0, 0
        done = False

        while not done:
            # Always exploit: take the action with the highest learned value.
            action = np.argmax(q_table[state])
            state, reward, done, info = env.step(action)
            if reward == penalty_reward:
                penalties += 1
            frames.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward,
            })
            epochs += 1

        total_penalties += penalties
        total_epochs += epochs

    AVG_timesteps = total_epochs / episodes
    AVG_penalities = total_penalties / episodes
    return frames, AVG_timesteps, AVG_penalities

## Hyperparameters

In [10]:
# Baseline values passed to q_table_train in the two cells that follow.
# Learning rate.
alpha = 0.6
# Discount factor.
gamma = 0.9
# Probability of taking a random (exploratory) action.
epsilon = 0.1

## Q-table

In [11]:
# Taxi-v3 baseline: train with the hyperparameters set above (no decay), then evaluate.
env_name = 'Taxi-v3'
env = get_env(env_name)
q_table=q_table_train(env,alpha =alpha,gamma = gamma,epsilon = epsilon)
frames, AVG_timesteps, AVG_penalities= model_evaluate(env, q_table)
# Average steps and average penalties per evaluation episode, followed by the learned table.
print(AVG_timesteps, AVG_penalities)
print(q_table)

Episode: 100000
13.45 0.0
[[ 0.          0.          0.          0.          0.          0.        ]
 [-0.58568212  0.4603532  -0.58568212  0.4603532   1.62261467 -8.5396468 ]
 [ 4.348907    5.94323     4.348907    5.94323     7.7147     -3.05677   ]
 ...
 [ 6.18893021  9.683       4.16647773  5.94323    -2.84744232 -5.44346971]
 [ 1.26584249  1.82864657  1.47714114  2.9140163  -7.36422611 -7.3983982 ]
 [14.3        11.87       14.3        17.          5.3         5.3       ]]


In [12]:
# FrozenLake-v1 baseline: same procedure and hyperparameters as the Taxi-v3 run.
env_name = 'FrozenLake-v1'
env = get_env(env_name)
q_table=q_table_train(env,alpha = alpha,gamma = gamma,epsilon = epsilon)
frames, AVG_timesteps, AVG_penalities= model_evaluate(env, q_table)
# model_evaluate counts a penalty only for rewards equal to -10.
print(AVG_timesteps, AVG_penalities)
print(q_table)

Episode: 100000
21.73 0.0
[[5.30275837e-02 5.51533143e-02 1.04059625e-01 5.49307576e-02]
 [4.67373232e-02 3.14566715e-02 4.09543270e-02 5.56550216e-02]
 [4.79558628e-02 7.56489488e-02 4.35598619e-02 4.79610740e-02]
 [5.25942579e-02 1.65502396e-02 3.61400780e-03 5.12265364e-02]
 [1.76137806e-01 6.33375497e-02 2.90541563e-02 8.50380903e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.11601856e-02 9.89641732e-03 4.81614067e-03 4.28259370e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.93393793e-02 3.05311867e-02 2.01252009e-02 2.52998530e-01]
 [1.82366599e-02 3.85692366e-01 3.93998739e-02 2.31683596e-01]
 [1.62414697e-01 2.00000404e-01 4.56866874e-02 2.19267989e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.19920636e-01 8.08887423e-02 4.39904713e-01 1.93267375e-01]
 [4.55148622e-01 7.59445236e-01 4.31351503e-01 5.06446052e-01]
 [0.00000000e+00 0.00000000e+

## Train model function

In [13]:
def train_model(env_name="Taxi-v3", alpha_para = 0.1, gamma_para = 0.6, epsilon_para = 0.1,decay_over=False, decay_factor=.1):
  """Train a Q-table on the named game and evaluate the resulting policy.

  Input:
    env_name (String): the game name, used to create the environment
    alpha_para (float), gamma_para (float), epsilon_para (float):
        forwarded to the training step as alpha, gamma and epsilon

    decay_over (boolean) -> whether the training step applies decay over episodes
    decay_factor (float): factor used by the decay equation,
        parameter * (1 - parameter * decay_factor)

  Output:
    frames (list): list of frames recorded during evaluation
    AVG_timesteps (float)-> the average time steps
    AVG_penalities (float)-> the average penalties
  """
  environment = get_env(env_name)

  # Training and evaluation share the same environment object.
  learned_table = q_table_train(
      environment,
      alpha=alpha_para,
      gamma=gamma_para,
      epsilon=epsilon_para,
      decay_over=decay_over,
      decay_factor=decay_factor,
  )
  recorded_frames, mean_timesteps, mean_penalties = model_evaluate(environment, learned_table)

  return recorded_frames, mean_timesteps, mean_penalties

## 2) Tune alpha, gamma, and/or epsilon using a decay over episodes

In [14]:
# Taxi-v3 with decay enabled, starting from a high exploration rate (epsilon = 0.9).
env_name = 'Taxi-v3'
frames, AVG_timesteps, AVG_penalities = train_model(env_name, alpha_para = 0.1, gamma_para = 0.6, epsilon_para = 0.9,decay_over=True,decay_factor=.1)
print(f"Average timesteps per episode: {AVG_timesteps}")
print(f"Average penalties per episode: {AVG_penalities}")

Episode: 100000
Average timesteps per episode: 25.71
Average penalties per episode: 0.0


In [15]:
# Replay the last five steps recorded during the evaluation above.
print_frames(frames[-5:])

+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

Timestep: 1
State: 14
Action: 3
Reward: -1
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

Timestep: 2
State: 14
Action: 3
Reward: -1
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

Timestep: 3
State: 14
Action: 3
Reward: -1
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

Timestep: 4
State: 14
Action: 3
Reward: -1
+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)

Timestep: 5
State: 14
Action: 3
Reward: -1


In [16]:
# FrozenLake-v1 with decay enabled, using the same settings as the Taxi-v3 decay run.
env_name = 'FrozenLake-v1'
frames,AVG_timesteps, AVG_penalities = train_model(env_name, alpha_para = 0.1, gamma_para = 0.6, epsilon_para = 0.9,decay_over=True,decay_factor=.1)
print(f"Average timesteps per episode: {AVG_timesteps}")
print(f"Average penalties per episode: {AVG_penalities}")

Episode: 100000
Average timesteps per episode: 5.85
Average penalties per episode: 0.0


In [17]:
# Replay the last five steps recorded during the evaluation above.
print_frames(frames[-5:])

  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

Timestep: 1
State: 13
Action: 1
Reward: 0.0
  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG

Timestep: 2
State: 12
Action: 1
Reward: 0.0
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG

Timestep: 3
State: 1
Action: 1
Reward: 0.0
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

Timestep: 4
State: 1
Action: 2
Reward: 0.0
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG

Timestep: 5
State: 5
Action: 2
Reward: 0.0


## 3) Implement a grid search to discover the best hyperparameters

In [18]:
def grid_search(env_name="Taxi-v3",parameters={'alpha':[0.9],'gamma':[0.9],'epsilon':[.9]},decay_over=False,decay_factor=.1):
  """
  Try every combination of the given hyperparameter values and keep the best one.

  Candidates are ranked by average penalties first and by average timesteps
  second: fewer penalties always wins, and timesteps only decide between
  candidates with the same penalty average. On an exact tie the candidate
  evaluated later replaces the earlier one.

  Input:
      env_name (string) -> Game name
      parameters (dict) -> Dictionary of lists for each parameter; Example:{'alpha':[0.9],'gamma':[0.9],'epsilon':[.9]}
                           (the default dictionary is only read, never modified)

      decay_over (boolean) -> to apply the decay technique or not
      decay_factor (float) -> factor of the decay equation, parameter*(1-parameter*decay_factor)

  Output:
      best_params (dict) -> the best parameters ({} if no combination was evaluated)
      best_AVGtime (float) -> average timesteps of the best combination
      best_AVGpenalties (float) -> average penalties of the best combination
      best_frame (list) -> frames recorded while evaluating the best combination
                           (None if no combination was evaluated)
  """
  from itertools import product

  # Sentinels larger than any realistic result, so the first candidate is always accepted.
  best_AVGtime, best_AVGpenalties = 999999, 999999
  best_frame = None
  best_params = {}

  # product() iterates with alpha outermost and epsilon innermost.
  combinations = product(parameters['alpha'], parameters['gamma'], parameters['epsilon'])
  for alpha, gamma, epsilon in combinations:
    frames, AVG_timesteps, AVG_penalities = train_model(env_name, alpha_para = alpha, gamma_para = gamma, epsilon_para = epsilon,decay_over=decay_over,decay_factor=decay_factor)

    # Lexicographic comparison. Requiring both metrics to be no worse at the
    # same time would wrongly reject a candidate that has fewer penalties but
    # needs more timesteps.
    if (AVG_penalities, AVG_timesteps) <= (best_AVGpenalties, best_AVGtime):
      best_AVGtime, best_AVGpenalties = AVG_timesteps, AVG_penalities
      best_params = {'alpha':alpha,'gamma':gamma,'epsilon':epsilon}
      best_frame = frames

  return best_params, best_AVGtime ,best_AVGpenalties, best_frame

In [19]:
# FrozenLake-v1: search 3 x 3 x 3 = 27 combinations; grid_search runs one full
# train_model pass (training plus evaluation) for each of them.
env_name = "FrozenLake-v1"
params = {'alpha':[0.9,0.6,0.3],'gamma':[0.9,0.6,0.3],'epsilon':[0.9,0.6,0.3]} #[0.9,0.6,0.3]
best_params, best_AVGtime ,best_AVGpenalties, best_frame = grid_search(env_name=env_name,parameters=params,decay_over=False,decay_factor=.1)


Episode: 100000


In [20]:
# Report the combination selected by the grid search and its evaluation averages.
print('Best_parameters:', best_params)
print('Average timesteps per episode:', best_AVGtime)
print('Average penalties per episode:', best_AVGpenalties)

Best_parameters: {'alpha': 0.3, 'gamma': 0.3, 'epsilon': 0.6}
Average timesteps per episode: 4.87
Average penalties per episode: 0.0


In [21]:
# Replay the last five evaluation steps recorded for the selected combination.
print_frames(best_frame[-5:])

  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG

Timestep: 1
State: 2
Action: 2
Reward: 0.0
  (Left)
S[41mF[0mFF
FHFH
FFFH
HFFG

Timestep: 2
State: 1
Action: 0
Reward: 0.0
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

Timestep: 3
State: 1
Action: 2
Reward: 0.0
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG

Timestep: 4
State: 1
Action: 2
Reward: 0.0
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG

Timestep: 5
State: 5
Action: 2
Reward: 0.0


In [22]:
# Taxi-v3: the same 27-combination grid as for FrozenLake-v1, again without decay.
env_name = "Taxi-v3"
params = {'alpha':[0.9,0.6,0.3],'gamma':[0.9,0.6,0.3],'epsilon':[0.9,0.6,0.3]}
best_params, best_AVGtime ,best_AVGpenalties, best_frame = grid_search(env_name=env_name,parameters=params,decay_over=False,decay_factor=.1)

Episode: 100000


In [23]:
# Report the combination selected by the grid search and its evaluation averages.
print('Best_parameters:', best_params)
print('Average timesteps per episode:', best_AVGtime)
print('Average penalties per episode:', best_AVGpenalties)

Best_parameters: {'alpha': 0.9, 'gamma': 0.3, 'epsilon': 0.9}
Average timesteps per episode: 12.36
Average penalties per episode: 0.0


In [24]:
# Replay the last five evaluation steps recorded for the selected combination.
print_frames(best_frame[-5:])

+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)

Timestep: 1
State: 236
Action: 3
Reward: -1
+---------+
|[35mR[0m: | : :G|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Timestep: 2
State: 136
Action: 1
Reward: -1
+---------+
|[35mR[0m:[42m_[0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Timestep: 3
State: 36
Action: 1
Reward: -1
+---------+
|[35m[42mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)

Timestep: 4
State: 16
Action: 3
Reward: -1
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 5
State: 0
Action: 5
Reward: 20
