In [1]:
import gym
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

#for text processing
import spacy
import re
import pandas as pd
# Build the Taxi-v3 environment and keep the object exposed by its `.env`
# attribute.
# NOTE(review): presumably `.env` unwraps gym's time-limit wrapper so that
# episodes only end on a successful drop-off — confirm against the installed
# gym version.
env = gym.make("Taxi-v3").env

# Draw the current grid (the rendered output follows this cell).
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



#### There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop him off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.

### Fetching Origin, Destination, and Time of Pickup from the SMS data

In [78]:
def fetch_pickup_drop(text, locations=None):
    """Extract the pickup and drop-off location names from one SMS text.

    The text is split on the literal " to ". A fragment that contains "from"
    is treated as describing the origin; any other fragment is treated as
    describing the destination. Within a fragment, the longest known location
    name that occurs as a substring is chosen, so that a short key such as
    "dwarka sector 2" can never shadow "dwarka sector 23" merely because of
    its position in the dictionary.

    Parameters
    ----------
    text : str
        Raw SMS text.
    locations : collection of str, optional
        Known location names (passing a dict uses its keys). When omitted,
        the notebook-level ``loc_dict`` is used, which must then already be
        defined.

    Returns
    -------
    list
        ``[origin, destination, time_of_pickup]``. A field is ``''`` when
        nothing matched. ``time_of_pickup`` is always ``''``: no time
        extraction is implemented here.
    """
    if locations is None:
        # Backward-compatible fallback to the notebook global.
        locations = loc_dict
    known_names = list(locations)

    origin = ''
    destination = ''
    time_of_pickup = ''

    for part in re.split(r' to ', text):
        best_match = None
        for name in known_names:
            # `>=` keeps the original "last match wins" rule among names of
            # equal length while always preferring a longer name.
            if name in part and (best_match is None or len(name) >= len(best_match)):
                best_match = name

        if best_match is None:
            continue
        if 'from' in part:
            origin = best_match
        else:
            destination = best_match

    return [origin, destination, time_of_pickup]


In [2]:
# Put the environment into a fresh state and draw it.
env.reset() # reset environment to a new, random state
env.render()

# Show the action and observation spaces; the recorded output below reports
# Discrete(6) actions and Discrete(500) states.
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


## Summing up the Q-Learning Process
Breaking it down into steps, we get

Initialize the Q-table by all zeros.

Start exploring actions: 

For each state, select any one among all possible actions for the current state (S).

Travel to the next state (S') as a result of that action (a).

For all possible actions from the state (S') select the one with the highest Q-value.

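Update Q-table values using the equation $Q(S, a) \leftarrow (1 - \alpha)\,Q(S, a) + \alpha\,\bigl(r + \gamma \max_{a'} Q(S', a')\bigr)$, where $\alpha$ is the learning rate, $\gamma$ is the discount factor and $r$ is the reward received for taking action $a$.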

Set the next state as the current state.

If goal state is reached, then end and repeat the process.


## Exploiting learned values
After enough random exploration of actions, the Q-values tend to converge serving our agent as an action-value function which it can exploit to pick the most optimal action from a given state.

There's a tradeoff between exploration (choosing a random action) and exploitation (choosing actions based on already learned Q-values). We want to prevent the agent from always taking the same route, and possibly overfitting, so we'll be introducing another parameter called ϵ "epsilon" to cater to this during training.

Instead of just selecting the best learned Q-value action, we'll sometimes favor exploring the action space further. A higher epsilon value results in episodes with more penalties (on average), which is expected because we are exploring more often and making random decisions.

In [79]:
# Q-table: one row per environment state, one column per action, all zeros.
import numpy as np

q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n)
)

In [80]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

##Write your code here
for i in range(1, 100001):
    state = env.reset()
    
    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()   # Explore action space
        else:
            action = np.argmax(q_table[state])   # Exploit learned values
            
        next_state, reward, done, info = env.step(action)
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value
        
        if reward == -10:
            penalties += 1
        
        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")
        

np.save("./q_table.npy", q_table)

Episode: 100000
Wall time: 1min 32s


In [83]:
# Load the trained Q-table from ./q_table.npy (the file written by np.save at
# the end of the training cell) for evaluation.

q_table = np.load("./q_table.npy")

In [84]:
def create_loc_dict(city_df):
    """Build a lookup from location name to its value in the 'mapping' column.

    Rows are paired positionally, so the result does not depend on the
    DataFrame's index labels (a filtered or re-indexed frame works too).
    If a location name occurs more than once, the last row wins.

    Parameters
    ----------
    city_df : pandas.DataFrame
        Must contain the columns ``'location'`` and ``'mapping'``.

    Returns
    -------
    dict
        ``{location: mapping}``; empty when ``city_df`` has no rows.
    """
    # Example entry: loc_dict['dwarka sector 23'] = 0
    return dict(zip(city_df['location'], city_df['mapping']))

In [85]:
def check_pick_up_drop_correction(pick_up, drop, line_num, orig_df=None):
    """Check a fetched pickup/drop pair against the reference data.

    Parameters
    ----------
    pick_up : str
        Fetched origin name.
    drop : str
        Fetched destination name.
    line_num : int
        Index label of the reference row to compare against.
    orig_df : pandas.DataFrame, optional
        Reference table with ``'origin'`` and ``'dest'`` columns. When
        omitted, ``./org_df.csv`` is read on every call; callers that check
        many lines should load the table once and pass it in.

    Returns
    -------
    bool
        True only if both the origin and the destination match the
        reference row.
    """
    if orig_df is None:
        orig_df = pd.read_csv("./org_df.csv")

    original_origin = orig_df.loc[line_num, 'origin']
    original_destination = orig_df.loc[line_num, 'dest']

    # bool() normalises numpy booleans to a plain Python bool.
    return bool(original_origin == pick_up and original_destination == drop)

In [86]:
"""Evaluate agent's performance after Q-learning"""

# 1) Take each text from "sms.txt" and fetch the pickup and drop from it.
# 2) Build the start state from the fetched pickup and drop.
#    NOTE(review): the taxi start passed to env.encode is fixed at (3, 1)
#    rather than taken from the random reset state — confirm this is intended.
# 3) Evaluate the q_table's performance on all the texts given in sms.txt.
# 4) Check the fetched pickup/drop against the original pickup/drop stored in
#    org_df.csv.
# 5) If the fetched pickup and/or drop does not match the original, count one
#    penalty and a reward of -10 for that message.
# 6) Report the total reward, penalties, wrong pickup/drop predictions and the
#    average time steps per episode.

total_epochs, total_penalties, total_reward, wrong_predictions = 0, 0, 0, 0


count = 0
time_list = []
city = pd.read_csv("./city.csv")
loc_dict = create_loc_dict(city)
line_num = 0

# The context manager guarantees sms.txt is closed even if an episode raises.
with open("./sms.txt", "r") as f:
    for line in f:
        state = env.reset()

        epochs, penalties, reward = 0, 0, 0
        episode_reward = 0  # sum of every step reward in this episode

        done = False

        fetched_origin, fetched_destination, fetched_pickup_time = fetch_pickup_drop(line)
        isCorrect = check_pick_up_drop_correction(fetched_origin, fetched_destination, line_num)

        if isCorrect:
            origin_mapping = loc_dict[fetched_origin.strip()]
            destination_mapping = loc_dict[fetched_destination.strip()]
            # Presumably the arguments are (taxi_row, taxi_col, passenger
            # location, destination) — confirm against gym's Taxi-v3 encode.
            state = env.encode(3, 1, origin_mapping, destination_mapping)
            env.s = state

            while not done:
                # Purely greedy policy: no exploration during evaluation.
                action = np.argmax(q_table[state])

                state, reward, done, info = env.step(action)
                episode_reward += reward

                if reward == -10:
                    penalties += 1

                epochs += 1
        else:
            wrong_predictions += 1
            reward = -10
            episode_reward = reward
            penalties += 1

        total_epochs += epochs
        total_penalties += penalties

        # Add the whole episode's reward, not just the last step's reward.
        total_reward += episode_reward
        line_num += 1

# Average over the messages actually processed instead of a hard-coded count.
num_of_lines = line_num
episodes_for_average = max(num_of_lines, 1)  # avoid dividing by zero on an empty file

print(f"Results after {num_of_lines} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes_for_average}")
print(f"Average penalties per episode: {total_penalties / episodes_for_average}")
print(f"Total number of wrong predictions", wrong_predictions)
print("Total Reward is", total_reward)

FileNotFoundError: [Errno 2] No such file or directory: './sms.txt'