In [1]:
import gym
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

#for text processing
import spacy
import re
import pandas as pd
env = gym.make("Taxi-v3").env

env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



#### There are 4 locations (labeled by different letters), and our job is to pick up the passenger at one location and drop him off at another. We receive +20 points for a successful drop-off and lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions."

### Fetching Origing, Destination, and Time of Pickup from the sms data 

#  As NLTK/Spacy depend on parts of speech tagging to identify location; the location names which don’t start with capital letter or which contains both hindi and english letters in it are not identified. </font>

## For Example - (Dwarka/Hauz Khaz are Hindi/Urdu words) . These type of words are not trained under the Spacy NLP Model.</font>

## To deal with this situation I tried to train a Blank Spacy NER Recognizer (CustomSpacy.ipynb) that will recognize the word like Dwarka,Hauz-Khaz as LOCATION.
<a href="./CustomSpacy.ipynb" >You can find the .ipynb file here </a> 

# * Importing Custom Spacy Model as nlp2 

In [2]:
from pathlib import Path
output_dir=Path(".\Models")
nlp2 = spacy.load(output_dir)

In [3]:
docx = nlp2('I want to go to dwarka sector 23 from airport leaving at 7 PM')
print("Entities", [(ent.text, ent.label_) for ent in docx.ents])

Entities [('dwarka sector 23', 'LOCATION'), ('airport leaving', 'LOCATION'), ('7 PM', 'TIME')]


In [4]:
def fetch_pickup_drop(text):
    
    #Write your code here
    #Extracting Entities LOCATION & TIME and doing little bit of manual cleaning
    x=[]
    orign,destination,time_of_pickup='','',''
    time_of_pickup=''
    #3text=text.lower()
    text=text.replace("\n", " ")
    docx = nlp2(text)
    for text1 in docx.ents:
        
        if text1.label_=='LOCATION':
            text1 = text1.text
            if 'airport' in text1:
                text_len = len(text1)
                if((text_len)>7):
                    text1 = text1[0:7]
            
            if (re.search('from\s'+text1+'',text) or re.search(''+text1+'\sto',text)):
                origin=text1
                
            else:
                pass
                
            if (re.search('to\s'+text1+'',text) or re.search('for\s'+text1+'',text)):
                destination=text1
                
            else:
                pass
                
            #x.append(text)
        else:
            #print(time_of_pickup)
            time_of_pickup = text1
    return [origin, destination, time_of_pickup]

In [146]:
fetch_pickup_drop('Kindly book a cab for me at 1 PM from hauz khaas to dwarka sector 23')

['hauz khaas', 'dwarka sector 23', 1 PM]

In [147]:
env.reset() # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


## Summing up the Q-Learning Process
Breaking it down into steps, we get

Initialize the Q-table by all zeros.

Start exploring actions: 

For each state, select any one among all possible actions for the current state (S).

Travel to the next state (S') as a result of that action (a).

For all possible actions from the state (S') select the one with the highest Q-value.

Update Q-table values using the equation.

Set the next state as the current state.

If goal state is reached, then end and repeat the process.


## Exploiting learned values
After enough random exploration of actions, the Q-values tend to converge serving our agent as an action-value function which it can exploit to pick the most optimal action from a given state.

There's a tradeoff between exploration (choosing a random action) and exploitation (choosing actions based on already learned Q-values). We want to prevent the action from always taking the same route, and possibly overfitting, so we'll be introducing another parameter called ϵ "epsilon" to cater to this during training.

Instead of just selecting the best learned Q-value action, we'll sometimes favor exploring the action space further. Lower epsilon value results in episodes with more penalties (on average) which is obvious because we are exploring and making random decisions.

In [148]:
#Initialize Q_table
import numpy as np
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [149]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

frames = [] # for animation
# For plotting metrics
all_epochs = []
all_penalties = []

##Write your code here
for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        
         # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
        )
    
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

np.save("./q_table.npy", q_table)

Episode: 100000
Training finished.

Wall time: 1min 9s


## Visualization 

In [150]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

Timestep: 103343
State: 214
Action: 1
Reward: -1


KeyboardInterrupt: 

In [151]:
#Load trained q_table for evaluation

q_table = np.load("./q_table.npy")

In [152]:
def create_loc_dict(city_df):
    loc_dict = {}
    
    ## Create dictionary example, loc_dict['dwarka sector 23] = 0  
    for  index,row in city_df.iterrows():
        
        loc_dict[row['location']] =  row['mapping']
        
    return loc_dict

In [153]:
def check_pick_up_drop_correction(pick_up, drop, line_num):
    #write your code here
        org_df = pd.read_csv("./org_df.csv")
        original_origin = org_df.iloc[line_num]['origin']
        original_destination = org_df.iloc[line_num]['dest']
#         print('original_origin :',original_origin)
#         print('pick_up :',pick_up)
#         print('Original Destination :',original_destination)
#         print('drop :',drop)
        #print('------------            -------------------')
        if original_origin == pick_up and original_destination == drop:
            return True
        else:
            print('original_origin :',original_origin)
            print('pick_up :',pick_up)
            print('Original Destination :',original_destination)
            print('drop :',drop)
            return False

In [154]:
"""Evaluate agent's performance after Q-learning"""

# 1) We need to take text drom "sms.txt" and fetch pickup and drop from it.
# 2) Generate the random state from an enviroment and change the pick up and drop as the fetched one
# 3) Evaluate you q_table performance on all the texts given in sms.txt.
# 4) Have a check if the fetched pickup, drop is not matching with original pickup, drop using orig.csv
# 5) If fetched pickup or/and drop does not match with the original, add penality and reward -10
# 6) Calculate the Total reward, penalities, Wrong pickup/drop predicted and Average time steps per episode.

total_epochs, total_penalties, total_reward, wrong_predictions = 0, 0, 0, 0


count = 0
time_list = []
f = open("./sms.txt", "r")
num_of_lines = 1000
city = pd.read_csv("./city.csv")
frames = [] # for animation
loc_dict = create_loc_dict(city)
line_num = 0
for line in f:
    lines=fetch_pickup_drop(line)
    pick_up=lines[0]
    drop=lines[1]
    decision=check_pick_up_drop_correction(pick_up, drop, line_num)
    
    if not decision:
        print(decision)
        total_penalties +=1
        reward = -10
        total_reward += reward
        wrong_predictions += 1
    pickUp_idx = loc_dict[pick_up]
    drop_idx = loc_dict[drop]

    act_state = env.reset()
    #print('Actual State --',act_state)
    taxi_row,taxi_col,pick_up,drop = env.decode(act_state)
    state = env.encode(taxi_row,taxi_col,int(pickUp_idx),int(drop_idx))
    #print('New State Generated -- ',state)
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
            action = np.argmax(q_table[state])
            #print('action--',action)
            state, reward, done, info = env.step(action)
            #print('Reward -',reward)
            if reward == -10:
                penalties += 1
            
            total_reward += reward
            # Put each rendered frame into dict for animation
            frames.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
                }
            )
            
            epochs += 1

    total_penalties += penalties
    #wrong_predictions += penalties
    total_epochs += epochs
    
    line_num +=1


""" Printing Summary """

print(f"Results after {num_of_lines} episodes:")
print(f"Average timesteps per episode: {total_epochs / num_of_lines}")
print(f"Average penalties per episode: {total_penalties / num_of_lines}")
print(f"Total number of wrong predictions", wrong_predictions)
print("Total Reward is", total_reward)

Results after 1000 episodes:
Average timesteps per episode: 13.76
Average penalties per episode: 0.045
Total number of wrong predictions 0
Total Reward is 6835


##  Visualization 

In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
print_frames(frames)