# Q-Learning implementation with NumPy

## Import all the necessary libraries

In [1]:
import numpy as np

## Set the discount factor (gamma) and the learning rate (alpha)

In [2]:
# Discount factor: scales the best Q-value of the next state inside the
# temporal-difference target computed in get_optimal_route.
gamma = 0.75

In [3]:
# Learning rate: fraction of the temporal difference that is added to the
# Q-value on each update in get_optimal_route.
alpha = 0.9

## Define the states: map the various locations (L1 to L9) to numeric states (0 to 8)

In [4]:
# Location names 'L1'..'L9' mapped, in order, to the state indices 0..8
# that are used to index the rewards table and the Q-table.
locationToState = {f'L{index + 1}': index for index in range(9)}

## Define the actions (the transitions between states s to s')

In [5]:
# One action per state index: choosing action ``a`` means moving to state ``a``.
actions = list(range(9))

## Define the Rewards table

In [6]:
# 9x9 reward table: entry [s, t] is 1 when state t can be reached directly
# from state s and 0 otherwise. Each set below lists the directly reachable
# state indices for one source state, in state order.
# NOTE(review): L5 -> L2 is allowed but L2 -> L5 is not, so the table is not
# symmetric; confirm that this one-way link is intended.
rewards = np.array([
    [1 if target in reachable else 0 for target in range(9)]
    for reachable in (
        {1},        # L1 -> L2
        {0, 2},     # L2 -> L1, L3
        {1, 5},     # L3 -> L2, L6
        {6},        # L4 -> L7
        {1, 7},     # L5 -> L2, L8
        {2},        # L6 -> L3
        {3, 7},     # L7 -> L4, L8
        {4, 6, 8},  # L8 -> L5, L7, L9
        {7},        # L9 -> L8
    )
])

## Map state indices back to locations

In [7]:
# Reverse lookup of locationToState: state index -> location name.
stateToLocation = dict(zip(locationToState.values(), locationToState.keys()))

## Define a function to get the optimum path from a start location to a destination

In [8]:
def get_optimal_route(startLocation, destination, iterations=1000):
    """Learn a Q-table for ``destination`` and return the greedy route to it.

    A copy of the module-level ``rewards`` table is given a large reward on
    the destination's self-transition, a Q-table is trained with randomly
    sampled state/action pairs, and the route is then read off by repeatedly
    taking the playable action with the highest Q-value.

    Args:
        startLocation: Name of the starting location (a key of
            ``locationToState``).
        destination: Name of the target location (a key of
            ``locationToState``).
        iterations: Number of random Q-learning updates to perform.

    Returns:
        List of location names from ``startLocation`` to ``destination``,
        both included. ``[startLocation]`` when the two are the same.

    Raises:
        KeyError: If either location name is not in ``locationToState``.
        ValueError: If the learned policy cannot reach ``destination`` from
            ``startLocation`` (dead end or cycle), instead of looping forever.
    """
    # Resolve both endpoints first so an unknown name fails before training.
    startState = locationToState[startLocation]
    endState = locationToState[destination]

    # Work on a copy so the shared rewards table is left untouched; the large
    # self-transition reward makes the destination the most attractive state.
    newRewards = np.copy(rewards)
    newRewards[endState, endState] = 999

    # The playable actions of a state do not change during training, so they
    # are computed once instead of on every iteration.
    numStates = len(locationToState)
    playableActions = [
        [action for action in actions if newRewards[state, action] > 0]
        for state in range(numStates)
    ]

    # Initialize Q-table with the same dimensions as the rewards table
    Q = np.zeros(newRewards.shape)

    # Q-learning process
    for _ in range(iterations):
        currentState = np.random.randint(0, numStates)
        moves = playableActions[currentState]
        if not moves:
            # A state without outgoing transitions has nothing to learn.
            continue
        nextState = np.random.choice(moves)

        # Calculate the temporal difference (TD)
        TD = (newRewards[currentState, nextState]
              + gamma * np.max(Q[nextState])
              - Q[currentState, nextState])

        # Update the Q-value
        Q[currentState, nextState] += alpha * TD

    # Compute the optimal route by following the greedy policy
    route = [startLocation]
    visited = {startState}
    currentState = startState

    while currentState != endState:
        moves = playableActions[currentState]
        if not moves:
            raise ValueError(
                f"No route found from {startLocation!r} to {destination!r}"
            )

        # Only compare playable transitions so every step is a legal move,
        # even for a state whose Q-values were never updated.
        nextState = moves[int(np.argmax(Q[currentState, moves]))]

        # The greedy policy is deterministic, so coming back to a state that
        # was already visited means the walk would cycle forever.
        if nextState in visited:
            raise ValueError(
                f"No route found from {startLocation!r} to {destination!r}"
            )

        visited.add(nextState)
        route.append(stateToLocation[nextState])
        currentState = nextState

    return route

## Test the above function with sample locations

In [9]:
# Learn and print the route from L9 to L1.
route = get_optimal_route('L9', 'L1')
print(f"Optimal Route: {route}")


Optimal Route: ['L9', 'L8', 'L5', 'L2', 'L1']


In [10]:
# Learn and print the route from L7 to L1.
route = get_optimal_route('L7', 'L1')
print(f"Optimal Route: {route}")

Optimal Route: ['L7', 'L8', 'L5', 'L2', 'L1']


In [11]:
# Learn and print the route from L7 to L3.
route = get_optimal_route('L7', 'L3')
print(f"Optimal Route: {route}")

Optimal Route: ['L7', 'L8', 'L5', 'L2', 'L3']
