# Case Study: Q-Learning


# Task 1: Problem Statement
## Optimizing Warehouse Flows with Q-Learning


In [3]:
# import the libraries
import numpy as np
import pandas as pd

In [4]:
# Setting the parameters gamma and alpha for the Q-Learning
# gamma weights the best Q-value of the next state inside the temporal
# difference; alpha scales how much of that difference is added to Q.
gamma, alpha = 0.75, 0.9

In [5]:
# PART 1 - DEFINING THE ENVIRONMENT
# Defining the states
# Locations 'A' through 'L' are numbered 0 through 11 in alphabetical order.
location_to_state = {name: index for index, name in enumerate('ABCDEFGHIJKL')}


In [6]:
# Defining the actions
# One action per state index (0 through 11).
actions = list(range(12))

In [7]:
# Defining the rewards
# Rows are the current state and columns the next state, both in A..L order.
# Every listed pair is a two-way connection rewarded with 1 in each direction.
R = np.zeros((12, 12), dtype=int)
for _pair in ('AB', 'BC', 'BF', 'CG', 'DH', 'EI',
              'FJ', 'GH', 'HL', 'IJ', 'JK', 'KL'):
    _i, _j = (ord(_ch) - ord('A') for _ch in _pair)
    R[_i, _j] = R[_j, _i] = 1
# Staying at G (state 6) carries the single large reward of 1000.
R[6, 6] = 1000
# Drop the loop temporaries so only R is left behind by this cell.
del _pair, _i, _j

In [8]:
def print_rewards(data):
    """Print `data` as a table whose rows and columns are labelled A to L.

    `data` is handed straight to `pandas.DataFrame` with twelve row labels
    and twelve column labels, so it must be a 12x12 table-like object.
    Nothing is returned; the table is written to standard output.
    """
    labels = list('ABCDEFGHIJKL')
    print(pd.DataFrame(data, index=labels, columns=labels))

# Display the reward matrix as a table labelled with the location letters.
print_rewards(R)

   A  B  C  D  E  F     G  H  I  J  K  L
A  0  1  0  0  0  0     0  0  0  0  0  0
B  1  0  1  0  0  1     0  0  0  0  0  0
C  0  1  0  0  0  0     1  0  0  0  0  0
D  0  0  0  0  0  0     0  1  0  0  0  0
E  0  0  0  0  0  0     0  0  1  0  0  0
F  0  1  0  0  0  0     0  0  0  1  0  0
G  0  0  1  0  0  0  1000  1  0  0  0  0
H  0  0  0  1  0  0     1  0  0  0  0  1
I  0  0  0  0  1  0     0  0  0  1  0  0
J  0  0  0  0  0  1     0  0  1  0  1  0
K  0  0  0  0  0  0     0  0  0  1  0  1
L  0  0  0  0  0  0     0  1  0  0  1  0


In [9]:
# PART 2 - BUILDING THE AI SOLUTION WITH Q-LEARNING
# Initializing the Q-Values to create the original qtable
Q = np.array(np.zeros([12,12]))

# Display the Q-table with the same A-L labelled layout.
print_rewards(Q)

     A    B    C    D    E    F    G    H    I    J    K    L
A  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
B  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
C  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
D  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
E  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
F  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
G  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
H  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
I  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
J  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
K  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
L  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0


In [10]:
# Implementing the Q-Learning process
# A locally seeded generator makes the sampled transitions, and therefore the
# learned Q-values, repeatable when the notebook is run from a fresh kernel.
# Q is updated in place, so re-running this cell without re-initializing Q
# continues training from the current table.
rng = np.random.default_rng(42)
n_states = R.shape[0]
for i in range(1000):
    # Pick a random state to learn from.
    current_state = rng.integers(n_states)
    # Only transitions with a strictly positive reward are allowed moves.
    playable_actions = np.flatnonzero(R[current_state] > 0)
    if playable_actions.size == 0:
        # No allowed move from this state: nothing can be learned from this draw.
        continue
    next_state = rng.choice(playable_actions)
    # Temporal difference: reward plus discounted best value of the next state,
    # minus the current estimate for this transition.
    TD = R[current_state, next_state] + gamma * Q[next_state].max() - Q[current_state, next_state]
    # Move the estimate toward the target by the fraction alpha.
    Q[current_state, next_state] += alpha * TD

In [11]:
# Show the Q-table under a heading; astype(int) drops the fractional part of
# each value so the printed table stays compact.
print("Q-Values:")

print_rewards(Q.astype(int))

Q-Values:
      A     B     C     D    E     F     G     H    I    J     K     L
A     0  1684     0     0    0     0     0     0    0    0     0     0
B  1258     0  2247     0    0  1263     0     0    0    0     0     0
C     0  1684     0     0    0     0  2995     0    0    0     0     0
D     0     0     0     0    0     0     0  2245    0    0     0     0
E     0     0     0     0    0     0     0     0  713    0     0     0
F     0  1684     0     0    0     0     0     0    0  948     0     0
G     0     0  2243     0    0     0  3995  2241    0    0     0     0
H     0     0     0  1684    0     0  2992     0    0    0     0  1684
I     0     0     0     0  535     0     0     0    0  949     0     0
J     0     0     0     0    0  1264     0     0  712    0  1263     0
K     0     0     0     0    0     0     0     0    0  949     0  1684
L     0     0     0     0    0     0     0  2245    0    0  1262     0


In [12]:
# PART 3 - GOING INTO PRODUCTION
# Making a mapping from the states to the locations
# Pair every state index with its location name to invert location_to_state.
state_to_location = dict(zip(location_to_state.values(), location_to_state.keys()))
state_to_location

{0: 'A',
 1: 'B',
 2: 'C',
 3: 'D',
 4: 'E',
 5: 'F',
 6: 'G',
 7: 'H',
 8: 'I',
 9: 'J',
 10: 'K',
 11: 'L'}

In [32]:
def route(starting_location, ending_location, max_steps=20):
    """Walk greedily through the Q-table from one location toward another.

    From the current location the walk always moves to the state with the
    highest Q-value in that state's row of the module-level ``Q`` table,
    using the module-level ``location_to_state`` and ``state_to_location``
    mappings to translate between names and indices.

    Parameters
    ----------
    starting_location : key of ``location_to_state`` where the walk begins.
    ending_location : location at which the walk stops once it is reached.
    max_steps : upper bound on the number of moves taken (default 20).

    Returns
    -------
    list
        The visited locations in order, beginning with ``starting_location``.
        The list ends before ``ending_location`` when the walk would revisit
        a location (a message is printed in that case) or when ``max_steps``
        moves have been made.

    Raises
    ------
    KeyError
        If ``starting_location`` is not a key of ``location_to_state``.
    """
    path = [starting_location]
    # The start counts as visited too: the greedy choice is deterministic, so
    # coming back to any earlier location (including the start) can only
    # repeat the same cycle and must not be appended a second time.
    visited = {starting_location}
    current_location = starting_location
    steps = 0
    while current_location != ending_location and steps < max_steps:
        current_state = location_to_state[current_location]
        # Best-valued move from the current state; int() yields a plain
        # Python index for the dictionary lookup.
        next_state = int(np.argmax(Q[current_state]))
        next_location = state_to_location[next_state]
        if next_location in visited:
            print("Loop detected — aborting.")
            break
        path.append(next_location)
        visited.add(next_location)
        current_location = next_location
        steps += 1
    return path


In [34]:
# Printing the final route
# NOTE(review): route is defined in this notebook with two parameters, while the
# traceback recorded under this cell reports three further required arguments
# (Q, location_to_state, state_to_location). That output appears to come from a
# different definition of route; re-run from a fresh kernel to refresh it.
# NOTE(review): the reward matrix gives its only large reward at G, so the
# greedy walk over Q may stop before reaching 'D' - confirm the intended target.
print('Route:')
route('L', 'D')

Route:


TypeError: route() missing 3 required positional arguments: 'Q', 'location_to_state', and 'state_to_location'