# Case Study: Q-Learning


# Task 1: Problem Statement
## Optimizing Warehouse Flows with Q-Learning


In [3]:
# import the libraries
import numpy as np
import pandas as pd

In [4]:
# Q-Learning hyperparameters:
#   gamma - discount applied to the best Q-value of the next state
#   alpha - step size applied to the temporal difference in each update
gamma, alpha = 0.75, 0.9

In [5]:
# PART 1 - DEFINING THE ENVIRONMENT
# Defining the states: every location letter 'A'..'L' gets the index 0..11,
# which is used as its row/column position in the reward and Q tables.
location_to_state = {letter: index for index, letter in enumerate('ABCDEFGHIJKL')}


In [6]:
# Defining the actions: one action per state index (0..11)
actions = list(range(12))

In [7]:
# Defining the rewards
# For every row (state index) list the columns that carry a reward of 1.
# Letters follow the location_to_state mapping (A=0 ... L=11).
reachable = {
    0: [1],           # A -> B
    1: [0, 2, 5],     # B -> A, C, F
    2: [1, 6],        # C -> B, G
    3: [7],           # D -> H
    4: [8],           # E -> I
    5: [1, 9],        # F -> B, J
    6: [2, 7],        # G -> C, H
    7: [3, 6, 11],    # H -> D, G, L
    8: [4, 9],        # I -> E, J
    9: [5, 8, 10],    # J -> F, I, K
    10: [9, 11],      # K -> J, L
    11: [7, 10],      # L -> H, K
}
R = np.zeros((12, 12), dtype=int)
for row, columns in reachable.items():
    R[row, columns] = 1
# Large reward on the G -> G entry (presumably marks G as the target location).
R[6, 6] = 1000

In [8]:
def print_rewards(data):
    """Print a 12x12 table with the locations 'A'..'L' as row and column labels.

    Parameters
    ----------
    data : array-like
        Values handed to ``pd.DataFrame``; must fit the 12 row labels and
        12 column labels.

    Returns
    -------
    None
        The labelled table is written to standard output.
    """
    labels = list('ABCDEFGHIJKL')
    table = pd.DataFrame(data, index=labels, columns=labels)
    print(table)

# Show the reward matrix R with the location letters on both axes.
print_rewards(R)

   A  B  C  D  E  F     G  H  I  J  K  L
A  0  1  0  0  0  0     0  0  0  0  0  0
B  1  0  1  0  0  1     0  0  0  0  0  0
C  0  1  0  0  0  0     1  0  0  0  0  0
D  0  0  0  0  0  0     0  1  0  0  0  0
E  0  0  0  0  0  0     0  0  1  0  0  0
F  0  1  0  0  0  0     0  0  0  1  0  0
G  0  0  1  0  0  0  1000  1  0  0  0  0
H  0  0  0  1  0  0     1  0  0  0  0  1
I  0  0  0  0  1  0     0  0  0  1  0  0
J  0  0  0  0  0  1     0  0  1  0  1  0
K  0  0  0  0  0  0     0  0  0  1  0  1
L  0  0  0  0  0  0     0  1  0  0  1  0


In [9]:
# PART 2 - BUILDING THE AI SOLUTION WITH Q-LEARNING
# The Q-table starts as a 12x12 float array of zeros
# (rows index the current state, columns the chosen next state).
Q = np.zeros((12, 12))

# Show the freshly initialised Q-table; print_rewards only adds the A..L labels.
print_rewards(Q)

     A    B    C    D    E    F    G    H    I    J    K    L
A  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
B  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
C  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
D  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
E  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
F  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
G  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
H  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
I  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
J  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
K  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0
L  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0


In [10]:
# Implementing the Q-Learning process
# A seeded generator makes the learned Q-table (and every output derived from
# it) reproducible under "Restart Kernel -> Run All".
rng = np.random.default_rng(seed=42)
n_states = R.shape[0]

# Start from zeros on every execution so that re-running this cell retrains
# from scratch instead of continuing from the previously learned values.
Q.fill(0.0)

for _ in range(1000):
    # Pick a random state, then a random move among those with a positive reward.
    current_state = rng.integers(n_states)
    playable_actions = np.flatnonzero(R[current_state] > 0)
    next_state = rng.choice(playable_actions)

    # Temporal difference: immediate reward plus the discounted best Q-value
    # of the state we land in, minus the current estimate.
    TD = R[current_state, next_state] + gamma * Q[next_state].max() - Q[current_state, next_state]
    Q[current_state, next_state] += alpha * TD

In [11]:
# Display the learned Q-table. astype(int) drops the fractional part, so the
# printed numbers are truncated (not rounded) versions of the float values.
print("Q-Values:")

print_rewards(Q.astype(int))

Q-Values:
      A     B     C     D    E     F     G     H    I    J     K     L
A     0  1688     0     0    0     0     0     0    0    0     0     0
B  1267     0  2250     0    0  1267     0     0    0    0     0     0
C     0  1688     0     0    0     0  2999     0    0    0     0     0
D     0     0     0     0    0     0     0  2250    0    0     0     0
E     0     0     0     0    0     0     0     0  714    0     0     0
F     0  1688     0     0    0     0     0     0    0  951     0     0
G     0     0  2249     0    0     0  3998  2250    0    0     0     0
H     0     0     0  1688    0     0  2999     0    0    0     0  1688
I     0     0     0     0  536     0     0     0    0  951     0     0
J     0     0     0     0    0  1267     0     0  714    0  1267     0
K     0     0     0     0    0     0     0     0    0  951     0  1688
L     0     0     0     0    0     0     0  2250    0    0  1267     0


In [12]:
# PART 3 - GOING INTO PRODUCTION
# Inverse lookup of location_to_state: state index -> location letter
state_to_location = dict(zip(location_to_state.values(), location_to_state.keys()))
state_to_location

{0: 'A',
 1: 'B',
 2: 'C',
 3: 'D',
 4: 'E',
 5: 'F',
 6: 'G',
 7: 'H',
 8: 'I',
 9: 'J',
 10: 'K',
 11: 'L'}

In [39]:
def route(starting_location, ending_location):
    """Greedily walk the learned Q-table from one location towards another.

    At every step the not-yet-visited location with the highest Q-value in
    the current row of the global ``Q`` table is chosen. The walk ends when
    ``ending_location`` is reached, or when no unvisited move with a positive
    Q-value remains. The sum of the Q-values along the walk is printed as
    "Total Reward".

    NOTE(review): the walk only follows what ``Q`` has learned, so it is
    presumably meaningful only when ``ending_location`` is the location the
    Q-table was trained to reach - confirm before using other destinations.

    Parameters
    ----------
    starting_location : str
        Key of ``location_to_state`` where the walk begins.
    ending_location : str
        Location at which the walk stops.

    Returns
    -------
    list of str
        Locations in visiting order, beginning with ``starting_location``.
        After a dead end the partial path is returned, so the last element
        is not necessarily ``ending_location``.

    Raises
    ------
    KeyError
        If a location that has to be looked up is missing from
        ``location_to_state``.
    """
    path = [starting_location]
    total_reward = 0
    # The start counts as visited as well; otherwise the walk could step
    # straight back onto it and produce a looping route.
    visited = {starting_location}
    current_location = starting_location

    while current_location != ending_location:
        current_state = location_to_state[current_location]

        # Work on a float copy so visited states can be masked with -inf
        # without modifying Q (and without failing on an integer table).
        q_values = Q[current_state].astype(float)
        for loc in visited:
            q_values[location_to_state[loc]] = -np.inf

        next_state = int(np.argmax(q_values))

        # Masked entries are -inf, so this single test covers both "every
        # candidate was already visited" and "no move with a learned value".
        if q_values[next_state] <= 0:
            print("Loop detected or dead end. Exiting.")
            break

        next_location = state_to_location[next_state]
        total_reward += Q[current_state, next_state]
        visited.add(next_location)
        path.append(next_location)
        current_location = next_location

    print("Total Reward:", total_reward)
    return path


In [41]:
# Printing the final route
# route() prints the "Total Reward" line itself; the list it returns is the
# last expression of the cell and is therefore shown as the cell result.
print('Route:')
route('L', 'G')

Route:
Total Reward: 5249.503658936057


['L', 'H', 'G']

In [43]:
# Same location -> state index mapping as defined in PART 1 (A=0 ... L=11);
# this cell rebuilds it with identical contents.
location_to_state = dict(zip('ABCDEFGHIJKL', range(12)))


In [45]:
# Check that every hop of the route is the greedy (highest-Q) move from the
# location it starts at.
# NOTE: this assignment rebinds the name `route`, replacing the route()
# function defined in an earlier cell; re-run that cell before calling the
# function again.
route = ['L', 'H', 'G']
mismatches = 0

for i in range(len(route) - 1):
    current, expected_next = (location_to_state[stop] for stop in route[i:i + 2])
    best_next = np.argmax(Q[current])

    if expected_next == best_next:
        print(f"✅ From {route[i]}, best move is {route[i+1]}")
        continue

    print(f"❌ From {route[i]}, best move is {list(location_to_state.keys())[best_next]}, not {route[i+1]}")
    mismatches += 1

is_optimal = mismatches == 0

if not is_optimal:
    print("⚠️ Route is suboptimal")
else:
    print("🎯 Route is optimal")

✅ From L, best move is H
✅ From H, best move is G
🎯 Route is optimal


In [47]:
# Report the learned Q-value of every hop along the `route` list
for i in range(len(route) - 1):
    s, a = (location_to_state[stop] for stop in route[i:i + 2])
    print(f"Q({route[i]} → {route[i+1]}) = {Q[s, a]}")


Q(L → H) = 2250.358453156494
Q(H → G) = 2999.1452057795636
