# Question 6

In [1]:
import numpy as np
import pandas as pd
import pickle

# Fix the seed of NumPy's global random generator, which every np.random call
# below draws from.
np.random.seed(1)

In this question, we will try to come up with a faster and better solution for our logistics problem. While training both the policy iteration and the tabular Q-learning algorithms in question 5, we struggled with very long training times and the curse of dimensionality because of the large state and action spaces. To overcome this issue, we will make use of the observations from the nearly optimal Q-learning model we trained in question 5. The first and most obvious observation is that the route (1,2,3) is very costly and should never be used, since its cost is much higher than simply bearing the lost sales. Therefore, we will exclude this route from the problem in this part. By doing this, we reduce the action space from 1331 elements (all combinations of 3 integers between 0 and 10) to 331 elements (by removing the 1000 combinations in which all 3 integers are between 1 and 10).
<br>

We will also carry out a sensitivity analysis to see how different cost values affect the decisions our Q-learning algorithm makes. To achieve this, we will train 3 different versions of this algorithm:

- Original Version (excluding the route (1,2,3))
- Original Version with holding cost being 10 instead of 1
- Original version with higher route costs for routes (1,2), (1,3), (2,3)

# Original Version (excluding the route (1,2,3))

The code below is the same as the code we used to train the Q-learning model in question 5, so you can refer to that question for an explanation of the code.

In [2]:
# State space: every (i, j, z) triple with each component in 0..10, i.e. the
# inventory of the three locations before the start of the day.
states = np.array([
    [i, j, z]
    for i in range(11)
    for j in range(11)
    for z in range(11)
])
# Reverse lookup: state tuple -> row position of that state in `states`.
state_index = {tuple(row): position for position, row in enumerate(states)}

def inv_r(inv_level, holding_cost=1, shortage_cost=19):
    """Return the inventory part of the one-step reward (always <= 0).

    Parameters
    ----------
    inv_level : sequence of three numbers
        Inventory of the three locations after demand has been subtracted
        and *before* negative values are clipped to zero; a negative entry
        therefore measures unmet demand.
    holding_cost : number, optional
        Cost per unit of positive inventory. Defaults to 1, the value that
        was previously hard-coded.
    shortage_cost : number, optional
        Cost per unit of unmet demand. Defaults to 19, the value that was
        previously hard-coded.

    Returns
    -------
    number
        Sum over the three locations of
        ``-holding_cost * max(0, level) + shortage_cost * min(0, level)``.
    """
    return sum(
        max(0, level) * -holding_cost + min(0, level) * shortage_cost
        for level in (inv_level[0], inv_level[1], inv_level[2])
    )

def route_r(action):
    """Return the (non-positive) routing reward of a replenishment action.

    A location is visited when its order quantity in ``action`` is
    positive; the reward depends only on which of the three locations are
    visited.
    """
    visited = (
        bool(action[0] > 0),
        bool(action[1] > 0),
        bool(action[2] > 0),
    )
    # Keyed by (visit location 1, visit location 2, visit location 3).
    reward_by_route = {
        (True, True, True): -500,
        (True, True, False): -60,
        (True, False, True): -75,
        (False, True, True): -75,
        (True, False, False): -40,
        (False, True, False): -40,
        (False, False, True): -55,
        (False, False, False): 0,
    }
    return reward_by_route[visited]

def get_route(action):
    """Return the text label of the route implied by ``action``.

    The route consists of the locations (numbered 1 to 3) whose order
    quantity is positive; "(0)" means that nothing is delivered.
    """
    served = tuple(loc for loc in (1, 2, 3) if action[loc - 1] > 0)
    # NOTE: "(1,3)" has no space after the comma, unlike the other labels.
    # It is kept verbatim because these labels are used as dictionary keys
    # and column names in the results.
    labels = {
        (1, 2, 3): "(1, 2, 3)",
        (1, 2): "(1, 2)",
        (1, 3): "(1,3)",
        (2, 3): "(2, 3)",
        (1,): "(1)",
        (2,): "(2)",
        (3,): "(3)",
        (): "(0)",
    }
    return labels[served]

def generate_demand():
    """Sample one day's demand for the three locations.

    Each component is a gamma draw (from NumPy's global random generator)
    rounded up to the next integer and capped at 10. The three draws are
    made in location order, so results are reproducible for a given seed.
    """
    # (shape, scale) of the gamma distribution for locations 1, 2 and 3.
    gamma_params = ((9, 1/3), (12.5, 0.4), (4/3, 1.5))
    demand = [
        min(10, np.ceil(np.random.gamma(shape=k, scale=theta)))
        for k, theta in gamma_params
    ]
    return np.array(demand)

# For every state, list the feasible order quantities: each location can be
# topped up to at most 10 units, and at least one location must receive
# nothing, which is what removes the three-stop route (1, 2, 3).
get_actions = {}
for s in states:
    cap1, cap2, cap3 = 10 - s
    feasible = [
        [i, j, z]
        for i in range(cap1 + 1)
        for j in range(cap2 + 1)
        for z in range(cap3 + 1)
        if i == 0 or j == 0 or z == 0
    ]
    get_actions[tuple(s)] = np.array(feasible)

# Initialising the Q(s, a) table: reuse a previously saved table when one can
# be loaded, otherwise start from uniform random values in [0, 1).
# NOTE: unpickling can execute arbitrary code, so only load a q_dict_q6.pkl
# file that you created yourself.
try:
    with open("q_dict_q6.pkl", "rb") as fl:
        q_table = pickle.load(fl)
except (OSError, EOFError, pickle.UnpicklingError):
    # Only a missing, unreadable or corrupt file triggers the fallback. The
    # previous bare `except:` also swallowed KeyboardInterrupt and real bugs.
    print("using a new q table")
    np.random.seed(seed=1)
    q_table = {
        tuple(s): {tuple(a): np.random.rand() for a in get_actions[tuple(s)]}
        for s in states
    }
else:
    print("using pre-trained q table")

# Epsilon-greedy tabular Q-learning along one simulated trajectory.
epsilon = 0.1   # probability of picking a random feasible action
alpha = 0.5     # learning rate of the Q update
discount = 0.8  # weight of the best next-state Q-value in the update target
route_frequency = {}  # route label -> number of simulated days it was chosen
for n in range(1):
    # randomly initializing the state
    s = np.random.randint(low=0, high=11, size=3)
    order_up_to = []  # inventory right after ordering (s + action), per day

    # if you are not going to use the pre-trained q-table,
    # we advise you to change the # of iterations to at least 1 million.
    for _ in range(300):
        tuple_s = tuple(s)
        q_dict = q_table[tuple_s]
        if np.random.rand() < epsilon:
            # explore: uniformly random feasible action
            A = get_actions[tuple_s]
            action = A[np.random.randint(low=0, high=len(A))]
        else:
            # exploit: action with the highest current Q-value
            action = max(q_dict, key=q_dict.get)
        route = get_route(action)
        route_frequency[route] = route_frequency.get(route, 0) + 1
        d = generate_demand()
        new_s = s + action - d
        # The reward is computed on the unclipped inventory so that negative
        # entries (unmet demand) are penalised before being reset to zero.
        reward = route_r(action) + inv_r(new_s)
        new_s[new_s < 0] = 0
        new_q = max(q_table[tuple(new_s)].values())
        tuple_a = tuple(action)
        old_q = q_dict[tuple_a]
        q_dict[tuple_a] += alpha * (reward + discount * new_q - old_q)
        order_up_to.append(s + action)
        #print(f"state: {s}, action: {action}, inv: {s+action-d}, demand: {d}, reward: {reward}")
        # astype returns a new integer array, so no explicit copy is needed
        s = new_s.astype(int)
print(f"mean order up to level: {np.round(np.array(order_up_to).mean(axis=0), 2)}")
pd.DataFrame([route_frequency])
# Uncomment to persist the updated Q-table:
#with open("q_dict_q6.pkl", "wb") as fl:
#    pickle.dump(q_table, fl)

using pre-trained q table
mean order up to level: [6.95 8.16 7.51]


Unnamed: 0,(0),"(1,3)","(1, 2)","(2, 3)",(2),(3),(1)
0,10,11,140,101,27,4,7


# Original Version with holding cost being 10 instead of 1

In [3]:
# here, we changed inventory holding cost to -10 from -1
def inv_r(inv_level):
    """Return the inventory reward with a holding cost of 10 per unit.

    ``inv_level`` holds the inventory of the three locations; positive
    entries are charged 10 per unit and negative entries (unmet demand)
    19 per unit. The result is the sum over the three locations.
    """
    def location_reward(level):
        # Exactly one of the two terms is non-zero for any given level.
        return (max(0, level) * -10) + (min(0, level) * 19)

    return (
        location_reward(inv_level[0])
        + location_reward(inv_level[1])
        + location_reward(inv_level[2])
    )

# Initialising the Q(s, a) table: reuse a previously saved table when one can
# be loaded, otherwise start from uniform random values in [0, 1).
# NOTE: unpickling can execute arbitrary code, so only load a
# q_dict_cost1.pkl file that you created yourself.
try:
    with open("q_dict_cost1.pkl", "rb") as fl:
        q_table = pickle.load(fl)
except (OSError, EOFError, pickle.UnpicklingError):
    # Only a missing, unreadable or corrupt file triggers the fallback. The
    # previous bare `except:` also swallowed KeyboardInterrupt and real bugs.
    print("using a new q table")
    np.random.seed(seed=1)
    q_table = {
        tuple(s): {tuple(a): np.random.rand() for a in get_actions[tuple(s)]}
        for s in states
    }
else:
    print("using pre-trained q table")

# Q-learning along one simulated trajectory. With epsilon = 0 the random
# branch is never taken, so every action is the greedy one.
epsilon = 0.
alpha = 0.5     # learning rate of the Q update
discount = 0.8  # weight of the best next-state Q-value in the update target
route_frequency = {}  # route label -> number of simulated days it was chosen
for n in range(1):
    # randomly initializing the state
    s = np.random.randint(low=0, high=11, size=3)
    order_up_to = []  # inventory right after ordering (s + action), per day

    # if you are not going to use the pre-trained q-table,
    # we advise you to change the # of iterations to at least 1 million.
    for _ in range(300):
        tuple_s = tuple(s)
        q_dict = q_table[tuple_s]
        if np.random.rand() < epsilon:
            # explore: uniformly random feasible action
            A = get_actions[tuple_s]
            action = A[np.random.randint(low=0, high=len(A))]
        else:
            # exploit: action with the highest current Q-value
            action = max(q_dict, key=q_dict.get)
        route = get_route(action)
        route_frequency[route] = route_frequency.get(route, 0) + 1
        d = generate_demand()
        new_s = s + action - d
        # The reward is computed on the unclipped inventory so that negative
        # entries (unmet demand) are penalised before being reset to zero.
        reward = route_r(action) + inv_r(new_s)
        new_s[new_s < 0] = 0
        new_q = max(q_table[tuple(new_s)].values())
        tuple_a = tuple(action)
        old_q = q_dict[tuple_a]
        q_dict[tuple_a] += alpha * (reward + discount * new_q - old_q)

        order_up_to.append(s + action)
        # astype returns a new integer array, so no explicit copy is needed
        s = new_s.astype(int)
print(f"mean order up to level: {np.round(np.array(order_up_to).mean(axis=0), 2)}")
pd.DataFrame([route_frequency])
# Uncomment to persist the updated Q-table:
#with open("q_dict_cost1.pkl", "wb") as fl:
#    pickle.dump(q_table, fl)

using pre-trained q table
mean order up to level: [4.54 6.22 2.74]


Unnamed: 0,(0),(2),"(1, 2)","(1,3)","(2, 3)",(1),(3)
0,3,18,154,19,91,10,5


# Original version with higher route costs for routes (1,2), (1,3), (2,3)

We changed the costs of the following routes to the values listed next to them.
- Route (1, 2): 150 <br>
- Route (1, 3): 180 <br>
- Route (2, 3): 180 <br>


In [4]:
def inv_r(inv_level):
    """Return the inventory reward with a holding cost of 1 per unit.

    ``inv_level`` holds the inventory of the three locations; positive
    entries are charged 1 per unit and negative entries (unmet demand)
    19 per unit. The result is the sum over the three locations.
    """
    per_location = [
        (max(0, inv_level[k]) * -1) + (min(0, inv_level[k]) * 19)
        for k in range(3)
    ]
    return per_location[0] + per_location[1] + per_location[2]

def route_r(action):
    """Return the routing reward under the increased two-stop route costs.

    A location is visited when its order quantity in ``action`` is
    positive. Rewards: all three locations -500, (1, 2) -150, (1, 3) and
    (2, 3) -180, only location 1 or only location 2 -40, only location 3
    -55, and 0 when nothing is delivered.
    """
    to1, to2, to3 = action[0] > 0, action[1] > 0, action[2] > 0
    if to1 and to2 and to3:
        return -500
    if to1 and to2:
        return -150
    # From here on at most one of locations 1 and 2 is visited.
    if to3 and (to1 or to2):
        return -180
    if to1 or to2:
        return -40
    if to3:
        return -55
    return 0

# Initialising the Q(s, a) table: reuse a previously saved table when one can
# be loaded, otherwise start from uniform random values in [0, 1).
# NOTE: unpickling can execute arbitrary code, so only load a
# q_dict_cost2.pkl file that you created yourself.
try:
    with open("q_dict_cost2.pkl", "rb") as fl:
        q_table = pickle.load(fl)
except (OSError, EOFError, pickle.UnpicklingError):
    # Only a missing, unreadable or corrupt file triggers the fallback. The
    # previous bare `except:` also swallowed KeyboardInterrupt and real bugs.
    print("using a new q table")
    np.random.seed(seed=1)
    q_table = {
        tuple(s): {tuple(a): np.random.rand() for a in get_actions[tuple(s)]}
        for s in states
    }
else:
    print("using pre-trained q table")

# Epsilon-greedy tabular Q-learning along one simulated trajectory.
epsilon = 0.5   # probability of picking a random feasible action
alpha = 0.5     # learning rate of the Q update
discount = 0.8  # weight of the best next-state Q-value in the update target
route_frequency = {}  # route label -> number of simulated days it was chosen
for n in range(1):
    # randomly initializing the state
    s = np.random.randint(low=0, high=11, size=3)
    order_up_to = []  # inventory right after ordering (s + action), per day

    # if you are not going to use the pre-trained q-table,
    # we advise you to change the # of iterations to at least 1 million.
    for _ in range(300):
        tuple_s = tuple(s)
        q_dict = q_table[tuple_s]
        if np.random.rand() < epsilon:
            # explore: uniformly random feasible action
            A = get_actions[tuple_s]
            action = A[np.random.randint(low=0, high=len(A))]
        else:
            # exploit: action with the highest current Q-value
            action = max(q_dict, key=q_dict.get)
        route = get_route(action)
        route_frequency[route] = route_frequency.get(route, 0) + 1
        d = generate_demand()
        new_s = s + action - d
        # The reward is computed on the unclipped inventory so that negative
        # entries (unmet demand) are penalised before being reset to zero.
        reward = route_r(action) + inv_r(new_s)
        new_s[new_s < 0] = 0
        new_q = max(q_table[tuple(new_s)].values())
        tuple_a = tuple(action)
        old_q = q_dict[tuple_a]
        q_dict[tuple_a] += alpha * (reward + discount * new_q - old_q)

        order_up_to.append(s + action)
        # astype returns a new integer array, so no explicit copy is needed
        s = new_s.astype(int)
print(f"mean order up to level: {np.round(np.array(order_up_to).mean(axis=0), 2)}")
pd.DataFrame([route_frequency])
# Uncomment to persist the updated Q-table:
#with open("q_dict_cost2.pkl", "wb") as fl:
#    pickle.dump(q_table, fl)

using pre-trained q table
mean order up to level: [5.41 6.11 5.39]


Unnamed: 0,"(2, 3)",(2),"(1, 2)","(1,3)",(3),(1),(0)
0,54,71,77,38,26,31,3
