#  Reinforcement-learning
this code demonstrates an example use of a reinforcement-learning-based algorithm

the following python libraries are required to use this program: 
- numpy
- Markov Decision Process (MDP) Toolbox
- itertools


#### import the dependencies

In [1]:
import numpy as np
import mdptoolbox
import itertools

import warnings
warnings.filterwarnings("ignore")

#### read in the data

In [2]:
train_file = open("./data/Exercise 4 - Reinforcement Learning Data - warehousetraining.txt","r")
test_file = open("./data/Exercise 4 - Reinforcement Learning Data - warehouseorder.txt","r")

# dict to count unique values
train_data = {}
for line in train_file:
    # remove special chars
    line = line.replace("\t"," ")
    line = line.replace("\n","")
    if line not in train_data:
        # create key in dict
        train_data[line] = 0
    train_data[line] += 1
    
test_data = []
for line in test_file:
    line = line.replace("\t"," ")
    line = line.replace("\n","")
    test_data.append(line)

In [3]:
print("Train Data Count ", train_data)
sum_train_data = sum(train_data.values())
probability_dict = train_data.copy()
for key, value in probability_dict.items(): 
    probability_dict[key] /= sum_train_data
print("Train Data Probabilities ", probability_dict)
print("Train Data Samples ", sum_train_data)

Train Data Count  {'store red': 2989, 'store blue': 1517, 'store white': 1548, 'restore blue': 1517, 'restore white': 1548, 'restore red': 2989}
Train Data Probabilities  {'store red': 0.24686157912124215, 'store blue': 0.12528906508093823, 'store white': 0.1278493557978196, 'restore blue': 0.12528906508093823, 'restore white': 0.1278493557978196, 'restore red': 0.24686157912124215}
Train Data Samples  12108


In [4]:
#### setup the warehouse

In [5]:
rewards = [6,3,3,1]
costs = [1,2,2,3]
fields = [[0, 0], [0, 1], [1, 0], [1, 1]]
fields_states = ['empty', 'red', 'blue', 'white']
empty_warehouse = ['empty','empty','empty','empty']
jobs = ['store', 'restore']
warehouse_size = len(fields)

In [6]:
#### get all possible states

In [119]:
def create_all_possible_states(fields, fields_states, warehouse_size):  
    fieldStates = itertools.product(fields_states, repeat=warehouse_size)
    field_states = [list(item) for item in fieldStates]
    
    all_states = []
    for key in probability_dict.keys():
        for field_state in field_states:
            all_states.append([key, field_state])
    return all_states

all_states = create_all_possible_states(fields, fields_states, warehouse_size)
step_size = len(all_states)/ len(probability_dict.keys())
print("Number of all possible states", len(all_states))

Number of all possible states 1536


#### create the transaction and reward matrix

In [120]:
def transition_allowed(a, state1, state2):
    operation, rest = state1
    splits = operation.split(" ")
    job, color = splits
    
    operation2, rest2 = state2
    splits2 = operation2.split(" ")
    job2, color2 = splits2
    
    if job.startswith('store'):
        if rest[a] == 'empty' and rest2[a] != color:
            return False
        if rest[a] != 'empty' and rest[a] != rest2[a]:
            return False
    else:
        # restore
        if rest[a] != color and rest[a] != rest2[a]:
            return False
        if rest[a] == color and rest2[a] != 'empty':
            return False
        
    return True



def create_T_matrix():
    T = np.zeros((warehouse_size, len(all_states), len(all_states)))
    
    for a in range(len(fields)):
        for s1, state1 in enumerate(all_states):
            for s2, state2 in enumerate(all_states):
                if transition_allowed(a, state1, state2):
                    operation = state2[0]
                    T[a][s1][s2] = round(probability_dict[operation],4)
    
    for a, rows in enumerate(T):
        for b, row in enumerate(rows):
            row_sum = np.sum(row)
            if row_sum == 0:
                T[a][b][b] = 1
            # every row in the transaction matrix has in sum to be equal 1
            T[a] = rows / rows.sum(axis=1)[:, None]
    return T


def create_R_matrix():
    R = np.zeros((len(all_states), warehouse_size))
    for i, state in enumerate(all_states):
        operation, rest = state
        splits = operation.split(" ")
        job, color = splits
        reward = np.zeros(warehouse_size)
        # check what job is performed and if its possible
        if job == "store":
            for j in range(warehouse_size):
                if rest[j] != "empty":
                    reward[j] = -100
                else:
                    reward[j] = rewards[j]
    
        else:
            for j in range(warehouse_size):
                if rest[j] != color:
                    reward[j] = -100
                else:
                    reward[j] = rewards[j]
        R[i] = reward
    return R
T = create_T_matrix()
R = create_R_matrix()



In [108]:
#### fit the models

In [121]:
mdp_policy = mdptoolbox.mdp.PolicyIteration(T, R, 0.95, max_iter=1000)
mdp_value = mdptoolbox.mdp.ValueIteration(T, R, 0.95, max_iter=1000)
mdp_policy.run()
mdp_value.run()

#### implementation of a greedy approach for comparison

In [122]:
def greedy_approach(warehouse, operations):
    dist = 0
    for operation in operations:
        operation = operation.split(' ')
        action, color = operation
        
        if action == "store":
            for x in range(len(warehouse)):
                if warehouse[x] == "empty":
                    warehouse[x] = color
                    dist += costs[x] 
        else:
            for y in range(len(warehouse)):
                if warehouse[y] == color:
                    warehouse[y] = "empty"
                    dist += costs[y] 
    return dist



#### implemention of a policy distance metric

In [123]:
def distance_metric(all_states, model, warehouse, operations):
    dist = 0
    for operation in operations:
        operation = operation.split(' ')
        job, color = operation
        idx = -1
        for i, state in enumerate(all_states):
            if idx != -1:
                break
            for j in range(len(warehouse)):
                op, fields = state
                if fields[j]!=warehouse[j]:
                    break
                if op != job:
                    break
                else:
                    i = idx
        index = int(model[idx])
        if job == "store":
            warehouse[index] = color
            dist += costs[index] 
        else:
            warehouse[index] = 'empty'
            dist += costs[index]
    return dist

#### Evaluation - compare the different distances

In [124]:
greedy_distance = greedy_approach(empty_warehouse.copy(), test_data)
mdp_policy_distance = distance_metric(all_states, mdp_policy.policy, empty_warehouse.copy(), test_data)
mdp_value_distance = distance_metric(all_states, mdp_value.policy,empty_warehouse.copy(), test_data)
print("Greedy approach distance", greedy_distance)
print("MDP Policy distance", mdp_policy_distance)
print("MDP Value distance", mdp_value_distance)

Greedy approach distance 144
MDP Policy distance 180
MDP Value distance 180
