# SAKI Homework 4 SS21

## Implemention of a reinforcement loearning based algorith for a warehouse organizing robot

## Package import

In [1]:
import numpy as np
import pandas as pd
import csv
import itertools
import mdptoolbox
import os

## Data loading

In [101]:
current_working_directory = os.getcwd()
path_to_warehouse_training_file = os.path.join(current_working_directory, "warehousetraining.txt")
path_to_warehouse_order_file = os.path.join(current_working_directory, 'warehouseorder.txt')
warehouse_training = pd.read_csv(path_to_warehouse_training_file, names=["action", "color"], delimiter="\t")
warehouse_order = pd.read_csv(path_to_warehouse_order_file, names=["action", "color"], delimiter="\t")

## Exploration of the data set to examine the probabilities of the occurances of colors and actions

In [3]:
num_training_samples = len(warehouse_training)
probability_evaluation = warehouse_training.groupby(['action','color']).size().reset_index(name='num')
probability_evaluation['probability'] = probability_evaluation['num']/num_training_samples
probability_evaluation

Unnamed: 0,action,color,num,probability
0,restore,blue,1517,0.125289
1,restore,red,2989,0.246862
2,restore,white,1548,0.127849
3,store,blue,1517,0.125289
4,store,red,2989,0.246862
5,store,white,1548,0.127849


## Definition of the 6 actions in our warehouse

### Definition of actions
We have three colors in our warehouse and two actions (store and restore).

In [4]:
list_of_actions = ['store_red', 'store_blue', 'store_white', 'restore_red','restore_blue', 'restore_white']

### Definition of States

In [59]:
items = ['red', 'blue','white', 'empty']
state_matrix = list(itertools.product(items,items, items, items, list_of_actions))

### Definition of th model
Our model consists of four cells, as we have a 2x2 warehouse matrix. Every cell is indexed by the numbers from 0 to 3.

In [6]:
cell_index = np.arange(0,4)
cell_index

array([0, 1, 2, 3])

The costs are defined based on the distance the roboter has to walk to the cell. We have a 2x2 field and the robot cannot walk diagonally. Thus, the minimum cost is 1 and the maximum one is 3.

In [7]:
distance_costs = [1,2,2,3]

### Definition of the tranbsition probability matrix
Based on the probabilities of the occurances of the actions in the dataset, we create a transition probability matrix. It indicates the probability to change from one state to another based on the distribution of the training data set. 

In [8]:
probability_store_red = probability_evaluation.loc[(probability_evaluation['color'] == 'red') & (probability_evaluation['action'] == 'store'), 'probability'].item()
probability_store_blue = probability_evaluation.loc[(probability_evaluation['color'] == 'blue') & (probability_evaluation['action'] == 'store'), 'probability'].item()
probability_store_white = probability_evaluation.loc[(probability_evaluation['color'] == 'white') & (probability_evaluation['action'] == 'store'), 'probability'].item()
probability_restore_red = probability_evaluation.loc[(probability_evaluation['color'] == 'red') & (probability_evaluation['action'] == 'restore'), 'probability'].item()
probability_restore_blue = probability_evaluation.loc[(probability_evaluation['color'] == 'blue') & (probability_evaluation['action'] == 'restore'), 'probability'].item()
probability_restore_white = probability_evaluation.loc[(probability_evaluation['color'] == 'white') & (probability_evaluation['action'] == 'restore'), 'probability'].item()

In [9]:
def is_valid_transation(cell_id, current_state, next_state):
    current_action = current_state[4]
    if current_state[-1] == 'restore_red' or current_state[-1] == 'restore_blue' or current_state[-1] == 'restore_white':
        if not current_state[-1] in current_state[:4:]:
        # no item to restore is in 2x2 warehouse
            return False

        if current_state[cell_id] == current_state[-1] and next_state[cell_id] == 'empty':
            # item found and next state is then empty
            return True
        
        return False
    else:
        # warehouse already completely full
        if not 'empty' in current_state:
            return False
 
        # warehouse has empty cell and correct color is in next state
        if current_state[cell_id] == 'empty' and next_state[cell_id] == current_state[-1].split(sep='_')[-1]:
            return True

        return False

If there is a row consisting only of zeros, we will set a 100% = 1 transition probability to stay in the current state. Moreover, we normalize the probabilities of entering in another state, so that these sum up to 1.

In [16]:
def normalize(matrix):
    for index, row in enumerate(matrix):
        if np.sum(row) == 0:
            matrix[index, index] = 1
            continue
    corrected_matrix = matrix / matrix.sum(axis=1)[:, np.newaxis]
    return corrected_matrix

In [41]:
tmp_matrix = []
for cell_id in cell_index:
    tpm_per_cell = np.zeros((len(state_matrix), len(state_matrix)), dtype=np.float16)
    for i, current_state in enumerate(state_matrix):
        for j, next_state in enumerate(state_matrix):
            if is_valid_transation(cell_id, current_state, next_state):
                next_action = next_state[4]
                if next_action == 'store_red':
                    tpm_per_cell[i, j] = probability_store_red
                elif next_action == 'store_blue':
                    tpm_per_cell[i, j] = probability_store_blue
                elif next_action == 'store_white':
                    tpm_per_cell[i, j] = probability_store_white
                elif next_action == 'restore_red':
                    tpm_per_cell[i, j] = probability_restore_red
                elif next_action == 'restore_white':
                    tpm_per_cell[i, j] = probability_restore_white
                else:
                    tpm_per_cell[i, j] = probability_restore_blue
        
    tmp_matrix.append(normalize(tpm_per_cell))
    

### Definition of the reward matrix

In [165]:
rewards = [3,2,2,1]
def create_reward_matrix():
    rewards_matrix = np.zeros((len(state_matrix), len(cell_index)))
    for cell_id in cell_index:
        for i, state in enumerate(state_matrix):
            if state[-1][0:8] == 'restore_':
                #restoring if color is suiting
                if state[cell_id] == state[-1][9:]:
                    rewards_matrix[i, cell_id] = rewards[cell_id] 
                
            elif state[-1][0:6] == 'store_':
                # Storing possible if empty cell there
                if state[cell_id] =='empty':
                    rewards_matrix[i, cell_id] = rewards[cell_id]
            else:
                # penalty
                rewards_matrix[i, cell_id] = -4

    return reward_matrix

In [166]:
reward_matrix = create_reward_matrix()

### Markov Decision Process Toolbox for policy generation

In [48]:
mdpResultPolicy = mdptoolbox.mdp.PolicyIteration(tmp_matrix, reward_matrix, 0.3, max_iter=300)
mdpResultValue = mdptoolbox.mdp.ValueIteration(tmp_matrix, reward_matrix, 0.3, max_iter=300)

mdpResultPolicy.run()
mdpResultValue.run()

In [164]:
'''print('PolicyIteration:')
print('PolicyIteration.policy:')
print(mdpResultPolicy.policy)
print('PolicyIteration.V:')
print(mdpResultPolicy.V)
print('PolicyIteration.iter:')
print(mdpResultPolicy.iter)
'''

"print('PolicyIteration:')\nprint('PolicyIteration.policy:')\nprint(mdpResultPolicy.policy)\nprint('PolicyIteration.V:')\nprint(mdpResultPolicy.V)\nprint('PolicyIteration.iter:')\nprint(mdpResultPolicy.iter)\n"

In [181]:
'''
print('ValueIteration:')
print(mdpResultValue.policy)
print('PolicyIteration.V:')
print(mdpResultValue.V)
print('PolicyIteration.iter:')
print(mdpResultValue.iter)
'''

"\nprint('ValueIteration:')\nprint(mdpResultValue.policy)\nprint('PolicyIteration.V:')\nprint(mdpResultValue.V)\nprint('PolicyIteration.iter:')\nprint(mdpResultValue.iter)\n"

### Greedy approach

In [271]:
def greedy_helper(current_warehouse):
    result = current_warehouse[:4]
    task = current_warehouse[4]
    color = current_warehouse[-1]
    if task =='store':
        for i, field in enumerate(result):
            if field=='empty':
                result[i] = color
                current_warehouse[i]=color
                return current_warehouse, i
    elif task == 'restore':
        for i, field in enumerate(result):
            if field == color:
                result[i] = 'empty'
                current_warehouse[i]='empty'

                return current_warehouse, i
            
    return current_warehouse, -1

In [274]:
def distance_greedy(data_set):
    greedy_warehouse = ['empty', 'empty', 'empty','empty','-','-']
    total_reward = 0
    greedy_warehouse_states=list()
    
    for i, data_row in data_set.iterrows():
        greedy_warehouse[4] = data_row.action
        greedy_warehouse[5] = data_row.color
        
        greedy_warehouse, action = greedy_helper(greedy_warehouse)
        greedy_warehouse_states.append(tuple(greedy_warehouse))
        if action != -1:
            total_reward = total_reward + (distance_costs[action]*2)
    return [total_reward, greedy_warehouse_states]

#### Evaluation on Training data set

In [294]:
distance_greedy(warehouse_order)

[168,
 [('red', 'empty', 'empty', 'empty', 'store', 'red'),
  ('red', 'blue', 'empty', 'empty', 'store', 'blue'),
  ('red', 'blue', 'white', 'empty', 'store', 'white'),
  ('red', 'empty', 'white', 'empty', 'restore', 'blue'),
  ('red', 'empty', 'empty', 'empty', 'restore', 'white'),
  ('red', 'white', 'empty', 'empty', 'store', 'white'),
  ('red', 'white', 'red', 'empty', 'store', 'red'),
  ('red', 'white', 'red', 'white', 'store', 'white'),
  ('red', 'white', 'red', 'white', 'store', 'red'),
  ('empty', 'white', 'red', 'white', 'restore', 'red'),
  ('empty', 'white', 'empty', 'white', 'restore', 'red'),
  ('white', 'white', 'empty', 'white', 'store', 'white'),
  ('white', 'white', 'blue', 'white', 'store', 'blue'),
  ('white', 'white', 'blue', 'white', 'store', 'white'),
  ('white', 'white', 'blue', 'white', 'restore', 'red'),
  ('empty', 'white', 'blue', 'white', 'restore', 'white'),
  ('empty', 'empty', 'blue', 'white', 'restore', 'white'),
  ('red', 'empty', 'blue', 'white', 'store

#### Evaluation on Training data set

In [295]:
distance_greedy(warehouse_training)

[36372,
 [('red', 'empty', 'empty', 'empty', 'store', 'red'),
  ('red', 'blue', 'empty', 'empty', 'store', 'blue'),
  ('red', 'blue', 'white', 'empty', 'store', 'white'),
  ('red', 'empty', 'white', 'empty', 'restore', 'blue'),
  ('red', 'empty', 'empty', 'empty', 'restore', 'white'),
  ('red', 'white', 'empty', 'empty', 'store', 'white'),
  ('red', 'white', 'red', 'empty', 'store', 'red'),
  ('red', 'white', 'red', 'white', 'store', 'white'),
  ('red', 'white', 'red', 'white', 'store', 'red'),
  ('empty', 'white', 'red', 'white', 'restore', 'red'),
  ('empty', 'white', 'empty', 'white', 'restore', 'red'),
  ('white', 'white', 'empty', 'white', 'store', 'white'),
  ('white', 'white', 'blue', 'white', 'store', 'blue'),
  ('white', 'white', 'blue', 'white', 'store', 'white'),
  ('white', 'white', 'blue', 'white', 'restore', 'red'),
  ('empty', 'white', 'blue', 'white', 'restore', 'white'),
  ('empty', 'empty', 'blue', 'white', 'restore', 'white'),
  ('red', 'empty', 'blue', 'white', 'sto

### Test code

In [282]:
def get_new_warehouse(warehouse, pos, action, color):
    if action == 'restore' and warehouse[pos] != 'empty':
        warehouse[pos] = 'empty'
    elif action == 'store' and warehouse[pos] == 'empty':
        warehouse[pos] = color
    return warehouse

In [283]:
def get_state_index(warehouse):
    for i, row in enumerate(state_matrix):
        if list(row) == warehouse:
            return i
        
    return 'failure'

In [284]:
def evaluate_performance(policy, data):
    warehouse = ['empty', 'empty','empty', 'empty','empty']
    next_cell_position = 0
    new_warehouse_state_list = list()
    
    for i, row in data.iterrows():
        warehouse[len(cell_index)] = row.action+"_"+row.color
        state_index = get_state_index(warehouse)
        if state_index == 'failure':
            print('Failure:')
            #print(warehouse, state_index)
            return 'failure'
        else:
            next_cell_position += distance_costs[policy[state_index]] * 2
            warehouse = get_new_warehouse(warehouse, policy[state_index], row.action, row.color)
            #print(warehouse)
            new_warehouse_state_list.append(warehouse)
    return [next_cell_position, new_warehouse_state_list]

#### Evaluation on Training data set

In [290]:
mdp_result_policy_results = evaluate_performance(mdpResultPolicy.policy, warehouse_training)
mdp_result_value_results = evaluate_performance(mdpResultValue.policy, warehouse_training)

In [291]:
print("ValueIteration Robot traveled: ", mdp_result_policy_results[0])
value_iter_states = mdp_result_policy_results[1]
#print(policy_iter_states)
print("PolicyIteration Robot traveled: ", mdp_result_value_results[0])
policy_iter_states = mdp_result_value_results[1]
#print(policy_iter_states)

ValueIteration Robot traveled:  24224
PolicyIteration Robot traveled:  24224


#### Evaluation on Training data set

In [292]:
mdp_result_policy_results = evaluate_performance(mdpResultPolicy.policy, warehouse_order)
mdp_result_value_results = evaluate_performance(mdpResultValue.policy, warehouse_order)

In [293]:
print("ValueIteration Robot traveled: ", mdp_result_policy_results[0])
value_iter_states = mdp_result_policy_results[1]
#print(policy_iter_states)
print("PolicyIteration Robot traveled: ", mdp_result_value_results[0])
policy_iter_states = mdp_result_value_results[1]
#print(policy_iter_states)

ValueIteration Robot traveled:  128
PolicyIteration Robot traveled:  128


## Literature

[1] https://www.youtube.com/watch?v=FgzM3zpZ55o&list=PLoROMvodv4rOSOPzutgyCTapiGlY2Nd8u