# Value Iteration in a grid world

### Import necessary libraries

In [22]:
import numpy as np
import json, operator

### Set default parameters and initialize variables

In [32]:
size_x = 4
size_y = 3
step_cost = -0.04
gamma = 0.5
prob_success = 0.8
prob_fail = (1 - prob_success) / 2
convergence = 0.0001
reward_delta = convergence * 2

world_print = [[0 for x in range(size_x)] for y in range(size_y)] 
world_access = [[0 for x in range(size_x)] for y in range(size_y)]  
world_reward = [[0 for x in range(size_x)] for y in range(size_y)]

### Helpfunctions for printing the world

In [24]:
def printRow(arr,y,length):
    print("|",end="")
    for elem in arr[y]:
        if(type(elem)==type('X')):
            print(str(elem).center(length),end="")
        else:
            print(str(format(elem,".4f")).center(length),end="")
        print("|",end="")
    print(end="\n")
    

def printSpace(width, length):
    print("+"+str("").center(length,"-")+"+",end="")
    for w in range(width-2):
        print(str("").center(length,"-")+"+",end="")
    print(str("").center(length,"-")+"+")
    
    
def printWorld(world):
    i=0
    for h in range(len(world)*2+1):
        if(h%2==1):
            printRow(world,i,len(world[0])*2)
            i+=1
        else:
            printSpace(len(world[0]), len(world[0])*2)

### Import grid world information from JSON file

In [25]:
def readJson():
    with open('world.json') as json_data:
        d = json.load(json_data)
        print("JSON file content")
        print(d)
        return d
       
        
json_data = readJson() 

JSON file content
{'world': [{'row': [{'cell': ['_', 0, 0]}, {'cell': ['_', 0, 0]}, {'cell': ['_', 0, 0]}, {'cell': ['G', 1, 1]}]}, {'row': [{'cell': ['_', 0, 0]}, {'cell': ['#', -1, 0]}, {'cell': ['_', 0, 0]}, {'cell': ['T', 1, -1]}]}, {'row': [{'cell': ['_', 0, 0]}, {'cell': ['_', 0, 0]}, {'cell': ['_', 0, 0]}, {'cell': ['_', 0, 0]}]}]}


### Distribute grid world information to corresponding variables

In [33]:
def initializeWorld(json_data):
    global world_print
    global world_access
    global world_reward
    i = 0
    for row in json_data['world']:
        j = 0
        for cell in row['row']:
            world_print[i][j]= cell['cell'][0]
            world_access[i][j]= cell['cell'][1]
            world_reward[i][j]= cell['cell'][2]
            j=j+1
        i=i+1
        

initializeWorld(json_data)
print("World")    
printWorld(world_print)
print("Accessibility")
printWorld(world_access)
print("Default rewards")
printWorld(world_reward)

World
+--------+--------+--------+--------+
|   _    |   _    |   _    |   G    |
+--------+--------+--------+--------+
|   _    |   #    |   _    |   T    |
+--------+--------+--------+--------+
|   _    |   _    |   _    |   _    |
+--------+--------+--------+--------+
Accessibility
+--------+--------+--------+--------+
|  0.00  |  0.00  |  0.00  |  1.00  |
+--------+--------+--------+--------+
|  0.00  | -1.00  |  0.00  |  1.00  |
+--------+--------+--------+--------+
|  0.00  |  0.00  |  0.00  |  0.00  |
+--------+--------+--------+--------+
Default rewards
+--------+--------+--------+--------+
|  0.00  |  0.00  |  0.00  |  1.00  |
+--------+--------+--------+--------+
|  0.00  |  0.00  |  0.00  | -1.00  |
+--------+--------+--------+--------+
|  0.00  |  0.00  |  0.00  |  0.00  |
+--------+--------+--------+--------+


### Calculation methods for all possible steps

In [27]:
def up(y_coord, x_coord):
    global world_access
    global world_reward
    
    if(y_coord == 0 or world_access[y_coord - 1][x_coord] < 0):
        return world_reward[y_coord][x_coord]
    return world_reward[y_coord -1][x_coord]
    
def down(y_coord, x_coord):
    global world_access
    global world_reward
    
    if(y_coord == size_y - 1 or world_access[y_coord + 1][x_coord] < 0):
        return world_reward[y_coord][x_coord]
    return world_reward[y_coord + 1][x_coord]


def left(y_coord, x_coord):
    global world_access
    global world_reward
    
    if(x_coord == 0 or world_access[y_coord][x_coord - 1] < 0):
        return world_reward[y_coord][x_coord]
    return world_reward[y_coord][x_coord - 1]


def right(y_coord, x_coord):
    global world_access
    global world_reward
    
    if(x_coord == size_x - 1 or world_access[y_coord ][x_coord + 1] < 0):
        return world_reward[y_coord][x_coord]
    return world_reward[y_coord][x_coord + 1]

### Helpfunction to find the lowest value

In [28]:
def getMinReward():
    global world_reward
    
    rewards = []
    for row in world_reward:
        rewards.append(min(row))
    return min(rewards)

### Calculate one iteration step

In [29]:
def calcStep():
    global world_print
    global world_access
    global world_reward
    global reward_delta
    
    world_reward_new = []
    world_reward_new = world_reward
    rew={}
    reward_delta = 0
    for i in range(len(world_reward_new)):
        for j in range(len(world_reward_new[i])):
            if(world_access[i][j] == 0):
                rew["^"] = prob_success * up(i,j) + prob_fail * left(i,j) + prob_fail * right(i,j)
                rew["<"] = prob_success * left(i,j) + prob_fail * up(i,j) + prob_fail * down(i,j)
                rew["v"] = prob_success * down(i,j) + prob_fail * left(i,j) + prob_fail * right(i,j)
                rew[">"] = prob_success * right(i,j) + prob_fail * up(i,j) + prob_fail * down(i,j)
                print_symbol, max_value = max(rew.items(), key=operator.itemgetter(1))
                world_print[i][j] = print_symbol
                delta_temp = abs(world_reward_new[i][j] - (step_cost + gamma * max_value))
                if(reward_delta < delta_temp):
                    reward_delta = delta_temp
                world_reward_new[i][j] = step_cost + gamma * max_value
    world_reward = world_reward_new

### Iterate until a suiting policy is found

In [30]:
def valueIteration():
    global world_print
    global world_reward
    global convergence
    global reward_delta
    
    iteration=1
    while(convergence < reward_delta):
        calcStep()
        print("\nIteration " + str(iteration))
        printWorld(world_reward)
        print("Größte Belohnungsänderung: " + str(reward_delta)) 
        iteration = iteration + 1
    print("\nGefundene Policy")
    printWorld(world_print)

In [34]:
valueIteration()


Iteration 1
+--------+--------+--------+--------+
| -0.04  | -0.04  |  0.36  |  1.00  |
+--------+--------+--------+--------+
| -0.04  |  0.00  |  0.05  | -1.00  |
+--------+--------+--------+--------+
| -0.04  | -0.04  | -0.02  | -0.04  |
+--------+--------+--------+--------+
Größte Belohnungsänderung: 0.36000000000000004

Iteration 2
+--------+--------+--------+--------+
| -0.06  |  0.10  |  0.38  |  1.00  |
+--------+--------+--------+--------+
| -0.06  |  0.00  |  0.06  | -1.00  |
+--------+--------+--------+--------+
| -0.06  | -0.05  | -0.02  | -0.06  |
+--------+--------+--------+--------+
Größte Belohnungsänderung: 0.14

Iteration 3
+--------+--------+--------+--------+
| -0.01  |  0.12  |  0.38  |  1.00  |
+--------+--------+--------+--------+
| -0.05  |  0.00  |  0.07  | -1.00  |
+--------+--------+--------+--------+
| -0.06  | -0.05  | -0.02  | -0.07  |
+--------+--------+--------+--------+
Größte Belohnungsänderung: 0.054

Iteration 4
+--------+--------+--------+--------+
