### Project field
- Use Reinforcement Learning with Q-learning to find solutions to this field.
![Field](img/field-2.png "Field")

In [1]:
#load libraries
import numpy as np
import random

In [32]:
#Create the field

class Field:
    """Small grid world used as the environment for Q-learning.

    ``states`` stores the reward of every cell. A cell with a non-zero reward
    is terminal: the two ``-1`` cells in the first column and the ``1`` cell
    at row 1, column 7. ``state`` is the current position as a
    ``(row, column)`` pair of indices into ``states``.

    Action codes: 0 -> left, 1 -> right, 2 -> up, 3 -> down.
    """

    #(row delta, column delta) for every action code, listed in ascending code order
    _MOVES = {0: (0, -1), 1: (0, 1), 2: (-1, 0), 3: (1, 0)}
    #reward handed out when a move would leave the grid
    _ILLEGAL_MOVE_REWARD = -10

    def __init__(self, start=None):
        """Build the grid and place the agent.

        Args:
            start: optional ``(row, column)`` start cell. When omitted the
                start cell is drawn uniformly at random over the whole grid
                (it may therefore be a terminal cell).

        Raises:
            ValueError: if ``start`` lies outside the grid.
        """
        self.states = [[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                       [-1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
                       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
        if start is None:
            #row first, then column: same draw order as a plain Field() always used
            start = (random.randrange(0, len(self.states)),
                     random.randrange(0, len(self.states[0])))
        else:
            start = tuple(start)
            if len(start) != 2 or not self._inside(start[0], start[1]):
                raise ValueError("start must be a (row, column) cell inside the grid, got %r" % (start,))
        self.state = start

    def _inside(self, row, col):
        """Return True when (row, col) is a valid cell of the grid."""
        return 0 <= row < len(self.states) and 0 <= col < len(self.states[0])

    def done(self):
        """Return True when the current cell is terminal, i.e. its reward is non-zero."""
        row, col = self.state
        return self.states[row][col] != 0

    def get_possible_actions(self):
        """Return the action codes that keep the agent on the grid, in ascending order."""
        row, col = self.state
        return [action for action, (d_row, d_col) in self._MOVES.items()
                if self._inside(row + d_row, col + d_col)]

    def update_next_state(self, action):
        """Apply ``action`` and return ``(state, reward)``.

        A move that would leave the grid is illegal: the state is left
        unchanged and the reward is -10. Otherwise the agent moves one cell
        and the reward is the value stored in the new cell.

        Raises:
            ValueError: if ``action`` is not one of 0, 1, 2, 3. Such codes used
                to be treated silently as a free "stay in place" move.
        """
        if action not in self._MOVES:
            raise ValueError("unknown action %r; expected 0 (left), 1 (right), 2 (up) or 3 (down)" % (action,))
        d_row, d_col = self._MOVES[action]
        row, col = self.state
        new_row, new_col = row + d_row, col + d_col
        if not self._inside(new_row, new_col):
            return self.state, self._ILLEGAL_MOVE_REWARD
        self.state = (new_row, new_col)
        return self.state, self.states[new_row][new_col]

In [33]:
#Create a field; its start cell is drawn at random, so the output below differs between runs
field = Field()
#Show the current cell as (row, column), whether it is terminal, and the moves that stay on the grid
field.state, field.done(), field.get_possible_actions()

((0, 3), False, [0, 1, 3])

In [34]:
#Try to move up (action 2); the returned (state, reward) pair is discarded here.
#From a top-row cell the move is illegal and the state stays unchanged; otherwise the agent moves one row up.
field.update_next_state(2)
#Show the cell, terminal flag and legal moves again after the attempted move
field.state, field.done(), field.get_possible_actions()

((0, 3), False, [0, 1, 3])

In [37]:
#Train the model with tabular Q-learning

field = Field()
#Q-table: one value per (row, column, action) triple, every entry starting at 0
q_table = np.zeros((len(field.states), len(field.states[0]), 4))

alpha = .5 #weight of the new estimate in every update
gamma = .5 #discount applied to the best value of the next state
epsilon = .5 #probability of taking a random legal action

for _ in range(10000):
    field = Field() #every episode starts on a fresh field
    while not field.done(): #an episode ends on a terminal cell
        actions = field.get_possible_actions() #moves that stay on the grid
        if random.uniform(0, 1) >= epsilon:
            #exploit: highest-valued action in the table (may be an illegal move, which is answered with -10)
            action = np.argmax(q_table[field.state])
        else:
            #explore: any legal move
            action = random.choice(actions)

        cur_x, cur_y = field.state #cell the move starts from
        (next_x, next_y), reward = field.update_next_state(action) #take the move

        #blend the old value with the reward plus the discounted best value of the next cell
        old_value = q_table[cur_x, cur_y, action]
        target = reward + gamma*np.max(q_table[next_x, next_y])
        q_table[cur_x, cur_y, action] = (1 - alpha)*old_value + alpha*target

In [39]:
#Solve the task: follow the greedy policy of q_table from a random start cell
field = Field() #create a field
n_rows, n_cols = len(field.states), len(field.states[0])
path = np.zeros((n_rows, n_cols)) #sized from the field rather than a hard-coded (3, 11)
path[field.state] = np.nan #mark the start cell with NaN
steps = 1 #step counter; the first move is step 1

#The greedy policy is deterministic, so a walk that ends on a terminal cell never visits a cell twice.
#Needing more moves than there are cells therefore means the policy is stuck
#(it repeats an illegal move or runs in a cycle) and would otherwise loop forever.
max_steps = n_rows * n_cols

while not field.done(): #while not solved
    if steps > max_steps:
        raise RuntimeError("greedy policy did not reach a terminal cell within %d steps; "
                           "the Q-table needs more training" % max_steps)
    action = np.argmax(q_table[field.state]) #best known action for the current cell

    (next_x, next_y), _ = field.update_next_state(action) #take it; the reward is not needed here
    path[next_x][next_y] = steps #record at which step this cell was entered

    steps += 1 #increment steps by 1

In [40]:
#see the path: NaN marks the start cell, a positive number is the step at which that cell was entered,
#and 0 means the cell was never visited
path

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  1., nan,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [None]:
#The training phase could also take only random actions.
#Setting 'epsilon' to 1 does exactly that: the greedy branch is then never taken.