# 1 Understanding MDP's

## 1.1 Chess
State space: 8x8 spaces

action space: 8x8xN<br>
King, Queen, Bishop/Rook: Linear movements in 8 directions * max. 7 steps per direction. <br>
Knight movement: 8 possibilities. <br>
Pawn movements: 3 * Pawn promotions: 3<br>
N = 73 <br>
Policy/reward: Beat the king of the opponent.

## 1.2 LunarLander
State space: S = { coordinate x, coordinate y, linear velocity in x, linear velocity in y, angular velocity, leg one, leg two} <br>
S = {[-90, 90], [-90, 90], [-5, 5], [-5, 5], [-3.1415, 3.1415], [0,1], [0,1]}<br>
Action space: 4 (do nothing, fire left, fire rigfht, fire main)<br>
Policy/reward: Land the spaceship on the landing pad.

## 1.3 Model Based RL
Reward function: Expected reward for r(t+1) given the state and action. <br>
State transition function: Probability of state s' when taking action a in state s. <br>
Two examples: <br>
Discuss: No, they are not generally known. The agent has to learn these functions by taking actions and observing the states and rewards.

# 2 Implementing a GridWorld

## 2.1 Links:
1. https://towardsdatascience.com/reinforcement-learning-implement-grid-world-from-scratch-c5963765ebff
2. https://medium.com/mlearning-ai/applying-reinforcement-learning-algorithms-to-solve-gridworld-problems-29998406dd75
3. https://notebook.community/spro/practical-pytorch/reinforce-gridworld/reinforce-gridworld

In [6]:
import numpy as np

# inspired by: https://github.com/MJeremy2017/reinforcement-learning-implementation/blob/master/GridWorld/gridWorld.py
BOARD_ROWS = 3
BOARD_COLS = 5
# states = 3x5
WIN_STATE = (0, 2)
LOSE_STATE_I = (1, 0)
LOSE_STATE_II = (1, 4)
START = (2, 2)
WALL = (1, 2)
DETERMINISTIC = True

class GridWorld(): 
    def __init__(self, state=START):
        # Initialize GridWorld board
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])

        self.state = state
        self.det = DETERMINISTIC
        
    def giveReward(self):
        if self.state == WIN_STATE:
            return 1
        elif (self.state == LOSE_STATE_I) or (self.state == LOSE_STATE_II):
            return -1
        else:
            return 0

    def nextPosition(self, action):
        """
        action: up, down, left, right
        return: next position
        """
        if self.det:
            if action == 0:
                nextState = (self.state[0] - 1, self.state[1])
            elif action == 1:
                nextState = (self.state[0] + 1, self.state[1])
            elif action == 2:
                nextState = (self.state[0], self.state[1] - 1)
            else: 
                nextState = (self.state[0], self.state[1] + 1)
            # if next state legal aka. if field is free
            if (nextState[0] >= 0) and (nextState[0] <= (BOARD_ROWS -1)):  
                if (nextState[1] >= 0) and (nextState[1] <= (BOARD_COLS -1)):
                    if nextState != WALL:
                        return nextState
            return self.state

    def showBoard(self):
        """Show the GridWorld playfield in ASCII art"""
        self.board[self.state] = 1
        # Wall / barrier
        self.board[WALL] = -1
        # Win / Lose states
        self.board[WIN_STATE] = 2
        self.board[LOSE_STATE_I] = -2
        self.board[LOSE_STATE_II] = -2
        for i in range(0, BOARD_ROWS):
            print('-----------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                # Player
                if self.board[i, j] == 1:
                    token = '*'
                # Wall / barrier
                if self.board[i, j] == -1:
                    token = 'z'
                # Free fields
                if self.board[i, j] == 0:
                    token = '0'
                # Win state
                if self.board[i, j] == 2:
                    token = 'W'
                # Lose state
                if self.board[i, j] == -2:
                    token = 'L'
                out += token + ' | '
            print(out)
        print('-----------------')

GW = GridWorld()

In [11]:
class Agent():
    def __init__(self):
        self.GW = GridWorld()
        self.end = False
        #self.action_list = ["up", "down", "left", "right"]

        self.epsilon = 1
        self.decay = 0.999

        self.returns = {}
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                for k in range(4):
                    self.returns[(i, j), k] = [0,0] 

        self.q_values = np.random.uniform(size=(3,5,4))
    
    # state, action, reward, next_state
    def policy(self, epsilon, state):
        if np.random.rand() > epsilon:
          action = np.argmax(self.q_values[state])
        else:
          action = np.random.randint(0,4)
        return action

    def makeAct(self):
        self.action = self.policy(self.epsilon, self.GW.state)

        self.GW.state = self.GW.nextPosition(self.action)
        self.reward = self.GW.giveReward()
        if self.reward==1 or self.reward==-1:
            # End game
            self.end = True
        #self.q_values[self.GW.state]
    
    def reset(self):
        self.GW = GridWorld()
        self.end = False

    def play(self):
        i = 0
        # Create empty list to safe states where the agent was in this episode
        self.state_action_list = []

        # Create empty dictionary to safe MC-estimates
        self.erg = {}

        while self.end==False:
            last_state = self.GW.state
            self.makeAct()

            self.state_action_list.append((last_state, self.action))
            self.epsilon *= self.decay
            #self.state_list.append(last_state, self.action, self.GW.state)
            i += 1

        # Remove duplicates from list of states
        self.state_ation_list = list(dict.fromkeys(self.state_action_list))

        for s,a in reversed(self.state_action_list):
            self.returns[s, a][0] += self.reward
            self.returns[s, a][1] += 1
        
        for s,a in self.returns:
            try:
                # Safe MC-estimates per k-many visits in state to result dictionary
                self.q_values[s[0],s[1],a] = round(self.returns[s,a][0]/self.returns[s,a][1],3)
            except:
                # Prevent division by 0
                self.q_values[s[0],s[1],a] = 0

        # Useful for Bugfixing
        print('Game end, reward', self.reward)
        print('Turns:',i)
        print("---------------------")
        print(self.q_values)
        print("---------------------")
        self.reset()

    def showValues(self):
        """Show MC-estimates per state aka. field in GridWorld"""
        for i in range(0, BOARD_ROWS):
            print('----------------------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                out += str(self.erg[(i, j)]).ljust(6) + ' | '
            print(out)
        print('----------------------------------------------')

ag = Agent()
ag.play()

Game end, reward -1
Turns: 7
---------------------
[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[-1.  0.  0.  0.]
  [ 0.  0. -1. -1.]
  [-1. -1. -1.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]
---------------------


In [72]:
GW.showBoard()

for k in range(1,10001):
    ag.play()
    # Show MC-estimates per 50, 200, 500, 1000, 10000 
    if k in [50,200,500,1000,10000]:
        print("k =",k)
        ag.showValues()

Game end, reward 1
Turns: 10
---------------------
Game end, reward 1
Turns: 17
---------------------
Game end, reward 1
Turns: 21
---------------------
Game end, reward 1
Turns: 10
---------------------
Game end, reward 1
Turns: 8
---------------------
Game end, reward 1
Turns: 15
---------------------
Game end, reward 1
Turns: 5
---------------------
Game end, reward 1
Turns: 18
---------------------
Game end, reward 1
Turns: 10
---------------------
Game end, reward 1
Turns: 11
---------------------
{(0, 0): 0.5, (0, 1): 1.5, (0, 2): 1.7, (0, 3): 1.2, (0, 4): 0.1, (1, 0): -0.2, (1, 1): 1.1, (1, 2): 0.0, (1, 3): 0.9, (1, 4): -0.2, (2, 0): -0.2, (2, 1): 0.9, (2, 2): 12.2, (2, 3): 0.7, (2, 4): 0.0}
