# Environment

In [2]:
!pip install gym

Collecting gym
  Downloading gym-0.21.0.tar.gz (1.5 MB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py): started
  Building wheel for gym (setup.py): finished with status 'done'
  Created wheel for gym: filename=gym-0.21.0-py3-none-any.whl size=1616831 sha256=cfffa59e863991762d292069beb5c672625f3df8da3121fbee505736512e1520
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\27\6d\b3\a3a6e10704795c9b9000f1ab2dc480dfe7bed42f5972806e73
Successfully built gym
Installing collected packages: gym
Successfully installed gym-0.21.0


In [1]:
import numpy as np
import copy
from gym import Env
import datetime

class FrozenLake(Env):
    def __init__(self,studentNum:int=256, nonStationary = False):
        self.studentNum = studentNum
        self.nonStationary = nonStationary
        
        np.random.seed(self.studentNum)
        self.beginMap = make_map(self.studentNum) #*2
        self.beginMap[self.beginMap>1] = 1
        self.endMap = make_map(self.studentNum + 100)
        
        self.changeDir = self.endMap - self.beginMap
        self.changeDir *= 1/11000

        self.fixedMap = self.beginMap

        np.random.seed(datetime.datetime.now().microsecond)
        
        self.map = copy.deepcopy(self.fixedMap)
        self.time = 0
        self.reset()

    def reset(self):
        self.NSreset()
        if not self.nonStationary:
            self.map = copy.deepcopy(self.fixedMap)
            self.time = 0

        return self.state

    def NSreset(self):
        self.time += 1
        self.map += self.changeDir

        self.map[self.map>0.95]=0.95
        self.map[self.map<0.0]=0.0

        self.state = (0,0)
        self.done = False
        return self.state
    
    def states_transitions(self, state, action):
        x = state[0]
        y = state[1]
        states = np.array([[x,y-1], [x,y+1], [x-1 ,y], [x+1,y] ])


        if action == UP:
            selected = states[2]
        if action == DOWN:
            selected = states[3]
        if action == RIGHT:
            selected = states[1]
        if action == LEFT:
            selected = states[0]

        zero = np.zeros((4,2)).astype(int)
        three = (3 * np.ones((4,2))).astype(int)
        output = np.maximum(np.minimum(states, three),zero)
        output, indices = np.unique(output, axis = 0, return_counts= True)

        
        selected = np.maximum(np.minimum(selected, three[0]), zero[0])
        probs = indices * 0.025
        probs[np.argmax(np.sum(selected == output, axis = 1))] += 0.9

        return list(zip(output[:,0],output[:,1])), probs
    
    def possible_consequences(self,action:int,state_now=None):

        if state_now==None:
            state_now = self.state
        state = [state_now[0],state_now[1]]
        states, probs = self.states_transitions(state, action)
        aa = np.array(states) 
        fail_probs = self.map[(aa[:,0]),(aa[:,1])]
        dones = np.sum(aa == 3, axis = 1) == 2
        return states, probs, fail_probs,dones
    
    def step(self, a:int):
        if not (a in range(4)):
            raise Exception("action is not available!!!")
        
        states, probs, fail_probs,dones = self.possible_consequences(a)
        
        next_idx = np.random.choice(np.arange(len(states)), p = probs)
        next_state = states[next_idx]
        self.state = tuple(next_state)
        
        self.done = dones[next_idx]

        r = -1

        if self.done:
            r += 60
        elif np.random.rand()< fail_probs[next_idx]:
            r -= 15
            self.done = True

        return (self.state, r, self.done, {})

    def render(self,state=None):
        if state == None:
            state = self.state

        out = ""
        for i in range(4):
            out += "\n------------------------------\n| "
            for j in range(4):
                if (i,j) == state:
                    out += "\033[44m{:.3f}\033[0m | ".format(self.map[i,j])
                else :
                    out += "{:.3f} | ".format(self.map[i,j])

        out += "\n------------------------------"
        print(out)

    def environment_states(self):
        env_states = []
        for state_index in range(16):
            s0 = state_index % 4
            s1 = state_index//4
            env_states.append((s0,s1))
        return env_states

        
def set_max_min(var,maximum,minimum):
    return min(max(var,minimum),maximum)

def make_map(studentNum):
    np.random.seed(studentNum)  
    move = np.zeros(6)
    idx = np.random.choice(range(6),size=3,replace=False)
    move[idx] = 1
    point = [0,0]
    lowprobs = [tuple(point)]

    for m in move:
        if m:
            point[0] += 1
        else:
            point[1] += 1
        lowprobs.append(tuple(point))
    
    map = np.random.rand(4,4)
    idx = np.array(lowprobs)

    map[idx[:,0],idx[:,1]] = 0.001 
    map[0,0] = 0.0
    map[3,3] = 0.0 

    return map

## Your Student ID

In [2]:
STUDENT_NUM = 400722156

# HyperParameters

In [3]:
#%% allowed actions
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

ACTIONS = [LEFT,DOWN,RIGHT,UP]

#%% hyperparameters
EPISODES = 10000
EPSILON = 0.1
LEARNING_RATE = 0.1
DISCOUNT = 0.9

## Map of environment

In [4]:
environment = FrozenLake(studentNum=STUDENT_NUM)

print("Environment with fail probabilities :")
environment.render()


Environment with fail probabilities :

------------------------------
| [44m0.000[0m | 0.808 | 0.427 | 0.546 | 
------------------------------
| 0.001 | 0.684 | 0.899 | 0.870 | 
------------------------------
| 0.001 | 0.001 | 0.001 | 0.259 | 
------------------------------
| 0.872 | 0.043 | 0.001 | 0.000 | 
------------------------------


## <h2><font color=indigo> Agent Implementation
Implement your q-learning (off-policy TD) agent here. You need to utilize the step function provided in the Environment class to interact with frozen lake environment.

In [7]:
import sys
import itertools
import random

class Q_Learning:
    def __init__(self, id, environment, discount , learning_rate = 0.1 , epsilon = 0.1 ,episodes=10000):
        self.id = 400722156
        np.random.seed(self.id)
        self.environment = environment
        self.discount = discount
        self.episodes = episodes
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.n_actions = 4
        
        
        self.n_states = len(environment.environment_states())
        self.Q = np.zeros((self.n_states, self.n_actions))
        
        self.state_to_number = self.environment.environment_states()

    def choose_action(self, state):
        # we e-greedy approach for our action choice
       
        if np.random.random() < self.epsilon:
            action = np.random.choice([i for i in range(self.n_actions)])
        else:
            state = self.state_to_number.index(state)
            actions = np.array([self.Q[(state, a)] for a in range(self.n_actions)])
            action = np.argmax(actions)

        return action


    def learn(self, state, action, reward, new_state):
        state = self.state_to_number.index(state)
        new_state = self.state_to_number.index(new_state)
        
        actions = np.array([self.Q[(new_state, a)] for a in range(self.n_actions)])
        
        # the q-learning equation, calculating state, action values
        self.Q[(state, action)] += self.learning_rate * (reward + \
                                    self.discount * \
                                    self.Q[(new_state, np.argmax(actions))] - self.Q[(state, action)])


    def run(self):
        
        for i in range(self.episodes):
            
            state = self.environment.reset()
            done = False

            while not done:

                action = self.choose_action(state)
                new_state, reward, done, _ = self.environment.step(action)
                self.learn(state, action, reward, new_state)
                state = new_state
                
            if i % 5000 == 0:
                print("episode ", i)

        print("finished")
        print()
        print(self.Q)
        
        return self.Q, _

    def policy(self, state):
        state = self.state_to_number.index(state)
        actions = np.array([self.Q[(state, a)] for a in range(self.n_actions)])
        return np.argmax(actions)
    
    def find_best_policy(self):
        
        state_sequence = []
        action_sequence = []

        state = self.environment.reset()
        done = False
        for i in range(10):
            state_sequence.append(state)
            action = self.policy(state)
            action_sequence.append(ACTIONS[action])
            next_state, reward, done, _ = self.environment.step(action)
            state = next_state
            
            self.environment.render(state)
            if done:
                break
        return state_sequence, action_sequence

## <h2><font color=indigo> Q Values
Return the Q values that your agent learns in here:

In [8]:
agent = Q_Learning('Ghazaleh_Mahmoudi', environment, 0.9 , learning_rate = 0.5 , epsilon = 0.1 ,episodes= 10000)
Q , policy = agent.run()

episode  0
episode  5000
finished

[[ 22.54933906  26.74276804  16.40091051  21.39933627]
 [ 24.49798767  30.85131586  25.15554469  23.42738719]
 [ 29.03512516  25.20621412  30.2705394   25.98695389]
 [ 21.35443003  22.62576451  45.16810099  28.19256763]
 [  1.89165374  30.05449433  -3.93478053   3.98112647]
 [ 28.75918169  32.53626122  12.92851318   9.36433128]
 [ 29.56294134  44.01460407  33.30128771  20.9805798 ]
 [ 26.88379903  36.4746089   51.87570039  38.27085577]
 [ -8.          -6.43366301  -8.25        -0.5       ]
 [ -0.37808241  41.44991919  -4.65043702   0.        ]
 [ 37.83753954  27.80989139  27.00511458  21.30523034]
 [ 44.70754012  45.94945405  58.9999999   31.60475767]
 [ -7.2580509    0.           0.           0.        ]
 [  7.44347329   0.           0.           0.        ]
 [ 17.11842416  21.09956298  21.91684221 -15.9921875 ]
 [  0.           0.           0.           0.        ]]


## <h2><font color=darkcyan> Policy
Return the optimal policy that your agent learns in here:

In [13]:
state_sequence, action_sequence = agent.find_best_policy()


------------------------------
| 0.000 | 0.808 | 0.427 | 0.546 | 
------------------------------
| [44m0.001[0m | 0.684 | 0.899 | 0.870 | 
------------------------------
| 0.001 | 0.001 | 0.001 | 0.259 | 
------------------------------
| 0.872 | 0.043 | 0.001 | 0.000 | 
------------------------------

------------------------------
| 0.000 | 0.808 | 0.427 | 0.546 | 
------------------------------
| 0.001 | 0.684 | 0.899 | 0.870 | 
------------------------------
| [44m0.001[0m | 0.001 | 0.001 | 0.259 | 
------------------------------
| 0.872 | 0.043 | 0.001 | 0.000 | 
------------------------------

------------------------------
| 0.000 | 0.808 | 0.427 | 0.546 | 
------------------------------
| 0.001 | 0.684 | 0.899 | 0.870 | 
------------------------------
| 0.001 | [44m0.001[0m | 0.001 | 0.259 | 
------------------------------
| 0.872 | 0.043 | 0.001 | 0.000 | 
------------------------------

------------------------------
| 0.000 | 0.808 | 0.427 | 0.546 | 
------------------

### step sequence to reach the goal

In [8]:
state_sequence

[(0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (3, 2)]

In [9]:
action_translation = {0: "LEFT", 1: "DOWN", 2: "RIGHT", 3: "UP"}

### action sequence to reach the goal

In [14]:
for i in range(len(state_sequence)):
    print("current state: ", state_sequence[i], " action: ", action_translation[action_sequence[i]])

current state:  (0, 0)  action:  DOWN
current state:  (1, 0)  action:  DOWN
current state:  (2, 0)  action:  RIGHT
current state:  (2, 1)  action:  DOWN
current state:  (2, 2)  action:  RIGHT
current state:  (2, 3)  action:  DOWN
