In [9]:
import gym
import numpy as np
import time, pickle, os

In [10]:
env = gym.make('FrozenLake-v1')

epsilon = 0.9
total_episodes = 10000
max_steps = 100

lr_rate = 0.81
gamma = 0.96
np.random.seed(0) 
Q = np.zeros((env.observation_space.n, env.action_space.n))

In [3]:
def choose_action(state):
    action=0
    if np.random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
    return action

def learn(state, state2, reward, action):
    predict = Q[state, action]
    target = reward + gamma * np.max(Q[state2, :])
    Q[state, action] = Q[state, action] + lr_rate * (target - predict)

# Start
np.random.seed(0)
for episode in range(total_episodes):
    state = env.reset()
    state = state[0]
    t = 0
    
    while t < max_steps:
        env.render()
        action = choose_action(state)  
        state2,reward,done,_,_ = env.step(action)  
        learn(state, state2, reward, action)

        state = state2

        t += 1
       
        if done:
            break

        # time.sleep(0.1)

print(Q)

with open("frozenLake_qTable.pkl", 'wb') as f:
    pickle.dump(Q, f)

  logger.warn(


[[6.89382234e-01 6.95948954e-01 6.69487331e-01 6.68425889e-01]
 [5.42273760e-01 1.76363725e-02 4.78928989e-01 6.47338376e-01]
 [6.27344830e-01 2.04929739e-01 6.50261965e-01 5.19792256e-01]
 [1.02882524e-01 1.10321522e-01 2.10909228e-02 5.40551832e-01]
 [7.24886772e-01 5.89606221e-01 7.10355329e-01 5.83137896e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.18830930e-01 2.45026126e-02 3.31295037e-03 5.08194657e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.91017914e-01 1.41647847e-01 7.12627947e-01 7.48488400e-01]
 [1.48054036e-01 7.91120281e-01 8.28175333e-01 7.25974949e-01]
 [9.22599912e-01 3.45908133e-02 4.86406299e-01 8.60648576e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.99877292e-01 7.91230072e-01 8.20196687e-01 8.03252281e-01]
 [9.54230886e-01 9.73142225e-01 9.35682217e-01 9.08143681e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

In [4]:
Q_Normal = Q.copy()

In [5]:
import pandas as pd
class Qlearning():
    
    def __init__(self, actions, learning_rate=0.01, reward_decay=1, e_greedy=0.1):
        self.actions = actions  # a list
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))
    
    def choose_action(self,observation):
        # self.check_state_exist(observation)
        
        if np.random.uniform(0,1) < 1 - self.epsilon:
            # state_action = self.q_table[observation,:]
            # action = np.random.choice(state_action[state_action == np.max(state_action)])
            action = np.argmax(self.q_table[observation])
        else:
            action = env.action_space.sample()       
        return action

    def learn(self,observationNow,action,rewards,nextState):
        # self.check_state_exist(nextState)
        
        # print(self.q_table)
        # print(observationNow.id)
        # print(observationNow,action)
        q_predict = self.q_table[observationNow,int(action)]
        q_target = rewards + self.gamma * np.max(self.q_table[nextState,:])
        
        self.q_table[observationNow,int(action)] += self.lr * (q_target - q_predict)
    
    def check_state_exist(self,state):
        
        if state not in self.q_table.index:
            self.q_table =  self.q_table.append(
                pd.Series(
                    [0]*len(self.actions),
                    index = self.q_table.columns,
                    name = state
                )
            )

In [6]:
RL = Qlearning(actions=list(range(4)),learning_rate=lr_rate,reward_decay=gamma)
Q = np.zeros((env.observation_space.n, env.action_space.n))

In [7]:
np.random.seed(0)
for episode in range(total_episodes):
    state = env.reset()
    state = state[0]
    t = 0

    while t < max_steps:
        env.render()

        action = choose_action(state)
        state2,reward,done,_,_ = env.step(action)
        # learn(state, state2, reward, action)
        RL.learn(state,action,reward,state2)

        state = state2

        t += 1
       
        if done:
            break

        # time.sleep(0.1)

print(Q)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [8]:
RL.q_table

array([[0.67428556, 0.68877163, 0.68188873, 0.68352576],
       [0.73207457, 0.76053205, 0.7392512 , 0.77527656],
       [0.79444207, 0.72930745, 0.80732302, 0.74252013],
       [0.58301923, 0.72266176, 0.02442344, 0.68693882],
       [0.71952087, 0.67934634, 0.1257229 , 0.67465131],
       [0.        , 0.        , 0.        , 0.        ],
       [0.86835786, 0.00486486, 0.84713083, 0.66526705],
       [0.        , 0.        , 0.        , 0.        ],
       [0.57863469, 0.00548403, 0.75377524, 0.71238434],
       [0.79710694, 0.83901451, 0.82723873, 0.7475503 ],
       [0.81504071, 0.81765892, 0.15709979, 0.11339934],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.13324377, 0.16872018, 0.86092903, 0.65535539],
       [0.87992346, 0.95584234, 0.96683884, 0.87642294],
       [0.        , 0.        , 0.        , 0.        ]])

In [None]:
if 1 in RL.q_table.index:
    print(RL.q_table.loc[1])