In [1]:
import numpy as np
import gym, random, pickle

In [2]:
class mcar_env:
    """Tabular (discretised) wrapper around gym's ``MountainCar-v0``.

    The continuous observation ``(position, velocity)`` is mapped to a pair of
    bin indices so that a dictionary-based Q-table can be used on top of it.

    Parameters
    ----------
    n_bins : int, default 20
        Number of bins per observation dimension. ``n_bins - 1`` evenly spaced
        edges are laid over [-1.2, 0.6] for the position and [-0.07, 0.07] for
        the velocity; values beyond the outer edges fall into the first or
        last bin. The default reproduces the original 20 x 20 grid.
    max_steps : int, default 500
        Value written to the wrapped environment's ``_max_episode_steps``.
    env : object, optional
        Already constructed environment to wrap. When omitted,
        ``gym.make('MountainCar-v0')`` is used.

    Attributes
    ----------
    actions : list of int
        The discrete actions ``[0, 1, 2]``.
    states : list of tuple
        Every ``(pos_idx, vel_idx)`` pair the discretisation can produce.
    """

    def __init__(self, n_bins=20, max_steps=500, env=None):
        if n_bins < 1:
            raise ValueError("n_bins must be at least 1")

        self.env = gym.make('MountainCar-v0') if env is None else env
        # NOTE(review): assumes the wrapped object reads `_max_episode_steps`
        # itself (i.e. it is gym's time-limit wrapper) - confirm for the
        # installed gym version.
        self.env._max_episode_steps = max_steps

        # n_bins - 1 edges split each axis into n_bins bins (indices 0..n_bins-1).
        self.pos_space = np.linspace(-1.2, 0.6, n_bins - 1)
        self.vel_space = np.linspace(-0.07, 0.07, n_bins - 1)

        self.actions = [0, 1, 2]
        self.states = [(i, j) for i in range(n_bins) for j in range(n_bins)]

    def get_agent_state(self, obs):
        """Convert a raw ``(position, velocity)`` observation to bin indices.

        Returns a tuple of two plain Python ints so it matches the keys in
        ``self.states`` exactly.
        """
        pos, vel = obs
        pos_num = int(np.digitize(pos, self.pos_space))
        vel_num = int(np.digitize(vel, self.vel_space))
        return (pos_num, vel_num)

    def reset(self):
        """Reset the wrapped environment and return the discretised start state."""
        out = self.env.reset()
        # Newer gym releases return ``(obs, info)``; older ones return ``obs`` only.
        if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
            out = out[0]
        return self.get_agent_state(out)

    def render(self):
        """Forward to the wrapped environment's ``render``."""
        return self.env.render()

    def step(self, action):
        """Advance one step; return ``(state, reward, done, info)``.

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` step conventions; in the
        latter case ``done`` is ``terminated or truncated``.
        """
        out = self.env.step(action)
        if len(out) == 5:
            sp, r, terminated, truncated, info = out
            d = bool(terminated or truncated)
        else:
            sp, r, d, info = out
        return self.get_agent_state(sp), r, d, info

    def close(self):
        """Forward to the wrapped environment's ``close``."""
        return self.env.close()
    
    
    
        



In [6]:
# Build the discretised MountainCar wrapper shared by the cells below.
new_env = mcar_env()

In [4]:
# Sanity check: play one episode with uniformly random actions and print its return.
d = False
new_env.reset()
total_r = 0
try:
    while not d:
        a = random.randint(0,2)  # inclusive bounds: draws from {0, 1, 2}
        # Named throwaways instead of `_`, which IPython reserves for the last output.
        _state, r, d, _info = new_env.step(a)
        new_env.render()
        total_r += r
finally:
    # Close the environment even if the rollout raises or is interrupted.
    new_env.close()
print(total_r)
    

-500.0


In [3]:
class Qagent:
    """Tabular Q-value agent with an epsilon-greedy policy.

    Parameters
    ----------
    env : object
        Environment exposing ``actions`` and ``states`` plus the ``reset``,
        ``step``, ``render`` and ``close`` methods used by :meth:`test`.

    Attributes
    ----------
    Q : dict
        Maps ``(state, action)`` to its current value estimate; every entry
        starts at 0.
    """

    def __init__(self, env):
        self.env = env
        self.actions = self.env.actions
        self.states = self.env.states
        self.Q = {(state, a): 0 for state in self.states for a in self.actions}

    def action(self, obs, ep=0):
        """Pick an action for state ``obs``.

        With probability ``ep`` a uniformly random action is returned,
        otherwise the action with the largest Q-value (first one on ties).
        """
        coin = np.random.rand()
        if coin < ep:
            return np.random.choice(self.actions)
        q_values = [self.Q[(obs, a)] for a in self.actions]
        # argmax is a position inside self.actions; map it back to the action
        # itself so the result stays correct when actions are not 0..n-1.
        return self.actions[int(np.argmax(q_values))]

    def test(self, test_num, render_op=False):
        """Run ``test_num`` greedy episodes and return the mean episode reward.

        Parameters
        ----------
        test_num : int
            Number of evaluation episodes; must be at least 1.
        render_op : bool, default False
            When true, render every step, print each episode's reward and
            close the environment afterwards.

        Raises
        ------
        ValueError
            If ``test_num`` is smaller than 1 (the mean would be undefined).
        """
        if test_num < 1:
            raise ValueError("test_num must be a positive integer")

        reward_sum = 0
        try:
            for j in range(test_num):
                total_r = 0
                done = False
                s = self.env.reset()

                while not done:
                    if render_op:
                        self.env.render()
                    a = self.action(s)
                    s, r, done, _ = self.env.step(a)
                    total_r += r

                reward_sum += total_r
                if render_op:
                    self.env.render()
                    print("{}-th episode reward : {}".format(j+1, total_r))
        finally:
            # Only close when this call rendered something: the environment is
            # shared, and a silent evaluation must not close it under a caller
            # that keeps stepping it afterwards.
            if render_op:
                self.env.close()

        return reward_sum / test_num

    def saveQ(self, name):
        """Pickle the Q-table to ``name + '.pkl'``."""
        # The context manager closes the file even if pickling fails.
        with open(name + ".pkl", "wb") as f:
            pickle.dump(self.Q, f)
        print("Q-values are saved")

    def loadQ(self, name):
        """Replace the Q-table with the one pickled in ``name + '.pkl'``."""
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(name + '.pkl', 'rb') as f:
            self.Q = pickle.load(f)
        print("Q-values aer loaded")
        


    

In [6]:
# Fresh agent on the shared environment; its Q-table starts at all zeros.
Agent = Qagent(new_env)

In [7]:
# Baseline: mean reward of the greedy policy over 3 episodes with the all-zero Q-table.
Agent.test(3)

-500.0

In [7]:
class trainer:
    """Q-learning trainer for a tabular agent.

    Parameters
    ----------
    agent : object
        Agent exposing ``env``, ``actions``, a ``Q`` dict keyed by
        ``(state, action)``, ``action(state, ep)`` and ``test(n)``.
    lr : float
        Learning rate of the Q-value update.
    gamma : float
        Discount factor.
    init_ep : float
        Initial exploration rate (epsilon).
    min_ep : float
        Lower bound for epsilon; it is never decayed below this value.
    ep_decay : int
        Epsilon is lowered, and the agent evaluated, every ``ep_decay``
        episodes. Must be non-zero.
    ep_step : float, default 0.1
        Amount subtracted from epsilon at each decay.

    Attributes
    ----------
    reward_history : list
        Total reward of every training episode of the latest ``learning`` call.
    test_history : list
        Average evaluation reward recorded every ``ep_decay`` episodes.
    """

    def __init__(self, agent, lr, gamma, init_ep, min_ep, ep_decay, ep_step=0.1):
        if ep_decay == 0:
            raise ValueError("ep_decay must be non-zero")
        self.agent = agent
        self.lr = lr
        self.gamma = gamma
        self.ep = init_ep
        self.min_ep = min_ep
        self.ep_decay = ep_decay
        self.ep_step = ep_step
        self.reward_history = []
        self.test_history = []

    def learning(self, num_training):
        """Run ``num_training`` episodes of epsilon-greedy Q-learning.

        Updates ``agent.Q`` in place, prints an evaluation line every
        ``ep_decay`` episodes, and closes the environment when finished.
        The per-episode rewards and evaluation averages are kept in
        ``self.reward_history`` and ``self.test_history``. Returns ``None``.
        """
        agent = self.agent
        env = agent.env

        reward_history = []
        test_history = []
        # Expose the histories instead of discarding them when the call ends.
        self.reward_history = reward_history
        self.test_history = test_history

        for j in range(num_training):
            done = False
            total_r = 0
            s = env.reset()

            if self.ep > self.min_ep and (j+1) % self.ep_decay == 0:
                # Clamp: repeated float subtraction (1.0 - 0.1 - 0.1 ...) ends
                # slightly above min_ep and would otherwise decay once more,
                # dropping epsilon below the requested minimum.
                self.ep = max(self.min_ep, self.ep - self.ep_step)

            while not done:
                a = agent.action(s, self.ep)
                sp, r, done, info = env.step(a)
                total_r += r

                # An episode cut off by the time limit is not a real terminal
                # state, so the target should still bootstrap from `sp`.
                # NOTE(review): relies on the env reporting this through
                # info['TimeLimit.truncated']; without that key every `done`
                # is treated as terminal, as before.
                truncated = isinstance(info, dict) and bool(info.get('TimeLimit.truncated', False))
                terminal = done and not truncated

                next_q_max = max(agent.Q[(sp, b)] for b in agent.actions)
                target = r if terminal else r + self.gamma * next_q_max
                agent.Q[(s, a)] += self.lr * (target - agent.Q[(s, a)])
                s = sp

            reward_history.append(total_r)
            if (j+1) % self.ep_decay == 0:
                avr = agent.test(10)
                print("Trained episode so far :{}  average test reward {}".format(j+1, avr))
                test_history.append(avr)

        env.close()

In [8]:
# Re-create the agent so training starts from an all-zero Q-table, then set up
# the trainer: epsilon starts at 1.0 and is lowered every 200 episodes
# (ep_decay) while it is still above min_ep.
Agent = Qagent(new_env)
tt = trainer(agent=Agent, lr=0.05, gamma=0.99,
             init_ep=1.0, min_ep=0.1, ep_decay=200)

In [10]:
# Train for 2600 episodes; a 10-episode greedy evaluation is printed every ep_decay episodes.
tt.learning(2600)

Trained episode so far :200  average test reward -500.0
Trained episode so far :400  average test reward -500.0
Trained episode so far :600  average test reward -136.3
Trained episode so far :800  average test reward -176.8
Trained episode so far :1000  average test reward -189.5
Trained episode so far :1200  average test reward -152.9
Trained episode so far :1400  average test reward -196.9
Trained episode so far :1600  average test reward -187.0
Trained episode so far :1800  average test reward -188.9
Trained episode so far :2000  average test reward -279.0
Trained episode so far :2200  average test reward -269.3
Trained episode so far :2400  average test reward -172.7
Trained episode so far :2600  average test reward -214.2


In [11]:
# Watch 5 rendered greedy episodes; each episode's reward is printed and the mean is returned.
Agent.test(5, True)

1-th episode reward : -209.0
2-th episode reward : -214.0
3-th episode reward : -209.0
4-th episode reward : -248.0
5-th episode reward : -220.0


-220.0

In [None]:
# Uncomment to pickle the current Q-table to 'mcar.pkl' (the file read by Agent.loadQ('mcar')).
#Agent.saveQ('mcar')

In [9]:
# Replace the agent's Q-table with the one stored in 'mcar.pkl'.
Agent.loadQ('mcar')

Q-values aer loaded


In [10]:
# Evaluate the loaded Q-table over 5 rendered greedy episodes.
Agent.test(5, True)

1-th episode reward : -157.0
2-th episode reward : -156.0
3-th episode reward : -184.0
4-th episode reward : -177.0
5-th episode reward : -173.0


-169.4