In [None]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os

In [None]:
class ReplayBuffer():
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) transitions.

    Once ``max_size`` transitions have been stored, new transitions overwrite
    the oldest ones.
    """

    def __init__(self, max_size, ip_dims, n_acts):
        """Allocate zero-filled storage.

        Args:
            max_size: Maximum number of transitions kept.
            ip_dims: Iterable giving the shape of a single observation.
            n_acts: Number of components in an action vector.
        """
        self.max_size = max_size
        # Total number of transitions ever stored (keeps growing past max_size).
        self.mean_cntr = 0

        self.state_memory = np.zeros((self.max_size, *ip_dims))
        self.new_state_memory = np.zeros((self.max_size, *ip_dims))
        self.action_memory = np.zeros((self.max_size, n_acts))
        self.reward_memory = np.zeros(self.max_size)

        # Use the builtin ``bool``: the ``np.bool`` alias was removed in
        # NumPy 1.24 and raises AttributeError there.
        # The attribute name (including its spelling) is kept unchanged so
        # existing code that reads it keeps working.
        self.terminal_mmeory = np.zeros(self.max_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mean_cntr % self.max_size

        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.terminal_mmeory[index] = done
        self.reward_memory[index] = reward
        self.action_memory[index] = action

        self.mean_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly at random.

        Indices are drawn with replacement from the filled part of the buffer,
        so a batch may contain duplicates and may be larger than the number of
        stored transitions.

        Returns:
            Tuple ``(state, action, reward, state_, dones)`` of NumPy arrays,
            each with ``batch_size`` as the leading dimension.
        """
        max_mem = min(self.mean_cntr, self.max_size)
        batch = np.random.choice(max_mem, batch_size)

        state = self.state_memory[batch]
        state_ = self.new_state_memory[batch]
        action = self.action_memory[batch]
        reward = self.reward_memory[batch]
        dones = self.terminal_mmeory[batch]

        return state, action, reward, state_, dones

class Critic(nn.Module):
    """Q-network: maps a (state, action) pair to a single scalar value.

    The state and action are concatenated along dim 1 and passed through two
    ReLU hidden layers (400 and 300 units) followed by a linear output unit.
    The module owns its Adam optimizer and moves itself to CUDA device 0 when
    available, otherwise to the CPU.
    """

    def __init__(self, beta, input_dims, n_actions,name):
        """
        Args:
            beta: Learning rate for the Adam optimizer.
            input_dims: Observation shape; only ``input_dims[0]`` is used.
            n_actions: Number of action components.
            name: Label stored on the instance as ``self.name``.
        """
        super().__init__()

        self.ip_dims = input_dims
        self.name = name
        self.n_acts = n_actions

        # Width of the concatenated [state, action] input vector.
        joint_width = self.ip_dims[0] + n_actions
        self.fc1 = nn.Linear(joint_width, 400)
        self.fc2 = nn.Linear(400, 300)
        self.q1 = nn.Linear(300, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)

        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state, action):
        """Return the value estimate for each (state, action) row."""
        joint = T.cat([state, action], dim=1)
        hidden = F.relu(self.fc1(joint))
        hidden = F.relu(self.fc2(hidden))
        return self.q1(hidden)
    
        

class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action in [-1, 1].

    Two ReLU hidden layers (400 and 300 units) feed a linear layer whose
    output is squashed with tanh. The module owns its Adam optimizer and moves
    itself to CUDA device 0 when available, otherwise to the CPU.
    """

    def __init__(self, alpha, input_dims, n_actions, name):
        """
        Args:
            alpha: Learning rate for the Adam optimizer.
            input_dims: Observation shape, unpacked into the first layer's
                input size.
            n_actions: Number of action components produced.
            name: Label stored on the instance as ``self.name``.
        """
        super().__init__()

        self.ip_dims = input_dims
        self.name = name
        self.n_acts = n_actions

        self.fc1 = nn.Linear(*self.ip_dims, 400)
        self.fc2 = nn.Linear(400, 300)
        self.mu = nn.Linear(300, self.n_acts)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)

        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the tanh-bounded action for the given state(s)."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return T.tanh(self.mu(hidden))



In [None]:
class Agent():
    """TD3-style agent: one actor, two critics, and a target copy of each.

    The critics are trained on every call to :meth:`learn`; the actor and the
    target networks are updated only every ``update_actor_interval`` calls.
    """

    def __init__(self, alpha, beta, ip_dims, tau, env,
            gamma=0.99, update_actor_interval=2, warmup=1000,
            n_actions=2, max_size=1000000, layer1_size=400,
            layer2_size=300, batch_size=100, noise=0.1):
        """
        Args:
            alpha: Actor learning rate.
            beta: Critic learning rate.
            ip_dims: Observation shape.
            tau: Soft-update coefficient for the target networks.
            env: Environment exposing ``action_space.high`` / ``.low``.
            gamma: Discount factor.
            update_actor_interval: Actor/target update period, counted in
                calls to :meth:`learn` that performed a critic update.
            warmup: Number of initial ``choose_action`` calls that return
                pure noise instead of querying the actor.
            n_actions: Number of action components.
            max_size: Replay buffer capacity.
            layer1_size, layer2_size: Accepted for interface compatibility but
                not used here; they are not forwarded to the networks.
            batch_size: Minibatch size used by :meth:`learn`.
            noise: Standard deviation of the Gaussian exploration noise.
        """
        self.gamma = gamma
        self.tau = tau

        self.max_action = env.action_space.high
        self.min_action = env.action_space.low

        self.memory = ReplayBuffer(max_size, ip_dims, n_actions)
        self.batch_size = batch_size
        self.learning_step_ctr = 0
        self.time_step = 0
        self.n_actions = n_actions

        self.actor_itr = update_actor_interval
        self.warmup = warmup

        self.actor = Actor(alpha, ip_dims, n_actions=n_actions, name='actor')
        self.critic_1 = Critic(beta, ip_dims, n_actions=n_actions, name='critic_1')
        self.critic_2 = Critic(beta, ip_dims, n_actions=n_actions, name='critic_2')

        self.target_actor = Actor(alpha, ip_dims, n_actions=n_actions, name='target_actor')
        self.target_critic_1 = Critic(beta, ip_dims, n_actions=n_actions, name='target_critic_1')
        self.target_critic_2 = Critic(beta, ip_dims, n_actions=n_actions, name='target_critic_2')

        self.noise = noise

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network(tau=1)

    def _action_bounds(self):
        """Return (low, high) as Python floats.

        Only the first component of the environment's bounds is used, i.e.
        every action dimension is clamped to the same interval.
        """
        return float(self.min_action[0]), float(self.max_action[0])

    def _gaussian(self, scale, shape, device):
        """Sample N(0, scale) noise of ``shape`` as a float32 tensor on ``device``."""
        sample = np.random.normal(scale=scale, size=shape)
        return T.tensor(sample, dtype=T.float, device=device)

    def choose_action(self, observation):
        """Return a noisy, clamped action for ``observation`` as a NumPy array.

        During the first ``warmup`` calls the base action is Gaussian noise;
        afterwards it is the actor's output. Exploration noise is then added
        independently to each action component.
        """
        device = self.actor.device

        if self.time_step < self.warmup:
            # Created as float32 on the actor's device so the addition below
            # never mixes dtypes or devices.
            mu = self._gaussian(self.noise, (self.n_actions,), device)
        else:
            state = T.tensor(observation, dtype=T.float).to(device)
            # Acting needs no autograd graph.
            with T.no_grad():
                mu = self.actor.forward(state)

        # One independent noise sample per action component.
        mu_prime = mu + self._gaussian(self.noise, (self.n_actions,), device)

        low, high = self._action_bounds()
        mu_prime = T.clamp(mu_prime, low, high)

        self.time_step += 1

        return mu_prime.cpu().detach().numpy()

    def remember(self, state,action, reward, state_, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def learn(self):
        """Run one training step; does nothing until a full batch is stored."""
        if self.memory.mean_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = \
                self.memory.sample_buffer(self.batch_size)

        device = self.critic_1.device
        reward = T.tensor(reward, dtype=T.float).to(device)
        done = T.tensor(done, dtype=T.bool).to(device)
        state_ = T.tensor(new_state, dtype=T.float).to(device)
        state = T.tensor(state, dtype=T.float).to(device)
        action = T.tensor(action, dtype=T.float).to(device)

        low, high = self._action_bounds()

        # The TD target is a constant with respect to the online critics, so
        # it is built without tracking gradients. This avoids back-propagating
        # through the three target networks and accumulating .grad on them.
        with T.no_grad():
            target_actions = self.target_actor.forward(state_)
            # Target policy smoothing: clipped noise sampled independently for
            # every sample and action component in the batch.
            smoothing = self._gaussian(0.2, tuple(target_actions.shape), device)
            target_actions = target_actions + T.clamp(smoothing, -0.5, 0.5)
            target_actions = T.clamp(target_actions, low, high)

            q1_ = self.target_critic_1.forward(state_, target_actions)
            q2_ = self.target_critic_2.forward(state_, target_actions)

            # No bootstrapping from terminal next-states.
            q1_[done] = 0.0
            q2_[done] = 0.0

            # Use the smaller of the two target estimates.
            critic_value_ = T.min(q1_.view(-1), q2_.view(-1))

            target = reward + self.gamma*critic_value_
            target = target.view(self.batch_size, 1)

        q1 = self.critic_1.forward(state, action)
        q2 = self.critic_2.forward(state, action)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()

        q1_loss = F.mse_loss(q1, target)
        q2_loss = F.mse_loss(q2, target)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.learning_step_ctr += 1

        # Delayed policy update: skip the actor/targets on most steps.
        if self.learning_step_ctr % self.actor_itr != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state))
        actor_loss = -T.mean(actor_q1_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network()

    def update_network(self, tau=None):
        """Blend online parameters into the targets.

        Each target parameter becomes ``tau*online + (1-tau)*target``.

        Args:
            tau: Blend coefficient; defaults to ``self.tau``. ``tau=1`` copies
                the online networks into the targets.
        """
        if tau is None:
            tau = self.tau

        pairs = (
            (self.critic_1, self.target_critic_1),
            (self.critic_2, self.target_critic_2),
            (self.actor, self.target_actor),
        )

        # In-place update under no_grad: no cloned parameter dicts and no
        # autograd graph are created.
        with T.no_grad():
            for online, target in pairs:
                online_params = dict(online.named_parameters())
                for name, target_param in target.named_parameters():
                    blended = tau*online_params[name] + (1-tau)*target_param
                    target_param.copy_(blended)

In [None]:
import gym 
import numpy as np
def plotLearning(scores, filename, x=None, window=5):
    """Plot a trailing running average of ``scores`` and save it to ``filename``.

    Args:
        scores: Sequence of per-game scores.
        filename: Destination passed to ``Figure.savefig``. When it is a path,
            its parent directory is created if missing.
        x: Optional x-axis values; defaults to ``0 .. len(scores)-1``.
        window: Look-back distance. Point ``t`` is the mean of
            ``scores[max(0, t-window) : t+1]``, i.e. up to ``window + 1``
            values.
    """
    # pyplot is not imported anywhere else in this notebook, so without this
    # import the function fails with NameError on first use.
    import matplotlib.pyplot as plt

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = np.mean(scores[max(0, t-window):(t+1)])
    if x is None:
        x = list(range(N))

    # savefig does not create missing directories.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(filename))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Draw on a fresh figure so repeated calls do not overlay their curves on
    # whatever figure pyplot currently considers active.
    fig, ax = plt.subplots()
    ax.set_ylabel('Score')
    ax.set_xlabel('Game')
    ax.plot(x, running_avg)
    fig.savefig(filename)
    
# Training script: runs n_games episodes of LunarLanderContinuous-v2 with the
# agent, learning after every environment step, then plots the score history.
if __name__ == '__main__':
    env = gym.make('LunarLanderContinuous-v2')

    agent = Agent(alpha=0.001, beta=0.001,
            ip_dims=env.observation_space.shape, tau=0.005,
            env=env, batch_size=100, layer1_size=400, layer2_size=300,
            n_actions=env.action_space.shape[0])

    n_games = 1000
    file_name = 'plots/'+'LunarLunderContinuous_'+' '+str(n_games)+'_games.png'

    # Create the output directory up front so a missing folder cannot fail the
    # run after all games have already been played.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    best_score = env.reward_range[0]

    score_hist = []

    #agent.load_model()
    for i in range(n_games):

        observation = env.reset()
        done = False
        score = 0

        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.remember(observation, action, reward, observation_, done)
            agent.learn()

            score += reward

            observation = observation_

        score_hist.append(score)

        # Average over (at most) the 100 most recent games.
        avg_score = np.mean(score_hist[-100:])

        if avg_score > best_score:
            best_score = avg_score
            #agent.save_model()
        print('episode:', i, 'score: %.2f' %score, 'avg_score: %.2f' %avg_score)

    # The path variable is ``file_name``; the previous ``filename`` was
    # undefined and raised NameError only after training had finished.
    plotLearning(score_hist, filename=file_name, window=50)