#### Create the env without installation and any error: 
- `conda create -n DS_HW4_env -c hcc -c conda-forge python=3.6 gym pybox2d`

In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torch.distributions import Categorical

In [2]:
class ActorCritic(nn.Module):
    
    def __init__(self, action_space, observation_space):
        
        super(ActorCritic, self).__init__()
        
        self.affine = nn.Linear(observation_space, 128)
        self.action_layer = nn.Linear(128, action_space)
        self.value_layer = nn.Linear(128, 1)
        
        self.log_probs = list()
        self.state_values = list()
        self.rewards = list()

    def forward(self, state):
        
        state = torch.from_numpy(state).float()
        state = self.affine(state)
        state = F.relu(state)
        
        state_value = self.value_layer(state)
        
        action_probs = self.action_layer(state)
        action_probs = F.softmax(action_probs)
        action_probs = Categorical(action_probs)
        action = action_probs.sample()
        
        self.log_probs.append(action_probs.log_prob(action))
        self.state_values.append(state_value)
        
        return action.item()
    
    def calculate_loss(self, gamma):
        
        rewards = list()
        discounted_reward = 0
        for reward in self.rewards[::-1]:
            discounted_reward = reward + gamma * discounted_reward
            rewards.insert(0, discounted_reward)
                
        # normalize rewards
        rewards = torch.tensor(rewards, dtype=torch.float32)
        rewards = (rewards - rewards.mean()) / (rewards.std())
        
        loss = 0
        for log_prob, value, reward in zip(self.log_probs, self.state_values, rewards):
            advantage = reward - value.item()
            action_loss = -log_prob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss = loss + action_loss + value_loss
            
        return loss
    
    def clear_memory(self):
        
        del self.log_probs[:]
        del self.state_values[:]
        del self.rewards[:]

In [3]:
def test(model_trained):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic(action_space, observation_space)
    policy.load_state_dict(torch.load(model_trained))

    state = env.reset()
    total_rewards = 0
    for i in range(1000):
        action = policy(state)
        state, reward, finish, _ = env.step(action)
        total_rewards += reward
        env.render()
        img = env.render(mode='rgb_array')
        img = Image.fromarray(img)
        img.save('./gif/{}.jpg'.format(i))
            
        if finish:
            break
                
    print('reward: {}'.format(total_rewards))
        
    env.close()

In [4]:
# set hyperparameters
gamma = 0.99
lr = 0.005
random_seed = 999

torch.manual_seed(random_seed)

<torch._C.Generator at 0x1e1ecd61d10>

In [5]:
env = gym.make('LunarLander-v2')
env.seed(random_seed)

action_space = env.action_space.n
observation_space = env.observation_space.shape[0]

In [6]:
# initialization
policy = ActorCritic(action_space, observation_space)
optimizer = optim.Adam(policy.parameters(), lr=lr)

print('=== start training ===')

total_rewards = 0
for episode in range(5000):
    state = env.reset() # first observation of this episode
    
    for i in range(1000):
        action = policy(state) # choose action
        state, reward, finish, _ = env.step(action) # get next state
        policy.rewards.append(reward)
        total_rewards += reward
        
        if finish:
            break
                    
    # update policy
    optimizer.zero_grad()
    loss = policy.calculate_loss(gamma)
    loss.backward()
    optimizer.step()        
    policy.clear_memory()
        
    # save the model and output result images if average reward > 250
    if total_rewards > 5000:
        torch.save(policy.state_dict(), 'model.pth')
        print('=== start testing ===')
        test('model.pth')
        break
        
    if episode % 20 == 0:
        total_rewards /= 20
        print('episode {}\treward: {}'.format(episode, total_rewards))
        
        total_rewards = 0

=== start training ===




episode 0	reward: -3.623541367590081
episode 20	reward: -198.38652571980262
episode 40	reward: -237.28594451889518
episode 60	reward: -140.69476881770723
episode 80	reward: -113.50247049259376
episode 100	reward: -107.7351451199002
episode 120	reward: -99.7934619501124
episode 140	reward: -114.36156754171911
episode 160	reward: -88.46699121784158
episode 180	reward: -139.75554662908024
episode 200	reward: -52.607110109417235
episode 220	reward: -88.85572418767765
episode 240	reward: -24.326902219236523
episode 260	reward: -38.81775222323289
episode 280	reward: -27.45102829376715
episode 300	reward: -67.18412237415534
episode 320	reward: 0.2179619036713259
episode 340	reward: 25.956794690936572
episode 360	reward: 6.098565557523329
episode 380	reward: 10.667258308481093
episode 400	reward: -4.8250284922650515
episode 420	reward: 14.097604036575834
episode 440	reward: 23.274690445391048
episode 460	reward: -30.134386325358385
episode 480	reward: -29.192558991778903
episode 500	reward: 33