Домашнее задание 2.2

Задача MountainCar

Возьмем тот же код, что и для решения задачи LunarLander.

In [18]:
import torch
from torch import nn 
import torch.nn.functional as F
import numpy as np
import pandas as pd
import gym
import plotly.express as px
import time 
import pickle

In [41]:
class CEMTwoLayer(nn.Module):
    """Policy network for the cross-entropy method with continuous actions.

    The network maps a state vector to ``action_n`` real outputs which are
    squashed into ``[-1, 1]`` with ``tanh``. The policy is fitted by
    regressing its output onto the actions taken in elite trajectories.

    Args:
        state_dim: Size of the state vector fed to the first linear layer.
        action_n: Number of action components produced by the network.
        hidden1: Width of the single hidden layer.
        hidden2: Accepted for interface compatibility only; the second
            hidden layer is not built.
        lr: Learning rate of the Adam optimizer.
        eps: Standard deviation of the Gaussian exploration noise added to
            the action in ``get_action``. ``0`` (the default) keeps the
            policy deterministic.
    """

    def __init__(self, state_dim, action_n, hidden1, hidden2, lr=0.01, eps=0):
        super().__init__()
        self.state_dim = state_dim
        self.action_n = action_n
        self.lr = lr
        self.eps = eps

        self.network = nn.Sequential(
            nn.Linear(self.state_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, action_n),
        )

        # Not used by this class; kept so the attribute stays available.
        self.softmax = nn.Softmax(dim=0)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        # Actions are continuous, so the policy is fitted by regression.
        # CrossEntropyLoss expects class scores; with a single output unit
        # its value is identically zero and no gradient reaches the weights.
        self.loss = nn.MSELoss()

    def forward(self, _input):
        """Return the raw (pre-tanh) network output for ``_input``."""
        return self.network(_input)

    def get_action(self, state):
        """Return the action for ``state`` as a float32 NumPy array.

        The network output is squashed with ``tanh``. When ``self.eps`` is
        non-zero, Gaussian noise with standard deviation ``self.eps`` is
        added and the result is clipped back into ``[-1, 1]``.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        with torch.no_grad():
            action = torch.tanh(self.forward(state)).numpy()

        if self.eps:
            noise = np.random.normal(0.0, self.eps, size=action.shape)
            action = np.clip(action + noise, -1.0, 1.0).astype(np.float32)
        return action

    def update_policy(self, elite_states, elite_actions):
        """Run one optimizer step fitting the policy to the elite actions.

        Args:
            elite_states: Sequence of state vectors from elite trajectories.
            elite_actions: Sequence of the actions taken in those states,
                in the same order as ``elite_states``.
        """
        elite_states_tensor = torch.as_tensor(
            np.asarray(elite_states, dtype=np.float32)
        )
        # Force (N, action_n) so the target lines up with the network output
        # even if actions arrive as a flat sequence of scalars.
        elite_actions_tensor = torch.as_tensor(
            np.asarray(elite_actions, dtype=np.float32)
        ).reshape(-1, self.action_n)

        # Compare in action space: get_action emits tanh(logits), so the
        # prediction must be squashed the same way before taking the loss.
        predicted_actions = torch.tanh(self.forward(elite_states_tensor))
        loss = self.loss(predicted_actions, elite_actions_tensor)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

In [42]:
def generate_batch(env, agent, batch_size, trajectory_len):
    """Roll out ``batch_size`` episodes of ``agent`` in ``env``.

    Args:
        env: Environment whose ``reset()`` returns the initial state and
            whose ``step(action)`` returns a 4-tuple
            ``(new_state, reward, done, info)``.
        agent: Object with a ``get_action(state)`` method.
        batch_size: Number of episodes to run.
        trajectory_len: Maximum number of steps per episode.

    Returns:
        A tuple ``(batch_states, batch_actions, batch_rewards)``: per-episode
        lists of visited states, per-episode lists of actions taken, and the
        total reward of each episode.
    """
    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0

        state = env.reset()
        for _step in range(trajectory_len):
            action = agent.get_action(state)
            new_state, reward, done, _info = env.step(action)
            states.append(state)
            actions.append(action)
            total_reward += reward
            state = new_state

            if done:
                break

        # Record the episode after the step loop so that episodes cut off by
        # trajectory_len are kept instead of being silently dropped.
        if states:
            batch_states.append(states)
            batch_actions.append(actions)
            batch_rewards.append(total_reward)
    return batch_states, batch_actions, batch_rewards

In [43]:
def get_elite_states(batch_states, batch_actions, batch_rewards, q_param):
    """Collect state/action pairs from trajectories above the reward quantile.

    Args:
        batch_states: Per-trajectory sequences of states.
        batch_actions: Per-trajectory sequences of actions, aligned with
            ``batch_states``.
        batch_rewards: Total reward of each trajectory.
        q_param: Quantile level passed to ``np.quantile``.

    Returns:
        A tuple ``(elite_states, elite_actions)`` of flat lists containing
        every step of each trajectory whose reward is strictly greater than
        the ``q_param`` quantile of ``batch_rewards``.
    """
    threshold = np.quantile(batch_rewards, q=q_param)

    elite_states, elite_actions = [], []
    trajectories = zip(batch_rewards, batch_states, batch_actions)
    for reward, states, actions in trajectories:
        # Strict comparison: trajectories exactly at the threshold are skipped.
        if not (reward > threshold):
            continue
        for step, state in enumerate(states):
            elite_states.append(state)
            elite_actions.append(actions[step])

    return elite_states, elite_actions

In [52]:
# Training-loop settings: number of iterations, episodes sampled per
# iteration, and the step limit of a single episode.
MAX_ITER = 300
BATCH_SIZE = 100
TRAJECTORY_LEN = 1000

# Environment and the sizes of its observation and action vectors.
env = gym.make('MountainCarContinuous-v0')
state_dim, action_n = env.observation_space.shape[0], env.action_space.shape[0]

In [53]:
# Training log: the training loop appends one dict per iteration with the
# keys 'agent', 'epoch', 'q' and 'reward'.
best_model_total_rewards = []

In [54]:
# Hyperparameters of this run: elite quantile, Adam learning rate and the
# exploration setting handed to the agent.
Q = 0.6
lr = 1e-2
# NOTE(review): eps = 0 requests no exploration. The recorded run shows a flat
# mean reward, so a positive value is probably needed - confirm by experiment.
eps = 0

# eps is now forwarded to the agent; it used to be defined here and never used.
agent_two_layer = CEMTwoLayer(
    state_dim, action_n, hidden1=128, hidden2=0, lr=lr, eps=eps
)

for i in range(MAX_ITER):
    # Sample a batch of episodes with the current policy.
    batch_states, batch_actions, batch_rewards = generate_batch(
        env, agent_two_layer, BATCH_SIZE, TRAJECTORY_LEN
    )
    # Keep only the steps of episodes above the Q reward quantile.
    elite_states, elite_actions = get_elite_states(
        batch_states, batch_actions, batch_rewards, Q
    )
    mean_reward = np.mean(batch_rewards)
    best_model_total_rewards.append(
        {'agent': 'two_layer_16_16', 'epoch': i, 'q': Q, 'reward': mean_reward}
    )
    # Skip the update when no episode beat the quantile (nothing to fit).
    if elite_states:
        agent_two_layer.update_policy(elite_states, elite_actions)
    print(f'{i}: mean_reward={mean_reward}')

time_end = time.time()

0: mean_reward=-9.11713581074355
1: mean_reward=-9.117974319487914
2: mean_reward=-9.11784201173415
3: mean_reward=-9.119566202327249
4: mean_reward=-9.119801890937403
5: mean_reward=-9.120305523208378
6: mean_reward=-9.120415767376214
7: mean_reward=-9.118150711415327
8: mean_reward=-9.119332838755517
9: mean_reward=-9.120592682971337
10: mean_reward=-9.120049834267949
11: mean_reward=-9.118632879394012
12: mean_reward=-9.120476639491784
13: mean_reward=-9.119286406556332
14: mean_reward=-9.117719551902049
15: mean_reward=-9.1184503477136
16: mean_reward=-9.119720454336194
17: mean_reward=-9.119390652363574
18: mean_reward=-9.12169429444862
19: mean_reward=-9.119786344942476
20: mean_reward=-9.120573120080557
21: mean_reward=-9.119494398564747


KeyboardInterrupt: 

Ничего не получилось: средняя награда не меняется от итерации к итерации и остаётся около −9.12. Сдаюсь.