Домашнее задание 2

Исследуем алгоритм Кросс-Энтропии с использованием нейронных сетей.

In [1]:
import torch
from torch import nn 
import numpy as np
import pandas as pd
import gym
import plotly.express as px
import time
import pickle



In [2]:
class CEM(nn.Module):
    def __init__(self, state_dim, action_n, hidden, lr=0.01, eps=0):
        super().__init__()
        self.state_dim = state_dim
        self.action_n = action_n
        self.lr = lr
        self.eps = eps

        self.network = nn.Sequential(
            nn.Linear(self.state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_n)
        )
        
        self.softmax = nn.Softmax(dim=0)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.CrossEntropyLoss()
        
    def forward(self, _input):
        return self.network(_input) 
    
    def get_action(self, state):
        state = torch.FloatTensor(state)
        logits = self.forward(state)
        action_prob = self.softmax(logits).detach().numpy()

        noise = np.ones(self.action_n) / self.action_n
        action_prob_noised = (1 - self.eps) * action_prob + self.eps * noise
        action_prob_noised = action_prob_noised / np.sum(action_prob_noised)
        action = np.random.choice(self.action_n, p=action_prob_noised)
        return action
    
    def update_policy(self, elite_states, elite_actions):
        elite_states_tensor = torch.FloatTensor(elite_states)
        elite_actions_tensor = torch.LongTensor(elite_actions)

        loss = self.loss(self.forward(elite_states_tensor), elite_actions_tensor)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

In [3]:
def get_trajectory(env, agent, trajectory_len, visualize=False):
    trajectory = {'states':[], 'actions': [], 'total_reward': 0}
    
    state = env.reset()
    trajectory['states'].append(state)
    
    for _ in range(trajectory_len):
        action = agent.get_action(state)
        trajectory['actions'].append(action)
        
        state, reward, done, _ = env.step(action)
        trajectory['total_reward'] += reward
        
        if done:
            break
    
        if visualize:
            env.render()
            
        trajectory['states'].append(state)
    return trajectory

def get_elite_trajectories(trajectories, q_param):
    total_rewards = [trajectory['total_reward'] for trajectory in trajectories]
    quantile = np.quantile(total_rewards, q=q_param) 
    return [trajectory for trajectory in trajectories if trajectory['total_reward'] > quantile]

In [4]:
def generate_batch(env, agent, batch_size, trajectory_len):
    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0

        state = env.reset()
        for _ in range(trajectory_len):
            action = agent.get_action(state)
            new_state, reward, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            total_reward += reward
            state = new_state
            
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break
    return batch_states, batch_actions, batch_rewards

In [5]:
def get_elite_states(batch_states, batch_actions, batch_rewards, q_param):
    quantile = np.quantile(batch_rewards, q=q_param) 

    elite_states = []
    elite_actions = []
    for i in range(len(batch_rewards)):
        if batch_rewards[i] > quantile:
            for j in range(len(batch_states[i])):
                elite_states.append(batch_states[i][j])
                elite_actions.append(batch_actions[i][j])
    
    return elite_states, elite_actions

In [6]:
env = gym.make(
    "LunarLander-v2",
    continuous = False,
    gravity = -10.0,
    enable_wind = False,
    wind_power = 15.0,
    turbulence_power = 1.5)

state_dim = 8
action_n = 4
lr = 1e-2
eps = 0

trajectory_n = 500

MAX_ITER = 300
BATCH_SIZE = 50
TRAJECTORY_LEN = 1000
Q = 0.75

Пробуем однослойную полносвязную архитектуру с разным количество слоев.
Гиперпараметры пока оставляю фиксированные, показавшиеся наиболее оптимальными на предыдущих запусках.

In [7]:
total_rewards = []

In [8]:
agent = CEM(state_dim, action_n, hidden=16, lr=lr)

for i in range(MAX_ITER):
    batch_states, batch_actions, batch_rewards = generate_batch(env, agent, BATCH_SIZE, TRAJECTORY_LEN)
    elite_states, elite_actions = get_elite_states(batch_states, batch_actions, batch_rewards, Q)
    
    mean_reward = np.mean(batch_rewards)
    agent.update_policy(elite_states, elite_actions)
    print(f'{i}: mean_reward={mean_reward}')
    total_rewards.append({'agent': 'one_layer_16', 'epoch': i, 'reward': mean_reward})

time_end = time.time()


  elite_states_tensor = torch.FloatTensor(elite_states)


0: mean_reward=-191.48395824124694
1: mean_reward=-174.2419655171212
2: mean_reward=-208.92693535157505
3: mean_reward=-205.46261584454933
4: mean_reward=-189.12988426996998
5: mean_reward=-177.145662975239
6: mean_reward=-163.50331161796714
7: mean_reward=-173.64757818948314
8: mean_reward=-180.31325101368782
9: mean_reward=-175.11206418323044
10: mean_reward=-165.17564633666848
11: mean_reward=-191.588429659361
12: mean_reward=-172.04514112870672
13: mean_reward=-166.550656179244
14: mean_reward=-149.70976027911104
15: mean_reward=-156.3929513224928
16: mean_reward=-174.412732258016
17: mean_reward=-148.71432268954584
18: mean_reward=-170.89355312810065
19: mean_reward=-140.32884151437483
20: mean_reward=-166.3467893099404
21: mean_reward=-160.83282092929824
22: mean_reward=-149.57180365839602
23: mean_reward=-147.77366404994933
24: mean_reward=-147.50249041027158
25: mean_reward=-149.52373063988506
26: mean_reward=-150.93125737988822
27: mean_reward=-136.11214449074043
28: mean_rewa

In [9]:
agent = CEM(state_dim, action_n, hidden=64, lr=lr)

for i in range(MAX_ITER):
    batch_states, batch_actions, batch_rewards = generate_batch(env, agent, BATCH_SIZE, TRAJECTORY_LEN)
    elite_states, elite_actions = get_elite_states(batch_states, batch_actions, batch_rewards, Q)
    
    mean_reward = np.mean(batch_rewards)
    total_rewards.append({'agent': 'one_layer_64', 'epoch': i, 'reward': mean_reward})
    agent.update_policy(elite_states, elite_actions)
    print(f'{i}: mean_reward={mean_reward}')

time_end = time.time()


0: mean_reward=-150.08397642617263
1: mean_reward=-153.8781485057967
2: mean_reward=-134.68879070115284
3: mean_reward=-170.11063683273758
4: mean_reward=-129.81758142493538
5: mean_reward=-140.37314885070774
6: mean_reward=-139.72323002504956
7: mean_reward=-135.48878488520876
8: mean_reward=-126.86427345735993
9: mean_reward=-133.98498217638763
10: mean_reward=-115.63781789412015
11: mean_reward=-120.91872621527631
12: mean_reward=-126.43047371608245
13: mean_reward=-110.58195337311831
14: mean_reward=-133.63219182848368
15: mean_reward=-121.83627462981207
16: mean_reward=-118.98237893094343
17: mean_reward=-120.09562860469279
18: mean_reward=-109.90294009616127
19: mean_reward=-119.30867288377065
20: mean_reward=-115.17798873414577
21: mean_reward=-118.2156888908358
22: mean_reward=-99.16318635243547
23: mean_reward=-97.2033748049157
24: mean_reward=-94.56923364631061
25: mean_reward=-93.74715547046318
26: mean_reward=-93.91762917475411
27: mean_reward=-88.11654764737101
28: mean_re

In [10]:
agent = CEM(state_dim, action_n, hidden=256, lr=lr)

for i in range(MAX_ITER):
    batch_states, batch_actions, batch_rewards = generate_batch(env, agent, BATCH_SIZE, TRAJECTORY_LEN)
    elite_states, elite_actions = get_elite_states(batch_states, batch_actions, batch_rewards, Q)
    
    mean_reward = np.mean(batch_rewards)
    total_rewards.append({'agent': 'one_layer_256', 'epoch': i, 'reward': mean_reward})
    agent.update_policy(elite_states, elite_actions)
    print(f'{i}: mean_reward={mean_reward}')

time_end = time.time()

0: mean_reward=-170.9517475155726
1: mean_reward=-177.76396783949812
2: mean_reward=-126.4649402815442
3: mean_reward=-133.99747917274092
4: mean_reward=-142.8822074646124
5: mean_reward=-126.2294885014729
6: mean_reward=-137.82938457314307
7: mean_reward=-144.30658174206317
8: mean_reward=-120.67569254207058
9: mean_reward=-134.7147150647002
10: mean_reward=-161.625951351328
11: mean_reward=-119.36711716145406
12: mean_reward=-134.6772499184125
13: mean_reward=-125.51601967020125
14: mean_reward=-102.96500345271593
15: mean_reward=-98.0502944056614
16: mean_reward=-116.23951306471827
17: mean_reward=-91.00784719796397
18: mean_reward=-92.5175231840244
19: mean_reward=-71.82859532006995
20: mean_reward=-101.19113173569457
21: mean_reward=-86.63530398486206
22: mean_reward=-62.83550259616471
23: mean_reward=-96.53232173085266
24: mean_reward=-75.73083271587015
25: mean_reward=-101.95891483657512
26: mean_reward=-145.97521332017277
27: mean_reward=-151.06164894829138
28: mean_reward=-192

Результаты работы нейросети с разным количество нейронов

In [16]:
df = pd.DataFrame.from_records(total_rewards)
px.line(df, x='epoch', y='reward', color='agent', title='300 epoch, batch=100, q=0.75')

По результатам обучения трех нейросетей можно сделать вывод, что сеть с меньшим количество слоев достигла своего локлаьго максимума на уровне 220-230. При этом, на удивление, сеть с 256 нейронами за 300 эпох не смогла превзойти сеть с 16 нейронами.

Для дальнейшего тюнинга за основу возьмем сеть с 64 нейронами, как лучшую с точки зрения времени/награды

1. Попробуем уменьшить Q, чтобы сеть получала больше данных

In [19]:
best_model_total_rewards = []

In [20]:
Q = 0.6
agent = CEM(state_dim, action_n, hidden=64, lr=lr)

for i in range(MAX_ITER):
    batch_states, batch_actions, batch_rewards = generate_batch(env, agent, BATCH_SIZE, TRAJECTORY_LEN)
    elite_states, elite_actions = get_elite_states(batch_states, batch_actions, batch_rewards, Q)
    
    mean_reward = np.mean(batch_rewards)
    best_model_total_rewards.append({'agent': 'one_layer_64', 'epoch': i, 'q': Q, 'reward': mean_reward})
    agent.update_policy(elite_states, elite_actions)
    print(f'{i}: mean_reward={mean_reward}')

time_end = time.time()

0: mean_reward=-190.69847243580236
1: mean_reward=-164.15375744393245
2: mean_reward=-171.13098045184842
3: mean_reward=-147.66547248727593
4: mean_reward=-145.85726182513505
5: mean_reward=-149.20327303411204
6: mean_reward=-157.7072770682559
7: mean_reward=-132.72246200873684
8: mean_reward=-128.6978596180105
9: mean_reward=-135.33033186089975
10: mean_reward=-121.61857293351731
11: mean_reward=-118.29999067666137
12: mean_reward=-109.79905275862797
13: mean_reward=-107.16748576709023
14: mean_reward=-110.35832213001538
15: mean_reward=-110.05793000768037
16: mean_reward=-127.31265609265348
17: mean_reward=-111.7955191548696
18: mean_reward=-108.45226401971851
19: mean_reward=-102.77959005900433
20: mean_reward=-99.50226583202286
21: mean_reward=-98.42118025915117
22: mean_reward=-94.90254835581877
23: mean_reward=-93.24955228503106
24: mean_reward=-92.87116415356209
25: mean_reward=-95.44743477539889
26: mean_reward=-76.89444213425851
27: mean_reward=-85.76053289527222
28: mean_rewa

KeyboardInterrupt: 