# Cross-Entropy Method

In this notebook we will train an agent on OpenAI Gym's MountainCarContinuous environment with the Cross-Entropy Method.

# 1. Importing Necessary Packages

In [3]:
import gym
import math
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# 2. Instantiate the Environment

In [8]:
# Run on the first CUDA device when one is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# One seed shared by every random number source used in this notebook
SEED = 101

env = gym.make('MountainCarContinuous-v0')
env.seed(SEED)
np.random.seed(SEED)
# PyTorch keeps its own generator; seed it as well so that the network's
# initial parameters are reproducible after Restart & Run All
torch.manual_seed(SEED)

# Show the spaces the agent has to work with
print('observation_space :', env.observation_space)
print('action_space :', env.action_space)
print(' - low :', env.action_space.low)
print(' - high : ', env.action_space.high)

observation_space : Box(2,)
action_space : Box(1,)
 - low : [-1.]
 - high :  [ 1.]


# 3. Instantiate the Agent

In [45]:
class Agent(nn.Module):
    """Two-layer policy network whose parameters can be loaded from a flat vector.

    A state is passed through ``fc1`` + ReLU and then ``fc2`` + tanh, so every
    action component lies in [-1, 1]. The input and output widths are read
    from the environment's observation and action space shapes.
    """

    def __init__(self, env, h_size=16):
        """Build the network for ``env``.

        Args:
            env: environment exposing ``observation_space.shape`` and
                ``action_space.shape``.
            h_size (int): number of units in the hidden layer.
        """
        super(Agent, self).__init__()
        self.env = env
        # state, hidden_layer, action_sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)

    def forward(self, x):
        """Map a state tensor to an action tensor squashed into [-1, 1]."""
        x = F.relu(self.fc1(x))
        # torch.tanh is the supported spelling; F.tanh has been flagged as deprecated
        return torch.tanh(self.fc2(x))

    def set_weights(self, weights):
        """Overwrite all layer parameters from a flat vector.

        The vector is consumed in this order: ``fc1`` weights, ``fc1`` biases,
        ``fc2`` weights, ``fc2`` biases. Each weight segment is reshaped
        row-major to the shape of the corresponding ``nn.Linear.weight``.

        Args:
            weights: 1-D array-like (NumPy array, list or tensor) holding
                exactly ``get_weights_dim()`` values.

        Raises:
            ValueError: if ``weights`` does not contain exactly
                ``get_weights_dim()`` values.
        """
        s_size = self.s_size
        h_size = self.h_size
        a_size = self.a_size

        # Convert once to the parameters' dtype/device so copy_ needs no further casts
        flat = torch.as_tensor(
            weights, dtype=self.fc1.weight.dtype, device=self.fc1.weight.device
        ).reshape(-1)

        expected = self.get_weights_dim()
        if flat.numel() != expected:
            raise ValueError(
                "expected {} weights, got {}".format(expected, flat.numel())
            )

        # Separate the weights for each layer
        fc1_w_end = s_size * h_size
        fc1_end = fc1_w_end + h_size
        fc2_w_end = fc1_end + h_size * a_size

        # Parameters are replaced in place; no gradient history should be recorded
        with torch.no_grad():
            self.fc1.weight.copy_(flat[:fc1_w_end].reshape(self.fc1.weight.shape))
            self.fc1.bias.copy_(flat[fc1_w_end:fc1_end])
            self.fc2.weight.copy_(flat[fc1_end:fc2_w_end].reshape(self.fc2.weight.shape))
            self.fc2.bias.copy_(flat[fc2_w_end:])

    def get_weights_dim(self):
        """Return the total number of scalar parameters (weights and biases)."""
        return (self.s_size+1)*self.h_size + (self.h_size+1)*self.a_size
        
    
# Build the policy network for this environment, then place it on the chosen device
agent = Agent(env)
agent = agent.to(device)
# Bare expression so the notebook displays the module's layer summary
agent

Agent(
  (fc1): Linear(in_features=2, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=1, bias=True)
)

In [46]:
# Layer sizes that determine the length of the flat parameter vector
state_size = env.observation_space.shape[0]
# Read the hidden width from the agent instead of repeating the literal 16,
# so this cell cannot drift out of sync with the network
hidden_size = agent.h_size
action_size = env.action_space.shape[0]
print(state_size, hidden_size, action_size)

# Number of entries taken by fc1 (weight matrix plus bias vector)
fc1_end = (state_size*hidden_size) + hidden_size
print("fc1_end :", fc1_end)

# Weights dimensions
total_weights = (state_size + 1)*hidden_size + (hidden_size + 1)*action_size
# The hand-computed total must match what the agent itself reports
assert total_weights == agent.get_weights_dim(), "weight count disagrees with agent.get_weights_dim()"
print("total_weights : ", total_weights)




2 16 1
fc1_end : 48
total_weights :  65


In [47]:
# Inspect the first layer's parameters. The network is bound to `agent`;
# `model` is not defined anywhere in this notebook and raises NameError
# on a fresh kernel.
print(agent.fc1.weight)

print(agent.fc1.bias)

Parameter containing:
tensor([[-0.3342,  0.0161],
        [-0.4488,  0.0252],
        [-0.6735,  0.0413],
        [ 0.0717,  0.4033],
        [-0.1758,  0.4150],
        [-0.5208, -0.6473],
        [ 0.3638,  0.7002],
        [ 0.4264, -0.1199],
        [ 0.0311,  0.1037],
        [ 0.1214, -0.5707],
        [ 0.5873,  0.5055],
        [-0.4829, -0.5808],
        [-0.3830, -0.6918],
        [ 0.2895, -0.0285],
        [ 0.0481, -0.6772],
        [ 0.4852,  0.2672]], requires_grad=True)
Parameter containing:
tensor([-0.5706,  0.4886,  0.4718, -0.6682,  0.4775,  0.0304, -0.6244, -0.1343,
        -0.5600, -0.5578, -0.1376, -0.2949, -0.0595,  0.6909, -0.6566, -0.0545],
       requires_grad=True)


In [48]:
# Length of the observation vector; the Agent uses this value as fc1's input width
print(env.observation_space.shape[0])

2


In [49]:
# Length of the action vector; the Agent uses this value as fc2's output width
print(env.action_space.shape[0])

1


# 4. Train the agent with the Cross-Entropy Method


In [56]:
# Parameters
# ======
#     num_iter (int) : maximum number of iterations
#     max_t (int) : maximum number of time steps per episode
#     gamma (float) : discount rate (listed here but not assigned in this cell -- TODO define before use)
#     pop_size (int) : population size
#     elite_frac (float) : fraction of top performers to use in the update
#     sigma (float) : standard deviation of additive noise
sigma = 0.5
pop_size = 50
elite_frac = 0.2
num_iter = 500
max_t = 1000

# The elite count is used as a number of candidates, so it must be an integer;
# pop_size * elite_frac alone is a float (10.0). Keep at least one elite.
num_elite = max(1, int(round(pop_size * elite_frac)))
print("num_elite :", num_elite)

# Rolling window of the 100 most recent scores, plus the full score history
scores_deque = deque(maxlen=100)
scores = []

# Initial flat parameter vector: zero-mean Gaussian noise scaled by sigma
best_weight = sigma*np.random.randn(agent.get_weights_dim())
print(best_weight)

for i_iteration in range(1, num_iter+1):
    # TODO: sample a population around best_weight, evaluate each candidate,
    # and update best_weight from the elite set.
    # The loop previously had no body, which is a syntax error; `pass` keeps
    # the cell runnable until the update step is written.
    pass
    

num_elite : 10.0
[ -3.64733168e-01  -3.60054874e-01  -1.42628383e-01   1.04526957e+00
   5.02028074e-01   5.15957930e-01  -5.40559139e-01  -1.77480545e-04
  -1.09940774e-01   1.46397565e-01   4.17177261e-01  -7.02159155e-02
   5.95724812e-02   3.32857293e-01   2.03793225e-01   4.03921372e-01
  -1.36823222e+00  -4.92576427e-01  -7.76378752e-02  -5.06614006e-01
  -5.43717945e-01  -1.28388346e+00   3.30514599e-01  -1.66460400e-01
  -4.64301137e-01   8.57975083e-01  -2.34378143e-01   4.30048518e-01
   1.15043522e-01   3.09328997e-01  -1.02628146e+00  -8.33230580e-02
  -2.22936391e-01   3.42936882e-01   1.39250483e+00   7.61875229e-01
   6.45114965e-01  -7.39159682e-01   1.07117097e-01  -1.20254974e-01
  -7.10474738e-01   1.60829793e-01   5.03662066e-01   1.64891826e-01
   4.60262586e-01   6.74568762e-01  -1.88930747e-01   9.77709944e-01
  -4.29152106e-01  -1.11105122e-01  -2.56014319e-02  -2.19564012e-01
  -7.50520618e-01   5.29178463e-01   9.27483581e-01   2.51125168e-01
   5.72904383e-02

In [None]:
# Candidate population: pop_size copies of the current best weight vector,
# each perturbed by independent Gaussian noise with standard deviation sigma.
# The previous placeholder added an empty tuple to best_weight, which cannot
# be broadcast against the weight vector.
weights_pop = [
    best_weight + sigma * np.random.randn(agent.get_weights_dim())
    for _ in range(pop_size)
]