# 1. Import modules

In [1]:
import numpy as np
import time
import gym
import torch
import torch.nn as nn
import random

# 2. Define Classes

In [2]:
class Env():
    """Wrapper around gym's ``CarRacing-v0`` adding grayscale frame stacking,
    action repeat, and reward shaping.

    Observations are the ``img_stack`` most recent grayscale frames stacked
    along the first axis. Each call to ``step`` repeats the given action for
    up to ``action_repeat`` simulator steps and returns the summed reward.
    """

    def __init__(self, img_stack=4, action_repeat=8):
        """Create the underlying gym environment.

        Args:
            img_stack: number of most recent grayscale frames that make up
                one observation (default 4, the previously hard-coded value).
            action_repeat: maximum number of simulator steps for which one
                call to ``step`` repeats the same action (default 8, the
                previously hard-coded value).

        Raises:
            ValueError: if ``img_stack`` or ``action_repeat`` is below 1.
        """
        # Validate before touching gym so a bad configuration fails fast;
        # action_repeat < 1 would otherwise leave `done` unbound in step().
        if img_stack < 1 or action_repeat < 1:
            raise ValueError('img_stack and action_repeat must be >= 1')
        self.img_stack = img_stack
        self.action_repeat = action_repeat
        self.env = gym.make('CarRacing-v0')
        self.reward_threshold = self.env.spec.reward_threshold

    def reset(self):
        """Start a new episode.

        Resets the per-episode bookkeeping (including a fresh reward
        history) and the underlying environment.

        Returns:
            ``np.ndarray`` holding ``img_stack`` copies of the first
            grayscale frame, stacked along axis 0.
        """
        self.counter = 0
        self.av_r = self.reward_memory()
        self.die = False
        img_rgb = self.env.reset()
        img_gray = self.rgb2gray(img_rgb)
        # The same first frame fills every slot of the initial stack.
        self.stack = [img_gray] * self.img_stack
        return np.array(self.stack)

    def step(self, action):
        """Apply ``action`` for up to ``action_repeat`` simulator steps.

        Must be called after ``reset``, which creates the frame stack and
        the reward history used here.

        Args:
            action: action forwarded unchanged to the wrapped env's ``step``.

        Returns:
            Tuple ``(state, total_reward, done, die)``:
            ``state`` is the updated frame stack as an ``np.ndarray``;
            ``total_reward`` is the shaped reward summed over the repeated
            steps; ``done`` is ``True`` when the running mean of recent
            rewards dropped to -0.1 or below; ``die`` is the third value
            returned by the wrapped env's last ``step`` call.
        """
        total_reward = 0
        for _ in range(self.action_repeat):
            img_rgb, reward, die, _info = self.env.step(action)
            # don't penalize "die state"
            # (presumably offsets a -100 penalty issued by gym -- confirm)
            if die:
                reward += 100
            # green penalty: mean of the green channel above 185
            if np.mean(img_rgb[:, :, 1]) > 185.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = bool(self.av_r(reward) <= -0.1)
            if done or die:
                break
        img_gray = self.rgb2gray(img_rgb)
        # Slide the window: drop the oldest frame, append the newest.
        self.stack.pop(0)
        self.stack.append(img_gray)
        assert len(self.stack) == self.img_stack
        return np.array(self.stack), total_reward, done, die

    def render(self):
        """Render the wrapped environment."""
        self.env.render()

    def close(self):
        """Close the wrapped environment."""
        self.env.close()

    @staticmethod
    def rgb2gray(rgb, norm=True):
        """Convert an RGB image to grayscale.

        Args:
            rgb: array whose last axis holds the three colour channels.
            norm: when true, rescale the result with ``gray / 128 - 1``.

        Returns:
            Array with the last axis reduced by the weighted channel sum
            (weights 0.299, 0.587, 0.114).
        """
        gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114])
        if norm:
            # normalize
            gray = gray / 128. - 1.
        return gray

    @staticmethod
    def reward_memory():
        """Build a closure tracking the mean of the last 100 rewards.

        The history is a ring buffer initialised with zeros, so until 100
        rewards have been recorded the unfilled slots count as 0 in the mean.

        Returns:
            Function ``memory(reward)`` that stores ``reward`` and returns
            the mean of the whole buffer.
        """
        count = 0
        length = 100
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            count = (count + 1) % length
            return np.mean(history)

        return memory


class Net(nn.Module):
    """Convolutional network with a value head and two 3-dimensional heads.

    A six-layer convolutional trunk reduces a ``(4, 96, 96)`` input to a
    256-dimensional feature vector. From it the network produces a scalar
    value estimate and two positive 3-vectors named ``alpha`` and ``beta``.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size, stride) per conv layer.
        # Spatial size for a 96x96 input: 96 -> 47 -> 23 -> 11 -> 5 -> 3 -> 1.
        conv_specs = (
            (4, 8, 4, 2),
            (8, 16, 3, 2),
            (16, 32, 3, 2),
            (32, 64, 3, 2),
            (64, 128, 3, 1),
            (128, 256, 3, 1),
        )
        trunk = []
        for in_ch, out_ch, kernel, stride in conv_specs:
            trunk.append(nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride))
            trunk.append(nn.ReLU())  # activation after every convolution
        self.cnn_base = nn.Sequential(*trunk)  # output shape (256, 1, 1)

        # Value head: 256 features -> scalar.
        self.v = nn.Sequential(
            nn.Linear(256, 100),
            nn.ReLU(),
            nn.Linear(100, 1),
        )
        # Hidden layer shared by the alpha and beta heads.
        self.fc = nn.Sequential(
            nn.Linear(256, 100),
            nn.ReLU(),
        )
        # Softplus keeps both head outputs strictly positive.
        self.alpha_head = nn.Sequential(
            nn.Linear(100, 3),
            nn.Softplus(),
        )
        self.beta_head = nn.Sequential(
            nn.Linear(100, 3),
            nn.Softplus(),
        )

        self.apply(self._weights_init)

    @staticmethod
    def _weights_init(m):
        """Re-initialise convolution layers; other module types are left as is.

        Conv weights get Xavier-uniform initialisation with the ReLU gain and
        biases are set to the constant 0.1.
        """
        if not isinstance(m, nn.Conv2d):
            return
        relu_gain = nn.init.calculate_gain('relu')
        nn.init.xavier_uniform_(m.weight, gain=relu_gain)
        nn.init.constant_(m.bias, 0.1)

    def forward(self, x):
        """Run the network.

        Args:
            x: input batch; the trunk comments assume shape (N, 4, 96, 96).

        Returns:
            ``((alpha, beta), v)`` where ``alpha`` and ``beta`` are the
            softplus head outputs shifted up by 1 (one row of 3 values per
            sample) and ``v`` is the value head output.
        """
        features = self.cnn_base(x).view(-1, 256)
        value = self.v(features)
        hidden = self.fc(features)
        alpha = self.alpha_head(hidden) + 1
        beta = self.beta_head(hidden) + 1

        return (alpha, beta), value


class Agent():
    """Inference-only agent that wraps a ``Net`` and turns states into actions.

    Relies on the module-level ``device`` being defined before an instance
    is created (the name is resolved at call time, not at class definition).
    """

    def __init__(self):
        """Build the network in float32 and move it to ``device``."""
        self.net = Net().float().to(device)

    def select_action(self, state):
        """Return a deterministic action for one state.

        Args:
            state: ``np.ndarray`` holding a single observation; a batch axis
                is added in front before it is passed to the network.

        Returns:
            ``np.ndarray`` equal to ``alpha / (alpha + beta)`` computed from
            the network's two head outputs, with size-1 axes squeezed out.
            Every component lies in (0, 1) because both heads are positive.
        """
        state = torch.from_numpy(state).float().to(device).unsqueeze(0)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            alpha, beta = self.net(state)[0]
        action = alpha / (alpha + beta)

        action = action.squeeze().cpu().numpy()
        return action

    def load_param(self):
        """Load network weights from ``param/expert.pkl``.

        ``map_location=device`` remaps the stored tensors onto the device
        in use, so a checkpoint saved on a GPU also loads on a CPU-only
        machine.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        self.net.load_state_dict(
            torch.load('param/expert.pkl', map_location=device))

# 3. Save trajectory

In [5]:
# Roll out the pre-trained agent and collect a random subset of
# (state, action) pairs in X / Y.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

agent = Agent()
agent.load_param()
env = Env()

# Not used in this cell; kept in case later cells read them.
training_records = []
running_score = 0

X, Y = [], []  # sampled states and the actions chosen for them
idx = 1        # file index used by the (disabled) save step below
try:
    for i_ep in range(5000):
        score = 0
        # Each episode starts from a fresh reset, so no reset is needed
        # before the loop (it would only generate and discard a track).
        state = env.reset()

        for t in range(1000):
            action = agent.select_action(state)

            # random.randint is inclusive on both ends, so this keeps
            # roughly 1 in 11 steps.
            if random.randint(0,10) == 9:
                X.append(state)
                Y.append(action)

            # The agent's outputs lie in (0, 1); the first component is
            # rescaled to (-1.5, 0.5), the other two are passed through.
            # NOTE(review): an offset of -1. would give a symmetric
            # (-1, 1) range -- confirm that -1.5 is intended.
            state_, reward, done, die = env.step(action * np.array([2., 1., 1.]) + np.array([-1.5, 0., 0.]))
            # time.sleep(0.05)
            # env.render()
            score += reward
            state = state_
            if done or die:
                break

        print('Ep {}\tScore: {:.2f}\t'.format(i_ep, score))

        # Episode index and number of samples collected so far.
        print(i_ep, len(X))

        # Saving is currently disabled, so X and Y keep growing for the
        # whole run. Enable the lines below to write the collected pairs
        # every 1000 episodes and start a new batch:
        #
        # if i_ep % 1000 == 999:
        #     np.save('features{}.npy'.format(idx), X)
        #     np.save('labels{}.npy'.format(idx), Y)
        #     X, Y = [], []
        #     idx += 1
finally:
    # Release the environment even when the run is interrupted
    # (e.g. by KeyboardInterrupt).
    env.close()

Track generation: 1141..1430 -> 289-tiles track




Track generation: 1220..1529 -> 309-tiles track
Ep 0	Score: 5.33	
0 1
Track generation: 1293..1620 -> 327-tiles track
Ep 1	Score: 3.90	
1 3
Track generation: 1124..1409 -> 285-tiles track
Ep 2	Score: 6.83	
2 3
Track generation: 1112..1399 -> 287-tiles track
Ep 3	Score: 6.08	
3 4
Track generation: 1213..1522 -> 309-tiles track
retry to generate track (normal if there are not manyinstances of this message)
Track generation: 1052..1319 -> 267-tiles track
Ep 4	Score: 8.16	
4 5
Track generation: 1064..1334 -> 270-tiles track
Ep 5	Score: 8.10	
5 5
Track generation: 1191..1493 -> 302-tiles track
Ep 6	Score: 5.73	
6 6
Track generation: 1092..1369 -> 277-tiles track
Ep 7	Score: 7.49	
7 9
Track generation: 1196..1499 -> 303-tiles track
Ep 8	Score: 5.67	
8 10
Track generation: 1031..1293 -> 262-tiles track
Ep 9	Score: 8.74	
9 11
Track generation: 1064..1334 -> 270-tiles track
Ep 10	Score: 8.05	
10 15
Track generation: 1016..1274 -> 258-tiles track
Ep 11	Score: 9.05	
11 16
Track generation: 1070..

KeyboardInterrupt: 