In [1]:
from __future__ import print_function, division
import gym
import time
import numpy as np
import matplotlib.pyplot as plt
seed = 417
%matplotlib inline

In [2]:
env_name = 'Skiing-v0'
env = gym.make(env_name)

INFO:gym.envs.registration:Making new env: Skiing-v0
[2017-02-07 13:24:02,523] Making new env: Skiing-v0


## Convolutional autoencoder for feature engineering

In [3]:
from keras import backend as K
K.set_image_dim_ordering('th')

Using Theano backend.
Using gpu device 0: GeForce GT 650M (CNMeM is disabled, cuDNN not available)


In [4]:
from Object_detection_features_old import *
from skimage.transform import resize
from skimage.color import rgb2gray
odf = ObjectDetectionFeatures(env)

In [5]:
from keras.models import model_from_json
import json

with open('./data/Encoder_21_01.txt', 'r') as model_file:
     encoder = model_from_json(json.loads(next(model_file)))
        
encoder.load_weights('./data/Encoder_21_01.h5')

## Monte-Carlo Policy Gradient (REINFORCE)

In [6]:
discount = 1.0
render = True

In [7]:
class Agent:
    def __init__(self, env, learning_rate):
        self.nA = env.action_space.n
        n_components = 100
        self.lr = learning_rate
        self.feature_memory = []
        res = np.load('./data/sample_features_20k.npz')['arr_0']
        n = res.shape[0]
        observation_examples = np.hstack((res.reshape((n, -1)), 
                                          np.array([env.action_space.sample() for _ in range(n)])[:, np.newaxis]))

        self.scaler = StandardScaler()
        
        self.feature_map = FeatureUnion([("rbf1", RBFSampler(n_components=n_components, gamma=1., random_state=1)),
                                         ("rbf01", RBFSampler(n_components=n_components, gamma=0.1, random_state=1)),
                                         ("rbf10", RBFSampler(n_components=n_components, gamma=10, random_state=1))])

        self.feature_map.fit(self.scaler.fit_transform(observation_examples))
        
        self.theta = np.random.rand(3 * n_components)
    
    def act(self, s):
        #s_tmp = s.reshape(tuple([1] + list(np.roll(s.shape, 1))))
        #s_tmp = np.concatenate((s_tmp, np.zeros((s_tmp.shape[0], 3, 6, s_tmp.shape[3]), dtype='uint8')), axis=2)
        #s_tmp = encoder.predict(s_tmp.astype('float16')/255).ravel()
        #Phi_s = self.feature_map.transform(self.scaler.transform(
        #        np.hstack((np.arange(self.nA)[:, np.newaxis], np.repeat(s_tmp[np.newaxis, :], self.nA, axis=0)))))
        
        Phi_s = self.feature_map.transform(self.scaler.transform(
                np.hstack((np.arange(self.nA)[:, np.newaxis], 
                           np.repeat(encoder.predict(s[np.newaxis, np.newaxis, :, :]), self.nA, axis=0)))))
        
        
        self.feature_memory.append(Phi_s)
        self.probs = np.exp(np.dot(Phi_s, self.theta))
        return np.random.choice(self.nA, p=self.probs/np.sum(self.probs))
    
    def update(self, memory):
        gamma = 1 # 0.999
        G = np.cumsum(map(lambda x: x[2], memory)[::-1])[::-1]
        
        for t in range(len(memory)):
            self.theta += self.lr * (self.feature_memory[t][memory[t][1]] - \
                                     np.sum(self.probs[:, np.newaxis] * self.feature_memory[t], axis=0)) * G[t]
        
        self.feature_memory = []

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import FeatureUnion

In [9]:
agent = Agent(env, 0.01)

In [None]:
env = wrappers.Monitor(env, '/tmp/skiing-0', force=True)

In [64]:
for e in range(5):
    s = env.reset()
    episode = 0
    done = False
    total_reward = 0
    memory = []
    output_shape = (60, 60)
    
    while not done:
        if render: env.render()
        s_processed = (resize(odf.get_simple_image(env.ale.getScreenGrayscale()[:, :, 0]), 
                              output_shape, order=0) * 255).astype('uint8')
        a = agent.act(s_processed)
        sp, r, done, _ = env.step(a)
        memory.append((s, a, r, sp))
        total_reward += r
        s = sp
        episode += 1
    
    print('episode {} finished in {} steps with total reward: {}'.format(e, episode, total_reward))
    
    agent.update(memory)

episode 0 finished in 2653 steps with total reward: -22771.0
episode 1 finished in 1202 steps with total reward: -15039.0
episode 2 finished in 1179 steps with total reward: -14937.0
episode 3 finished in 1021 steps with total reward: -13602.0
episode 4 finished in 713 steps with total reward: -9013.0


In [None]:
gym.upload('/tmp/skiing-0', api_key='...')