In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt

import asyncio
import sys
sys.path.append("..")
sys.path.append("../../DolphinRL/Source")
sys.path.append("../../DolphinRL/SamplesSource")


import numpy as np
from SpinnyEnv import SpinnyEnv
from Models.KerasModel import KerasModel
from Methods.TemporalDifference import Sarsa
from Policies import EpsilonGreedyPolicy, GreedyPolicy
from Utilities.Eval import MetricsLogger, validate_policy
from PlotUtilities import LivePlot
from KerasModelBuilders import conv1_model

In [2]:
class ValidationMetrics:
    """Mutable holder for the statistics recorded at each validation pass.

    An instance is passed to ``MetricsLogger`` and re-appended after every
    validation run; the validation figure refers to these attributes by name.
    """

    def __init__(self):
        # Every average starts at zero. Targets are bound left to right, so
        # the attributes are created in the order: training reward,
        # training rms, validation reward.
        self.training_avg_reward = self.training_avg_rms = self.validation_avg_reward = 0.0

# Seed NumPy's global RNG before any project object is constructed.
np.random.seed(643674)

# Create the environment and hand its interface coroutine to asyncio; the
# resulting future is awaited later, after the environment is closed.
env = SpinnyEnv()
future = asyncio.ensure_future(env.interface.run())
print(env.observation_space.shape)

# Model, training policy and learner. The second EpsilonGreedyPolicy argument
# (0.1) is presumably the exploration rate -- the training cell assigns the
# same value to `training_policy.exploration`.
model = KerasModel(env, conv1_model(env))
training_policy = EpsilonGreedyPolicy(model, 0.1)
mc = Sarsa(env, model, training_policy)
training_log = MetricsLogger(mc.metrics, max_length=100000)

# Greedy policy over the same model, with its own metrics object and log.
validation_policy = GreedyPolicy(model)
validation_metrics = ValidationMetrics()
validation_log = MetricsLogger(validation_metrics, max_length=10000)

# Figure specification consumed by LivePlot: one figure per log, and one
# {"metric", "color"} entry per plotted line.
figures = [
    {
        "source": log,
        "plots": [{"metric": metric, "color": color} for metric, color in lines],
    }
    for log, lines in (
        (
            training_log,
            (
                ("max_action_value_delta", "b"),
                ("rms", "orange"),
            ),
        ),
        (
            validation_log,
            (
                ("training_avg_reward", "b"),
                ("training_avg_rms", "orange"),
                ("validation_avg_reward", "g"),
            ),
        ),
    )
]



(320, 240, 3)


In [3]:
# Build the live plot from the `figures` specification (one figure per
# metrics log). It is refreshed from the training loop via `update_plot()`.
livePlot = LivePlot(figures)

<IPython.core.display.Javascript object>

In [4]:
plot_frequency = 201
validation_frequency = 50

training_policy.exploration = 0.1
env.max_steps = 35
mc.alpha = 0.01
model.epochs = 100
validation_episode_count = 200

try:
    episode_count = 50001
    for i in range(episode_count):
        await mc.run_episode()
        training_log.append(mc.metrics)
        
        if i % validation_frequency == validation_frequency-1:
            validation_metrics.training_avg_reward = np.average(training_log.data["episode_reward"][-validation_frequency:])
            validation_metrics.training_avg_rms = np.average(training_log.data["rms"][-validation_frequency:])
            validation_metrics.validation_avg_reward = validate_policy(env, validation_policy, episode_count=validation_episode_count)
            validation_log.append(validation_metrics)
        
        if i % plot_frequency == plot_frequency-1:            
            livePlot.update_plot() 
except KeyboardInterrupt:
    print("Keyborad interrupt")
    
await env.close()
await future

ValueError: operands could not be broadcast together with shapes (480,640,3) (240,320,3) 

In [None]:
# Reseed NumPy's global RNG for the interactive evaluation cells.
np.random.seed(52346)

# `done` starts as True so that the first run of the stepping cell resets the
# environment instead of trying to act on a missing observation.
done = True

# Episode counter and reward accumulators, all starting from zero.
episode_count = 0
episode_reward = total_reward = avg_reward = 0.0

In [None]:
# Interactive evaluation: each run of this cell either advances the current
# episode by one step or, if the episode has ended, starts a new one.
if not done:
    # Episode in progress: act with the validation policy and accumulate the
    # reward into both the per-episode and the overall totals.
    action = validation_policy.choose_action(obs)
    obs, reward, done, _ = env.step(action)
    episode_reward += reward
    total_reward += reward
else:
    # Nothing running (first run, or the last step finished the episode).
    obs = env.reset()
    done = False
    # A non-zero episode reward is what marks "an episode just finished";
    # on the very first run it is still 0.0, so nothing is counted.
    # NOTE(review): an episode whose total reward is exactly 0 is therefore
    # not counted towards `episode_count` / `avg_reward` -- confirm intended.
    if episode_reward != 0:
        episode_reward = 0.0
        episode_count += 1
        avg_reward = total_reward / episode_count

env.render()
print(f"Reward this episode: {episode_reward}")
print(f"     Average reward: {avg_reward}")
print(f"      Action values: {model.state_values(obs)}")

In [7]:
# Close the environment, then wait for the interface task held in `future`
# (created with asyncio.ensure_future in the setup cell) to finish.
await env.close()
await future