In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

import sys
# Make the project's Source directory importable (relative to the notebook's
# working directory). A forward slash is used because Python accepts it on
# Windows as well as POSIX, whereas a backslash only works on Windows.
sys.path.append("../Source")

import numpy as np
from Environments.CleanBotEnv import CleanBotEnv
from Models.TableModel import TableModel
from Methods.MonteCarloMethods import ConstAlphaMC, AveragingMC
from Methods.Policies import EpsilonGreedyPolicy, GreedyPolicy
from Utilities.Eval import StatsLogger, ScrollingPlot, test_policy

In [None]:
class TestingStats:
    """Plain mutable record of the two average-reward figures tracked during evaluation.

    Attributes:
        training_avg_reward: average reward over recent training episodes.
        testing_avg_reward: average reward reported for the testing policy.
    """

    def __init__(self) -> None:
        # Both figures start at zero until the first evaluation overwrites them.
        self.training_avg_reward = self.testing_avg_reward = 0.0

# Experiment wiring. Statement order is kept as-is: the constructors below may
# draw from NumPy's global RNG, which is seeded first for repeatable runs.
np.random.seed(643674)
# NOTE(review): the meaning of the argument 4 is not visible here (presumably
# the environment/grid size) - confirm against CleanBotEnv.
env = CleanBotEnv(4)
# A single model instance is shared by the training policy, the MC learner and
# the testing policy.
model = TableModel(env)
# Second argument is presumably the exploration rate; the training cell also
# assigns training_policy.exploration = 0.1 - TODO confirm they are the same knob.
training_policy = EpsilonGreedyPolicy(model, 0.1)
mc = ConstAlphaMC(env, model, training_policy)
# Logger for the learner's stats object (mc.stats is appended after every episode
# in the training cell).
trainingStats = StatsLogger(mc.stats, max_length=100000)
testing_policy = GreedyPolicy(model)
# testingStat is the mutable record; testingStats is the logger that receives it.
testingStat = TestingStats()
testingStats = StatsLogger(testingStat, max_length=2000)

# Plot specification handed to ScrollingPlot: one entry per figure, each naming
# the stats logger it reads from and the stat series (with line colour) to draw.
figures = [
    # Figure 1: per-episode training diagnostics.
    {
        "source": trainingStats,
        "plots": [
            {"stat": "max_action_value_delta", "color": "b"},
        ],
    },
    # Figure 2: training vs. testing average reward.
    {
        "source": testingStats,
        "plots": [
            {"stat": "training_avg_reward", "color": "b"},
            {"stat": "testing_avg_reward", "color": "g"},
        ],
    },
]



In [None]:
# Build the plot helper from the `figures` specification; the training loop
# calls plotHelper.update_plot() periodically to refresh it.
# NOTE(review): presumably this also creates the matplotlib figures, which is why
# it lives in its own cell - confirm against ScrollingPlot.
plotHelper = ScrollingPlot(figures)

In [None]:
# Training run: repeatedly executes Monte Carlo episodes, periodically evaluates
# the greedy policy, and refreshes the plots. Interrupt the kernel to stop early.

# How often (in training episodes) the plots are redrawn.
plot_update_steps = 2000
# The original, misspelled name is kept as an alias so any other cell that still
# reads it keeps working.
plot_upate_steps = plot_update_steps
# How often (in training episodes) the testing policy is evaluated.
testing_update_steps = 500

training_policy.exploration = 0.1
env.max_steps = 100  # presumably the per-episode step cap - TODO confirm
episode_count = 5000000
mc.alpha = 0.005  # presumably the constant step size of ConstAlphaMC - TODO confirm
testing_episode_count = 200

try:
    for i in range(episode_count):
        mc.run_episode()
        trainingStats.append(mc.stats)

        # Every `testing_update_steps` completed episodes: record the mean training
        # reward over that window and the reward test_policy reports for the
        # testing policy.
        if (i + 1) % testing_update_steps == 0:
            recent_rewards = trainingStats.data["episode_reward"][-testing_update_steps:]
            testingStat.training_avg_reward = np.average(recent_rewards)
            testingStat.testing_avg_reward = test_policy(
                env, testing_policy, episode_count=testing_episode_count
            )
            testingStats.append(testingStat)

        # Every `plot_update_steps` completed episodes: redraw the plots.
        if (i + 1) % plot_update_steps == 0:
            plotHelper.update_plot()
except KeyboardInterrupt:
    # Stopping the kernel is the intended way to end a long run early; report it
    # instead of showing a traceback.
    print("Keyboard interrupt")

In [None]:
# Reset the bookkeeping for the interactive step-through demo and reseed NumPy's
# global RNG so the demo is repeatable.
np.random.seed(52346)

# `done = True` makes the first run of the stepping cell start a fresh episode.
done = True
episode_count = 0
# Per-episode reward, running total across episodes, and their running average.
episode_reward = total_reward = avg_reward = 0.0

In [None]:
# One interactive step of the testing-policy demo; re-run this cell repeatedly.
# Each run either advances the current episode by one action or, when the
# previous episode has finished, resets the environment and updates the average.
if not done:
    # Mid-episode: let the testing policy pick an action and apply it.
    action = testing_policy.choose_action(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    episode_reward += reward
else:
    # Previous episode ended (or nothing has run yet): start a new one.
    obs = env.reset()
    done = False
    # A non-zero episode reward is used as the signal that an episode actually
    # finished, which skips the very first run of this cell.
    # NOTE(review): an episode whose rewards sum to exactly 0 is therefore not
    # counted in episode_count / avg_reward - confirm whether that can happen.
    if episode_reward != 0:
        episode_count += 1
        avg_reward = total_reward / episode_count
        episode_reward = 0.0

env.render()
print(f"Reward this episode: {episode_reward}")
print(f"     Average reward: {avg_reward}")
print(f"      Action values: {model.action_values(obs)}")