In [1]:
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
import random

from environment import Environment
from reinforce.model import ReinforceModel
# from ddpg.model import DDPGModel

In [2]:
# Single-agent model: every later cell addresses agent index 0.
# action_size=4 matches the four moves (up/down/left/right) dispatched in train()/test().
model = ReinforceModel(
    initial_population=1,
    state_size=8,
    action_size=4,
)

In [3]:
def train(num_episodes, max_steps):
    """Train agent 0 of the module-level ``model`` on fresh 16x16 environments.

    Every episode creates a new ``Environment``, lets agent 0 act until the
    environment reports it is done or ``max_steps`` actions have been taken,
    and then calls ``model.update_all_agents(0)``. Progress is printed after
    each episode (the previous output is cleared first).

    Args:
        num_episodes: Number of episodes to run. With 0 (or fewer) episodes
            nothing is run and the success ratio is reported as 0.0.
        max_steps: Maximum number of actions per episode. An episode that is
            still not done after this many actions counts as a failure.

    Returns:
        A ``(rewards, success_ratio)`` tuple. ``rewards`` holds, per episode,
        the sum of the rewards returned by the environment moves (the extra
        -1 timeout penalty is sent to the model only and is not included).
        ``success_ratio`` is ``success_count / num_episodes``.
    """
    rewards = []
    success_count = 0

    for idx in range(num_episodes):
        env = Environment(rows=16, cols=16)

        i = 0
        lifetime_reward = 0

        couldnt_solve = False
        while (not env.is_done()):
            if i == max_steps:
                # Step budget exhausted before the environment was done.
                couldnt_solve = True
                break

            state = env.get_state()
            action, _ = model.predict_action(0, state)

            # Any action outside 0..3 moves nothing and yields a reward of 0.
            reward = 0
            if action == 0:
                reward = env.move_up()
            elif action == 1:
                reward = env.move_down()
            elif action == 2:
                reward = env.move_left()
            elif action == 3:
                reward = env.move_right()

            lifetime_reward += reward

            model.update_reward(0, reward)

            # Disabled next-state bookkeeping, kept for reference:
            # next_state = env.get_state()
            # model.update_next_state(0, next_state)

            i += 1

        clear_output(wait=True)
        if couldnt_solve:
            # NOTE(review): this penalty is recorded without a matching
            # predict_action call -- confirm the model tolerates the extra
            # reward entry.
            model.update_reward(0, -1)
            print("\033[91mFailure\033[m")
        else:
            print("\033[92mSuccess\033[m")
            success_count += 1

        print(f"Success: [{success_count}/{idx+1}]")

        model.update_all_agents(0)
        # Disabled alternative update, kept for reference:
        # model.agents[0].learn(experiences=model.agents[0].memory.sample(), gamma=0.95)

        rewards.append(lifetime_reward)

    # Guard the ratio: with no episodes there is nothing to divide by.
    success_ratio = success_count / num_episodes if num_episodes > 0 else 0.0
    return rewards, success_ratio

In [4]:
# Training configuration. `max_steps` is also read as a global by test().
num_episodes = 20_000
max_steps = 100

rewards, success_ratio = train(num_episodes, max_steps)
print(f"Final success ratio: {success_ratio}")

[92mSuccess[m
Success: [19308/20000]
Final success ratio: 0.9654


In [5]:
def test():
    """Play one rendered rollout of agent 0 on a new 16x16 environment.

    Steps until the environment reports it is done or the step limit is
    reached, clearing the cell output and redrawing the grid after every
    move, with a 0.1 s pause between frames.

    Relies on two names from the notebook namespace: ``model`` (the agent
    container) and ``max_steps`` (the step limit), so the cells defining
    them must have been run first. Returns nothing.
    """
    env = Environment(rows=16, cols=16)

    i = 0
    # Same evaluation order as checking the limit inside the loop:
    # is_done() first, then the step budget.
    while not env.is_done() and i != max_steps:
        state = env.get_state()
        action, _ = model.predict_action(0, state)

        # The move's return value (the reward) is not needed when only watching.
        if action == 0:
            env.move_up()
        elif action == 1:
            env.move_down()
        elif action == 2:
            env.move_left()
        elif action == 3:
            env.move_right()

        clear_output(wait=True)
        # The sign is flipped before printing; presumably
        # compute_abs_xy_distance() returns a non-positive value -- confirm.
        print(f"Distance: {-env.compute_abs_xy_distance()}")
        env.render()

        i += 1

        time.sleep(0.1)
    

In [None]:
# Watch a single rollout of agent 0; the grid is redrawn in this cell's output after every move.
test()

Distance: 14
O O O O O O O O O O O O O O O O 
O O [91mF[m O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O O O O O O O O O O O O O O O 
O O [94mX[m O O O O O O O O O O O O O 



In [None]:
import torch

# Persist the weights of agent 0 so the trained policy can be reloaded later.
checkpoint_path = 'reinforce-model-20k-episodes.pth'
trained_agent = model.agents[0]
torch.save(trained_agent.state_dict(), checkpoint_path)