In [1]:
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
import random

from environment import Environment
from reinforce.model import ReinforceModel

In [2]:
# Notebook-wide model holding a single agent; later cells always address it
# as agent index 0. action_size=4 matches the four move actions (0-3) used in
# the training and evaluation loops.
# NOTE(review): state_size=4 presumably matches the length of
# Environment.get_state() -- confirm against the environment module.
model = ReinforceModel(
    initial_population=1,
    state_size=4,
    action_size=4,
)

In [3]:
def train(num_episodes, max_steps, agent_model=None, env_factory=None, verbose=True):
    """Run training episodes for agent 0 and report how often it succeeds.

    Every episode builds a fresh environment and lets the agent act until the
    environment reports it is done or the step budget is used up. Each step's
    reward is forwarded to the model through ``update_reward``; an episode
    that runs out of steps sends one extra ``-1``. ``update_all_agents(0)`` is
    called once at the end of every episode.

    Args:
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of actions per episode; an episode that is
            still not done after this many actions counts as a failure.
        agent_model: Object providing ``predict_action``, ``update_reward``
            and ``update_all_agents``. Defaults to the notebook-level
            ``model``.
        env_factory: Zero-argument callable returning a new environment.
            Defaults to building ``Environment(rows=16, cols=16)``.
        verbose: When true (the default), redraw the cell output after every
            episode with the outcome and the running success counter.

    Returns:
        A ``(rewards, success_ratio)`` tuple. ``rewards`` holds the summed
        per-step reward of each episode. ``success_ratio`` is the fraction of
        episodes that finished within the budget, or ``0.0`` when no episode
        was run.
    """
    # Fall back to the notebook globals only when nothing was injected, so the
    # original two-argument call keeps working unchanged.
    agent = model if agent_model is None else agent_model
    make_env = env_factory if env_factory is not None else (
        lambda: Environment(rows=16, cols=16)
    )

    rewards = []
    success_count = 0

    for idx in range(num_episodes):
        env = make_env()

        steps_taken = 0
        lifetime_reward = 0
        couldnt_solve = False

        while not env.is_done():
            # ">=" rather than "==" so a negative or non-integer budget still
            # terminates the episode instead of looping until the env is done.
            if steps_taken >= max_steps:
                couldnt_solve = True
                break

            state = env.get_state()
            action, _ = agent.predict_action(0, state)

            # Actions 0-3 map to up/down/left/right. Any other value performs
            # no move and yields a reward of 0, but still consumes a step.
            reward = 0
            if action == 0:
                reward = env.move_up()
            elif action == 1:
                reward = env.move_down()
            elif action == 2:
                reward = env.move_left()
            elif action == 3:
                reward = env.move_right()

            lifetime_reward += reward
            agent.update_reward(0, reward)

            steps_taken += 1

        if verbose:
            clear_output(wait=True)

        if couldnt_solve:
            # NOTE(review): this timeout penalty is sent to the model but is
            # not added to lifetime_reward, so the returned rewards exclude
            # it -- confirm that this is intended.
            agent.update_reward(0, -1)
            if verbose:
                print("\033[91mFailure\033[m")
        else:
            success_count += 1
            if verbose:
                print("\033[92mSuccess\033[m")

        if verbose:
            print(f"Success: [{success_count}/{idx+1}]")

        agent.update_all_agents(0)

        rewards.append(lifetime_reward)

    # Guard the ratio so a zero-episode call returns 0.0 instead of raising
    # ZeroDivisionError.
    success_ratio = success_count / len(rewards) if rewards else 0.0
    return rewards, success_ratio

In [4]:
# Training budget: how many episodes to run and how many actions each episode
# may take before it is counted as a failure.
num_episodes = 5_000
max_steps = 100

# train() hands back the per-episode reward totals and the overall success rate.
training_outcome = train(num_episodes=num_episodes, max_steps=max_steps)
rewards, success_ratio = training_outcome
print(f"Final success ratio: {success_ratio}")

[92mSuccess[m
Success: [4168/5000]
Final success ratio: 0.8336


In [5]:
def test():
    env = Environment(rows=16, cols=16)
    
    i = 0
    success = True
    while (not env.is_done()):
        if i == max_steps:
            success = False
            break

        state = env.get_state()
        action, _ = model.predict_action(0, state)

        reward = 0
        if action == 0:
            reward = env.move_up()
        elif action == 1:
            reward = env.move_down()
        elif action == 2:
            reward = env.move_left()
        elif action == 3:
            reward = env.move_right()  

        i += 1
    
    return success

In [7]:
# Evaluate the agent over many independent episodes, redrawing the running
# tally after each one, then print the overall success fraction.
success_count = 0
total_runs = 5000

for run_number in range(1, total_runs + 1):
    solved = test()

    clear_output(wait=True)
    if not solved:
        print("\033[91mFailure\033[m")
    else:
        success_count += 1
        print("\033[92mSuccess\033[m")

    print(f"Success: [{success_count} / {run_number}]")

print(success_count / total_runs)

[92mSuccess[m
Success: [4756 / 5000]
0.9512


In [8]:
import torch

# Persist the state dict of agent 0 so it can be reloaded later.
checkpoint_path = 'experiment-5-5k.pth'
agent_state = model.agents[0].state_dict()
torch.save(agent_state, checkpoint_path)