In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from src.river import River

# Single shared environment instance for the whole notebook. Later cells read
# env.S0, env.S, env.G and call env.T(s, a, s_), env.R(s, a, s_) and env.plot(...).
env = River()

In [2]:
class Pi(nn.Module):
    """Stochastic policy network: Linear -> ReLU -> Linear -> Softmax.

    Args:
        input_size: number of input features fed to the first linear layer.
        output_size: number of outputs (one probability per action).
        hidden_size: width of the single hidden layer (default 10).
    """

    def __init__(self, input_size, output_size, hidden_size=10):
        super().__init__()
        # Sub-modules keep their original names and creation order so that
        # state_dict keys and random initialisation are unaffected.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return probabilities over the last dimension of the output for input `x`."""
        hidden = self.relu(self.fc1(x))
        return self.softmax(self.fc2(hidden))

# select_action feeds the network a one-hot state vector with 10 entries, so the
# input layer must be 10 wide; a 1-wide input layer cannot consume that encoding.
input_size = 10
output_size = 4

model = Pi(input_size, output_size)

In [3]:
def select_action(policy, state, best=False, n_states=None):
    """Pick an action for `state` from `policy`.

    The state index is one-hot encoded into a (1, n_states) float tensor and
    passed through `policy`, whose output is treated as action probabilities.

    Args:
        policy: callable mapping a (1, n_states) float tensor to probabilities.
        state: integer index of the current state (position of the 1 in the
            one-hot vector).
        best: if True, return the greedy (arg-max) action instead of sampling.
        n_states: length of the one-hot vector. When None it is taken from
            `policy.fc1.in_features` if the policy exposes such a layer, and
            falls back to 10 otherwise.

    Returns:
        If `best` is True: a 0-dim tensor holding the arg-max action index.
        Otherwise: a tuple `(action, log_prob)` where `action` is a Python int
        sampled from the categorical distribution and `log_prob` is the
        log-probability tensor of that action (carries gradient).
    """
    if n_states is None:
        first_layer = getattr(policy, "fc1", None)
        n_states = getattr(first_layer, "in_features", 10)

    one_hot = np.zeros(n_states)
    one_hot[state] = 1
    encoded = torch.from_numpy(one_hot).float().unsqueeze(0)

    if best:
        # Greedy evaluation never needs gradients; skip building the graph.
        with torch.no_grad():
            return torch.argmax(policy(encoded))

    dist = torch.distributions.Categorical(policy(encoded))
    action = dist.sample()
    return action.item(), dist.log_prob(action)

def generate_episode(pi, size=1000):
    """Roll out one episode of at most `size` steps under policy `pi`.

    The start state is drawn from `env.S0`. At every step an action is sampled
    with `select_action`, the successor is drawn from `env.S` using the
    probabilities `env.T(state, action, ·)`, and `env.R(state, action, successor)`
    is recorded together with the action's log-probability.

    The rollout stops after recording a step whose *source* state is in
    `env.G`, i.e. one transition out of a goal state is still recorded.
    NOTE(review): if the episode is meant to end on arrival at a goal, the
    check should be on the successor instead — confirm against the environment.

    Returns:
        (rewards, log_probs): two equally long lists, one entry per step.
    """
    rewards, log_probs = [], []

    state = np.random.choice(env.S0)
    for _ in range(size):
        action, log_prob = select_action(pi, state)

        transition_probs = [env.T(state, action, candidate) for candidate in env.S]
        next_state = np.random.choice(env.S, p=transition_probs)
        reward = env.R(state, action, next_state)

        rewards.append(reward)
        log_probs.append(log_prob)

        if state in env.G:
            break
        state = next_state

    return rewards, log_probs

def _discounted_returns(rewards, gamma):
    """Return [G_0, ..., G_{T-1}] with G_t = sum_{k>=0} gamma**k * rewards[t+k]."""
    returns = [0.0] * len(rewards)
    running = 0.0
    # Backward recursion G_t = r_t + gamma * G_{t+1}: linear time and no
    # negative powers of gamma, so gamma == 0 and long episodes are safe.
    for t in range(len(rewards) - 1, -1, -1):
        running = float(rewards[t]) + gamma * running
        returns[t] = running
    return returns


def reinforce(policy, optimizer, gamma=.9, max_iter=1000, epi_sizes=1000):
    """Train `policy` with the REINFORCE policy-gradient update.

    For each of `max_iter` iterations one episode is generated with
    `generate_episode(policy, epi_sizes)`, and a gradient step is taken on
    `sum_t -log_prob_t * G_t`, where `G_t` is the discounted return from step
    `t` onward (the same quantity the previous nested-sum expression produced).

    Args:
        policy: policy passed through to `generate_episode`.
        optimizer: optimizer exposing `zero_grad()` and `step()` for the policy
            parameters.
        gamma: discount factor.
        max_iter: number of episodes / gradient steps.
        epi_sizes: maximum number of steps per episode.

    Returns:
        Running mean of the undiscounted episode reward sums over all episodes
        (0 if `max_iter` is 0).
    """
    mean_rewards = 0
    for i in range(max_iter):
        rewards, log_probs = generate_episode(policy, epi_sizes)

        # Incremental mean of the total (undiscounted) reward per episode.
        mean_rewards += (sum(rewards) - mean_rewards) / (i + 1)

        if not log_probs:
            # Empty episode: nothing to differentiate, skip the update.
            continue

        returns = torch.tensor(_discounted_returns(rewards, gamma),
                               dtype=torch.get_default_dtype())

        # reshape(-1) lets both 0-dim and shape-(1,) log-probs be concatenated.
        policy_loss = torch.cat(
            [-log_prob.reshape(-1) * G for log_prob, G in zip(log_probs, returns)]
        ).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

    return mean_rewards



In [4]:
# Hyper-parameters for the training run.
input_size = 10      # length of the one-hot state vector built by select_action
output_size = 4      # number of policy outputs (one per action)
learning_rate = 1e-5
num_episodes = 1000
episode_size = 100
gamma = 0.999
seed = 0

# Seed both RNGs used by the run so Restart & Run All reproduces the result:
# NumPy's global RNG drives np.random.choice in generate_episode, torch's drives
# parameter initialisation and action sampling.
np.random.seed(seed)
torch.manual_seed(seed)

policy = Pi(input_size, output_size)
# Plain SGD; optim.Adam(policy.parameters(), lr=learning_rate) is a drop-in alternative.
optimizer = optim.SGD(policy.parameters(), lr=learning_rate)

reinforce(policy, optimizer, gamma, num_episodes, episode_size)

-82.79399999999997

In [5]:
# Greedy (arg-max) action of the trained policy for every state in env.S.
greedy_actions = [select_action(policy, s, True) for s in env.S]
env.plot(greedy_actions, True)

 _____________________________ 
|  →  |  →  |  →  |  →  |  →  |
|_____|_____|_____|_____|_____|
|  →  |  →  |  →  |  →  |  ↑  |
|_____|_____|_____|_____|_____|

