In [1]:
import numpy as np
from IPython.display import clear_output
import glob
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
ENVIRONMENT_SIZE = 16
BATCH_SIZE = 32

In [3]:
class BCDataset(Dataset):
    
    def __init__(self, npy_dir):
        files = glob.glob(os.path.join(npy_dir, "*.npy"))
        self.trajectories = []
        
        for file in files:
            self.trajectories.append(np.load(file, allow_pickle=True))
        
        self.trajectories = np.vstack(self.trajectories)
        
    def __len__(self):
        return len(self.trajectories)
    
    def __getitem__(self, idx):
        state, action = self.trajectories[idx]
#         state = [state[i] / ENVIRONMENT_SIZE if (state[i] != -1) and (i in state[0::4] or i in state[1::4]) 
#                  else state[i]
#                  for i in range(len(state))]
        return np.array(state, dtype=np.float32), action

In [4]:
training_dataset = BCDataset("expert-dir")

In [5]:
len(training_dataset)

5500

In [12]:
from collections import Counter, OrderedDict

def get_training_distribution(dataset):
    actions = Counter([dataset[i][1] for i in range(len(dataset))])
    actions = {action:1/count for action, count in actions.items()}
    actions = OrderedDict(sorted(actions.items()))
    return torch.tensor(list(actions.values()))

In [13]:
training_dataset[0]

(array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  8.,
         1.,  1.,  1.,  6.,  0.,  1., -1.], dtype=float32), 0)

In [22]:
arr = [-1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  8.,
         1.,  1.,  1.,  6.,  0.,  1., -1.]

In [31]:
list(set([arr[i:i + 4] for i in range(0, len(arr), 4)][0])) == [-1.0]

True

In [14]:
training_dataloader = DataLoader(training_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

In [15]:
len(training_dataloader)

172

In [19]:
class BCModel(nn.Module):
    
    def __init__(self, state_size, action_size, hidden_size=256):
        super(BCModel, self).__init__()
        
        self.fc1 = nn.Linear(in_features=state_size, out_features=hidden_size)
        self.leaky_relu_1 = nn.LeakyReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size//2)
        self.leaky_relu_2 = nn.LeakyReLU()
        self.fc3 = nn.Linear(in_features=hidden_size//2, out_features=action_size)
        
    def forward(self, x):
        out = self.leaky_relu_1(self.fc1(x))
        out = self.fc2(out)
        out = self.fc3(out)
        
        return F.softmax(out, dim=1)

In [20]:
STATE_SIZE = 20
ACTION_SIZE = 5

model = BCModel(state_size=STATE_SIZE, action_size=ACTION_SIZE)

loss_fn = nn.CrossEntropyLoss(weight=get_training_distribution(training_dataset))
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [21]:
from tqdm import tqdm, trange

EPISODES = 2000

episode_losses = []
t = trange(EPISODES, desc="Episode")
for current_episode_num in t:
    current_episode_loss = 0
    
    for i, data in enumerate(training_dataloader):
        states, actions = data
        
        optimizer.zero_grad()
        
        predicted_actions = model(states.float())
        loss = loss_fn(predicted_actions, actions)
        loss.backward()
        optimizer.step()
        
        current_episode_loss += loss.item()
        
    t.set_description(f"Loss {current_episode_loss / len(training_dataloader)}")
    t.refresh()
       
    episode_losses.append(current_episode_loss / len(training_dataloader))   

Loss 1.091066651566084:  15%|█▌        | 306/2000 [02:30<13:55,  2.03it/s] 


KeyboardInterrupt: 

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.figure(figsize=(15, 10))
plt.plot(episode_losses)
plt.show()

In [None]:
import time

from environment import Environment

def test(max_steps, speed=0.5, agent_pos=None, food_pos=None, render=True):
    model.eval()
    
    env = Environment(rows=16, cols=16, scope=10)

    if agent_pos != None:
        env.current_pos = env.pos(agent_pos[0], agent_pos[1])

    if food_pos != None:
        env.food = env.pos(food_pos[0], food_pos[1])

    i = 0
    success = True
    while (not env.is_done()):
        clear_output(wait=True)
        print(f"Step: {i+1}, Food: {env.consumed_count}")
        if i == max_steps or env.num_food == 0:
            success = False
            break

        state = env.get_state()
        state = torch.from_numpy(state).unsqueeze(0)
        
        with torch.no_grad():
            action_probs = model(state.float())
            print(f"Action prob: {action_probs}")
            _, action = torch.max(action_probs, 1)
            action = action.item()
            print(f"Action: {action}")

        reward = 0
        if action == 0:
            reward = env.move_up()
        elif action == 1:
            reward = env.move_left()
        elif action == 2:
            reward = env.move_down()
        elif action == 3:
            reward = env.move_right()
        elif action == 4:
            reward = env.ingest()

        if render:
            env.render()

        i += 1

        time.sleep(speed)

    return success, env.consumed_count

In [None]:
# test(max_steps=250, speed=0.1, render=True)

In [None]:
from collections import Counter

actions = []
action_num_to_name = {0: "up", 1: "left", 2: "down", 3: "right", 4: "ingest"}

for val in training_dataset:
    actions.append(val[1])
    
action_distribution = dict(Counter(actions))
action_distribution = {action:1/count for action, count in action_distribution.items()}
print(action_distribution)