In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import torch.nn.utils as torch_utils
from torch.nn.functional import one_hot
import matplotlib.pyplot as plt
import random

In [2]:
class Net(nn.Module):
    """Small MLP policy: two LayerNorm+ReLU hidden layers and a softmax head.

    Args:
        input_size: width of the input feature vector.
        hidden_size: width of both hidden layers.
        output_size: number of output probabilities.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.ln1 = nn.LayerNorm(hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

        # Only the layers that feed a ReLU get Kaiming initialisation;
        # fc3 keeps the default nn.Linear initialisation.
        for relu_fed_layer in (self.fc1, self.fc2):
            nn.init.kaiming_uniform_(relu_fed_layer.weight, nonlinearity='relu')

    def forward(self, x):
        """Return softmax probabilities over the last dimension of ``x``.

        The result is passed through ``squeeze()``, which drops every
        size-1 dimension, so a batch of one comes back as a 1-D tensor.
        """
        hidden = self.relu(self.ln1(self.fc1(x)))
        hidden = self.relu(self.ln2(self.fc2(hidden)))
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1).squeeze()

class Memory:
    """Bounded FIFO replay buffer of ``(command, action, reward)`` tuples.

    Attributes:
        capacity: maximum number of entries kept at once.
        memory: list of stored tuples, oldest first.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, command, action, reward):
        """Append one entry, evicting the oldest entry when the buffer is full.

        A non-positive ``capacity`` means nothing can be stored, so the call
        is a no-op instead of trying to pop from an empty list.
        """
        if self.capacity <= 0:
            return
        if len(self.memory) >= self.capacity:
            self.memory.pop(0)
        self.memory.append((command, action, reward))

    def sample(self, batch_size):
        """Return up to ``batch_size`` distinct entries chosen at random.

        When fewer than ``batch_size`` entries are stored, all of them are
        returned (in random order); an empty buffer yields ``[]``.
        """
        return random.sample(self.memory, min(len(self.memory), batch_size))

In [3]:
def cmd_to_tensor(cmd, num_classes=4):
    """One-hot encode an integer command as a float tensor.

    Args:
        cmd: integer command index (or a nested list of indices).
        num_classes: length of the one-hot vector; defaults to 4, the
            number of commands used throughout this notebook.

    Returns:
        A float tensor whose last dimension has size ``num_classes``.
    """
    return one_hot(torch.tensor(cmd), num_classes).float()

def get_move(command):
    """Sample an action index from the policy's distribution for ``command``.

    As a debugging aid, the policy's current output for each of the
    commands 0-3 is printed before the action is drawn.
    """
    with torch.no_grad():
        probs = model(cmd_to_tensor(command))
        # Evaluate the four commands in order, exactly as they are printed.
        per_command = [model(cmd_to_tensor(idx)) for idx in range(4)]
        print(f"""
        0: {per_command[0]}
        1: {per_command[1]}
        2: {per_command[2]}
        3: {per_command[3]}
        """)
        choice = torch.multinomial(probs, 1)
        return choice.item()
    
def _get_reward_and_lr(move, model_output, good):
    p = model_output[move]
    
    if p > 0.9 and good:
        return None, None
    # print(f"""
    # {p}
    # {move}
    # {good}
    # """)
    min_c = 1e-6
    max_c = 0.05
    m = max_c-1*min_c
    q = min_c
    if good:
        m = -1*m
        q = max_c
    lr = m*p + q
    
    min_c = 1
    max_c = 100
    if not good:
        min_c = -10
        max_c = -1
    m = max_c-1*min_c
    q = min_c
    if good:
        m = -1*m
        q = max_c
    reward = m*p + q
    
    # print(reward.item(), lr.item())
    return reward.item(), lr.item()
    return reward.item(), 0.05
    
def _base_train(command, move, good, is_from_memory):
    """Run one policy-gradient style update for a single (command, move) pair.

    Args:
        command: integer command that was shown to the model.
        move: index of the action that was taken.
        good: truthy when the move was correct.
        is_from_memory: True when the sample is a replay; replayed samples
            whose move already has probability >= 0.6 are skipped.

    The step is also skipped when ``_get_reward_and_lr`` returns
    ``(None, None)``. Otherwise the optimizer's learning rate is overwritten
    for this step and gradients are clipped to a norm of 0.8.
    """
    probs = model(cmd_to_tensor(command))
    action_idx = torch.tensor(move).long()

    if is_from_memory:
        if probs[action_idx] >= 0.6:
            return

    reward, step_lr = _get_reward_and_lr(move, probs, good)
    if reward is None:
        return

    # Negative log-probability of the taken action, weighted by the reward.
    weight = torch.tensor(reward).float()
    loss = -torch.log(probs[action_idx]) * weight

    for group in optimizer.param_groups:
        group['lr'] = step_lr

    optimizer.zero_grad()
    loss.backward()
    torch_utils.clip_grad_norm_(model.parameters(), max_norm=0.8)
    optimizer.step()

def replay_memory():
    """Re-train on up to 100 randomly sampled entries from the replay buffer."""
    replay_batch = memory.sample(100)
    for past_command, past_move, was_good in replay_batch:
        _base_train(past_command, past_move, was_good, True)
        
def train(command, move, good):
    """Learn from one fresh sample, replay stored ones, then store the sample.

    The new sample is pushed to the buffer only after the replay pass, so it
    is never replayed in the same call that introduced it.
    """
    _base_train(command, move, good, is_from_memory=False)
    replay_memory()
    memory.push(command, move, good)

# Everything in this notebook runs on the CPU.
device = torch.device("cpu")

# Four one-hot command inputs -> four hidden units -> four action probabilities.
model = Net(input_size=4, hidden_size=4, output_size=4).to(device)

# Replay buffer that keeps at most 100 (command, move, good) entries.
memory = Memory(capacity=100)

# The lr given here is only a starting value: _base_train overwrites it
# before every optimizer step.
# Alternative kept for reference:
# optimizer = optim.Adam(model.parameters(), lr=0.1)
optimizer = optim.RMSprop(model.parameters(), alpha=0.9, lr=0.001)

In [16]:
# Five rounds on command 3: sample a move, count it as good only when it
# equals the command, then train on that feedback.
train_data = [3] * 5

for c in train_data:
    m = get_move(c)
    print(f"{c} -> {m}")
    reward = m == c
    if reward:
        print("goood")
    train(c, m, reward)


        0: tensor([0.7522, 0.0137, 0.2091, 0.0250])
        1: tensor([0.0127, 0.7205, 0.0085, 0.2583])
        2: tensor([0.0830, 0.0022, 0.6914, 0.2234])
        3: tensor([0.0625, 0.1141, 0.1055, 0.7179])
        
3 -> 3
goood

        0: tensor([0.7516, 0.0135, 0.2096, 0.0253])
        1: tensor([0.0098, 0.7308, 0.0053, 0.2540])
        2: tensor([0.0676, 0.0013, 0.6626, 0.2685])
        3: tensor([0.0464, 0.0688, 0.0660, 0.8188])
        
3 -> 1

        0: tensor([0.6839, 0.0141, 0.2764, 0.0255])
        1: tensor([0.0655, 0.6168, 0.0255, 0.2921])
        2: tensor([0.0612, 0.0011, 0.6644, 0.2733])
        3: tensor([0.0361, 0.0482, 0.0435, 0.8721])
        
3 -> 2

        0: tensor([0.6855, 0.0154, 0.2766, 0.0225])
        1: tensor([0.0109, 0.7586, 0.0038, 0.2266])
        2: tensor([4.3794e-02, 6.5618e-04, 6.8705e-01, 2.6850e-01])
        3: tensor([0.0364, 0.0386, 0.0402, 0.8849])
        
3 -> 3
goood

        0: tensor([0.6831, 0.0167, 0.2767, 0.0235])
        1: tensor([

In [7]:
# Show the policy's current output for each of the four commands.
# This runs outside torch.no_grad(), so the printed tensors carry a grad_fn.
print("\n" + "".join(f"{cmd}: {model(cmd_to_tensor(cmd))}\n" for cmd in range(4)))


0: tensor([0.4234, 0.2938, 0.2003, 0.0825], grad_fn=<SqueezeBackward0>)
1: tensor([0.4354, 0.2907, 0.2006, 0.0733], grad_fn=<SqueezeBackward0>)
2: tensor([0.4323, 0.2916, 0.2005, 0.0755], grad_fn=<SqueezeBackward0>)
3: tensor([0.4345, 0.2910, 0.2006, 0.0740], grad_fn=<SqueezeBackward0>)

