# Import packages

In [1]:
import torch
from torch import nn, optim
import numpy as np
from tensorboardX import SummaryWriter
from dqn_model import Net
import time
from agent import Agent1 as Agent
from experience_buffer import ExperienceBuffer
from config import HYPERPARAMS
from config import ECONPARAMS
from config import calc_loss
from cont_bertrand import ContBertrand
from config import avg_profit_gain
ENV = ContBertrand()
import collections
import os.path
import time 
from static_tft import Tft
# TODO: Standardize the network input to zero mean and unit variance, or alternatively subtract the mean and divide by the range.
# TODO: Run in episodes. Reward the agent with the discounted future cash flow of converging.

# Declare parameter values

In [2]:
# One transition record. The training loop builds these and appends them to
# the agent's experience buffer.
Experience = collections.namedtuple(
    'Experience',
    field_names=['state', 'action', 'reward', 'done', 'new_state'],
)

# Hyperparameter set selected for this run.
params = HYPERPARAMS['full_obs_NB']

# Replay-buffer and optimisation settings.
(
    BATCH_SIZE,
    REPLAY_SIZE,
    REPLAY_START_SIZE,
    LEARNING_RATE,
    SYNC_TARGET_FRAMES,
) = (
    params[key]
    for key in (
        'batch_size',
        'replay_size',
        'replay_start_size',
        'learning_rate',
        'sync_target_frames',
    )
)

# Exploration schedule (epsilon is annealed in the training loop).
EPSILON_DECAY_LAST_FRAME, EPSILON_START, EPSILON_FINAL = (
    params[key]
    for key in ('epsilon_decay_last_frame', 'epsilon_start', 'epsilon_final')
)

# Network sizing (passed to Net as Net(dO_a, nA)), run length, RNG seed and
# checkpoint file name.
# NOTE(review): presumably nA = number of actions and dO_a = observation
# dimension -- confirm against dqn_model.Net.
nA, dO_a, FRAMES, SEED, PATH = (
    params[key] for key in ('nA', 'dO_a', 'frames', 'seed', 'path')
)

# Economic benchmarks of the base-case market.
base_case = ECONPARAMS['base_case']
MONOPOLY_ACTION, NASH_ACTION, MONOPOLY_PROFIT = (
    base_case[key]
    for key in ('monopoly_action', 'nash_action', 'monopoly_profit')
)


# Initialize objects

In [3]:
# PyTorch setup
# Builds everything the training loop needs: device, online/target networks,
# optimizer, replay buffer, environment, learning agent, static opponent,
# TensorBoard writer and the loop's counters.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu') # CPU when GPU is not available <-> device agnostic
print(device)

# Seed before the networks are constructed so weight initialisation is
# reproducible.
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed(SEED)

# Neural network model:
net = Net(dO_a,nA).to(device)
tgt_net = Net(dO_a,nA).to(device) # Prediction target.
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
buffer = ExperienceBuffer(REPLAY_SIZE)

# Reinforcement learning environment
env = ENV
# RL agent
agent = Agent(env, buffer, net, tgt_net, optimizer)
# Static player # TODO need to convert actions to the same range as I'm in
static = Tft(MONOPOLY_PROFIT[0], MONOPOLY_ACTION[0], NASH_ACTION[0])

# Write output statistics to tensorboard
writer = SummaryWriter(comment = "-")

# Initialize variables
env.seed(SEED) # TODO; is this used?
torch.manual_seed(SEED)
frame_idx = 0
ts_frame = 0
ts = time.time()
s_next = env.reset()
epsilon = EPSILON_START

# Load pretrained nets and optimizers if they exist.
# This runs AFTER the agent is created and the counters are initialised:
# previously it referenced `agent` before it existed and the restored
# frame_idx / epsilon / s_next were overwritten by the fresh initialisation.
# The keys match the dict written by torch.save in the training loop.
# NOTE(review): the "nonsense" suffix looks like a deliberate way to keep
# resuming switched off, so it is preserved -- drop it to enable resuming.
# NOTE(review): the training loop saves to "checkpoints/<frame_idx><PATH>",
# while this reads plain PATH -- confirm which file is meant to be resumed.
if os.path.exists(PATH+"nonsense"):
    # map_location lets a checkpoint saved on GPU be restored on CPU.
    checkpoint = torch.load(PATH, map_location=device)
    agent.net.load_state_dict(checkpoint['agent_state_dict'])
    agent.optimizer.load_state_dict(checkpoint['optimizer_dict'])
    # The checkpoint holds no target-network weights, so start the target
    # from the restored online network instead of a random initialisation.
    agent.tgt_net.load_state_dict(agent.net.state_dict())
    frame_idx = checkpoint['frame_idx']
    epsilon = checkpoint['epsilon']
    s_next = checkpoint['env_state']

cpu


# Main loop


In [None]:
# Training – Main loop
#
# Per frame:
#   1. anneal epsilon linearly from EPSILON_START down to EPSILON_FINAL;
#   2. let the learning agent and the static opponent pick actions and step
#      the environment;
#   3. store the transition and log the running mean profit gain;
#   4. once the buffer holds REPLAY_START_SIZE transitions, do one optimizer
#      step on a sampled batch (syncing the target net every
#      SYNC_TARGET_FRAMES frames) and checkpoint every 100_000 frames.
# The loop stops early when agent.length_opt_act exceeds 25_000.

# Indices of the state entries the agent observes.
obs_idx = np.array([0,1])

time_start = time.time()
for t in range(1, FRAMES):
    frame_idx += 1
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
    s = s_next
    action0 = agent.act(net, s[obs_idx], epsilon, device = device.type) # TODO: change obs space
    action1 = static.act(s[1])
    s_next, reward_n, done, _ = env.step(action0, action1, epsilon, frame_idx)
    static.last_profit = reward_n[1]

    # Store the transition with `state` = the observation the action was
    # chosen on and `new_state` = the observation that followed. The two were
    # previously passed in swapped positions, contradicting the field names.
    # NOTE(review): assumes calc_loss / ExperienceBuffer.sample follow the
    # namedtuple's field names -- confirm in config.py / experience_buffer.py.
    exp = Experience(
        state=s[obs_idx],
        action=action0,
        reward=reward_n[0],
        done=done,
        new_state=s_next[obs_idx],
    )
    agent.exp_buffer.append(exp)

    # reward_n has already been indexed above, so it cannot be None here; the
    # former `if reward_n is not None:` guard was always true and is dropped.
    reward = reward_n[0]
    pg = avg_profit_gain(reward)
    agent.total_pg.append(pg)

    # Running mean over (at most) the last 10000 frames.
    mean_pg = np.mean(agent.total_pg[-10000:])
    writer.add_scalar("Agent_avg_profit", mean_pg, frame_idx)
    # TODO: best-model saving (store the net whenever the mean profit gain
    # improves, tracked via agent.best_mean_pg) is currently disabled.
    if agent.length_opt_act > 25_000: # TODO: Don't hardcode
        print("Solved in %d frames!" % frame_idx)
        print(agent.length_opt_act)
        break

    # No learning until the replay buffer is warm.
    if len(agent.exp_buffer) < REPLAY_START_SIZE:
        continue

    if frame_idx % SYNC_TARGET_FRAMES == 0: # Update target network
        agent.tgt_net.load_state_dict(agent.net.state_dict())

    batch = agent.exp_buffer.sample(BATCH_SIZE)
    agent.optimizer.zero_grad()
    loss_t = calc_loss(batch, agent.net, agent.tgt_net, device = device)
    loss_t.backward()
    # Gradient clipping: clamp every gradient element to [-1, 1].
    for param in agent.net.parameters():
        param.grad.clamp_(-1, 1)
    agent.optimizer.step()
    writer.add_scalar("loss", loss_t, frame_idx)

    if frame_idx % 100_000 == 0:
        print(frame_idx)
        checkpoint_path = "checkpoints/" + str(frame_idx) + PATH
        # torch.save does not create missing directories; make sure the
        # target directory exists so a long run is not lost at save time.
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        torch.save({
            'agent_state_dict': agent.net.state_dict(),
            'optimizer_dict': agent.optimizer.state_dict(),
            'epsilon': epsilon,
            'frame_idx': frame_idx,
            'env_state': s_next
            },  checkpoint_path)

writer.close()
time_stop = time.time()
# Wall-clock duration of the training loop, in seconds.
print(time_stop-time_start)

In [None]:
# Notebook shell command: launch TensorBoard on the `runs` log directory,
# bound to localhost. The cell blocks while the server is running.
!tensorboard --logdir runs --host localhost
# Dashboard URL (6006 is TensorBoard's default port): http://localhost:6006/

In [None]:
# Display the agent's `best_action` attribute as the cell output.
# NOTE(review): the attribute is defined in agent.py, not in this notebook.
agent.best_action