In [1]:
import torch
%load_ext tensorboard
from torch.utils.tensorboard import SummaryWriter
from torch import nn, optim
import numpy as np
import random
from dqn_model import Net
from agent import Agent1 as Agent
from experience_buffer import ExperienceBuffer
from config import HYPERPARAMS, ECONPARAMS, calc_loss, profit_gain, normalize_state
from calc_nash_monopoly import profit
from cont_bertrand import ContBertrand
import collections
import os.path
import time 
from static_tft import Amtft

In [2]:
# One replay-buffer transition: (state, action, reward, done, new_state).
Experience = collections.namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'done', 'new_state'],
)

# Pick the hyperparameter set and the economic scenario used by this run.
params = HYPERPARAMS['full_obs_NB']
eparams = ECONPARAMS['base_case']

# Replay buffer / optimiser / target-network settings.
BATCH_SIZE, REPLAY_SIZE, REPLAY_START_SIZE, LEARNING_RATE, SYNC_TARGET_FRAMES = (
    params[key]
    for key in ('batch_size', 'replay_size', 'replay_start_size',
                'learning_rate', 'sync_target_frames')
)

# Epsilon-greedy exploration schedule.
EPSILON_DECAY_LAST_FRAME, EPSILON_START, EPSILON_FINAL = (
    params[key]
    for key in ('epsilon_decay_last_frame', 'epsilon_start', 'epsilon_final')
)

# Network dimensions: nA outputs, dO_a inputs, NODES units per hidden layer
# (this is how they are passed to Net in the set-up cell).
nA, dO_a = (params[key] for key in ('nA', 'dO_a'))

# Run length, hidden width, RNG seed, checkpoint file suffix and discount factor.
FRAMES, NODES, SEED, PATH, GAMMA = (
    params[key] for key in ('frames', 'nodes', 'seed', 'path', 'gamma')
)

# Economic primitives of the scenario.
A0, MU, FIRMLIST = (eparams[key] for key in ('a0', 'mu', 'firmlist'))

print(params)

{'gamma': 0.95, 'batch_size': 64, 'replay_size': 400000, 'replay_start_size': 100000, 'learning_rate': 0.0001, 'sync_target_frames': 10000, 'epsilon_decay_last_frame': 1000000, 'epsilon_start': 1, 'epsilon_final': 0.01, 'nA': 30, 'dO': 6, 'dO_a': 4, 'frames': 2000000, 'seed': 1, 'path': 'checkpoint.pt', 'nodes': 8, 'p_end': 0.001, 'punishlen': 1}


In [3]:
# PyTorch setup
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')  # device agnostic: falls back to CPU without a GPU
print(device)

# Seed every RNG this notebook draws from *before* the first draw.
# The firms below are picked with Python's `random`, and NumPy is used in the
# training loop, so seeding torch alone does not make a run reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed(SEED)

# Neural network model: dO_a inputs, nA outputs, NODES units per hidden layer.
net = Net(dO_a, nA, NODES).to(device)
print(net)
tgt_net = Net(dO_a, nA, NODES).to(device)  # Prediction target.
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
buffer = ExperienceBuffer(REPLAY_SIZE)


# Reinforcement learning environment: both firms are drawn independently
# (with replacement) from FIRMLIST.
firm0 = random.sample(FIRMLIST, 1)[0]
firm1 = random.sample(FIRMLIST, 1)[0]
env = ContBertrand(firm0, firm1)

# RL agent
agent = Agent(env, buffer, net, tgt_net, optimizer)

# Write output statistics to tensorboard
writer = SummaryWriter(comment="-")

# Initialize variables
env.seed(SEED)  # TODO; is this used?
# Re-seed torch after the networks have been built, so the torch RNG state at
# the start of training does not depend on how many draws initialisation used.
torch.manual_seed(SEED)
frame_idx = 0
ep_idx = 0
epsilon = EPSILON_START

cpu
Net(
  (hidden): Linear(in_features=4, out_features=8, bias=True)
  (hidden2): Linear(in_features=8, out_features=8, bias=True)
  (output): Linear(in_features=8, out_features=30, bias=True)
)


In [None]:
# Training – main loop
#
# Each pass of the loop is one environment frame: the DQN agent (firm 0) and
# the amTFT opponent (firm 1) both act, the transition is stored in the replay
# buffer and, once the buffer holds REPLAY_START_SIZE transitions, one
# gradient step is taken on a sampled batch.
#
# FRAMES and the other upper-case hyperparameters come from the configuration
# cell; frame_idx, ep_idx, epsilon, agent, env and writer come from the set-up
# cell. Re-run the set-up cell before re-running this one, otherwise frame_idx
# continues from its previous value.

AGENT_OBS_IDX = np.array([0, 1, 4, 5])  # entries of the full state that the DQN agent observes
GRAD_CLIP = 1.0             # gradients are clamped element-wise to [-GRAD_CLIP, GRAD_CLIP]
PROGRESS_EVERY = 100_000    # frames between progress print-outs
CHECKPOINT_EVERY = 500_000  # frames between checkpoints

done = True           # forces the episode set-up on the first pass
first_episode = True  # there is nothing to log before the first episode has run

try:
    # range(FRAMES) runs exactly FRAMES frames, so frame_idx reaches FRAMES
    # and the last checkpoint is written (range(1, FRAMES) stopped one short).
    for _step in range(FRAMES):
        if done:
            # The environment signalled the end of the episode.
            # NOTE(review): the termination rule lives in ContBertrand
            # (presumably the 'p_end' hyperparameter) — confirm.
            if not first_episode:
                # Save episodal reward
                mean_pg = np.mean(agent.total_pg)
                writer.add_scalar("Agent_avg_profit", mean_pg, ep_idx)
                agent.total_pg = []
                ep_idx += 1
            first_episode = False

            # Randomize the firms
            firm0 = random.sample(FIRMLIST, 1)[0]
            firm1 = random.sample(FIRMLIST, 1)[0]

            # Make econ variables: benchmark profits for this pair of firms
            dict_key = str((firm0, firm1))
            nash_action = eparams['nash_actions'][dict_key]
            monopoly_action = eparams['monopoly_actions'][dict_key]
            nash_profit = profit(nash_action, A0, MU, firm0, firm1, nA)
            monopoly_profit = profit(monopoly_action, A0, MU, firm0, firm1, nA)

            # Initiate new env and amTFT agent (note the swapped firm order
            # passed to the opponent)
            s_next = env.reset(firm0, firm1)
            amTFT = Amtft(nA, A0, MU, firm1, firm0, GAMMA)
            done = False

        frame_idx += 1
        # Linear epsilon decay, floored at EPSILON_FINAL
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        s = s_next

        action0 = agent.act(net, s[AGENT_OBS_IDX], epsilon, device=device.type)
        action1 = amTFT.act(s[1])
        s_next, reward_n, done, _ = env.step(action0, action1, amTFT.b)

        # reward_n is indexed unconditionally here, so the former
        # `if reward_n is not None` guard further down could never be False.
        reward = reward_n[0]
        exp = Experience(s[AGENT_OBS_IDX], action0, reward, done, s_next[AGENT_OBS_IDX])
        agent.exp_buffer.append(exp)

        pg = profit_gain(reward, nash_profit, monopoly_profit)[0]  # important to index here
        agent.total_pg.append(pg)

        # Only collect experience until the replay buffer is warm
        if len(agent.exp_buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:  # Update target network
            agent.tgt_net.load_state_dict(agent.net.state_dict())

        batch = agent.exp_buffer.sample(BATCH_SIZE)
        agent.optimizer.zero_grad()
        loss_t = calc_loss(batch, agent.net, agent.tgt_net, device=device)
        loss_t.backward()
        # Gradient clipping; clip_grad_value_ skips parameters without a gradient
        torch.nn.utils.clip_grad_value_(agent.net.parameters(), GRAD_CLIP)
        agent.optimizer.step()
        writer.add_scalar("loss", loss_t.item(), frame_idx)

        if frame_idx % PROGRESS_EVERY == 0:
            print(frame_idx)

        if frame_idx % CHECKPOINT_EVERY == 0:
            torch.save({
                'agent_state_dict': agent.net.state_dict(),
                'optimizer_dict': agent.optimizer.state_dict(),
                'epsilon': epsilon,
                'frame_idx': frame_idx,
                'env_state': s_next
                }, str(frame_idx) + PATH)
finally:
    # Close the TensorBoard writer even when the cell is interrupted
    writer.close()

100000
200000
