## Imports

In [1]:
import gym
import argparse

import torch
import torch.optim as optim

from tensorboardX import SummaryWriter

from src import actions, agents, runner, common, wrapper, runner
from src.models import dqn_model
from src.common import hyperparameters, logger
from src.memory import ExperienceReplayBuffer

## Parameters

In [2]:
# CONFIG
# Hyperparameter set for this run, looked up by key in the project's PARAMS table.
params = hyperparameters.PARAMS["pong"]

# The device is fixed here instead of being read from a `--cuda` command-line
# flag; change the string to "cuda" to run on a GPU.
device_name = "cpu"
device = torch.device(device_name)

## Environment

In [3]:
# INIT ENV
# Create the Gym environment named in the hyperparameters and pass it directly
# through the project's DQN wrapper.
env = wrapper.wrap_dqn(gym.make(params["env_name"]))


## Logging

In [4]:
# LOGGING
# The writer's comment string carries the run name between "-" and "-basic".
run_comment = "-" + params["run_name"] + "-basic"
writer = SummaryWriter(comment=run_comment)

## Network

In [5]:
# NETWORK
# The model is sized from the environment's observation shape and action count.
obs_shape = env.observation_space.shape
n_actions = env.action_space.n

net = dqn_model.DQN(obs_shape, n_actions)
net = net.to(device)

# Target-network wrapper built around `net`; the training loop reads its
# `target_model` attribute and calls `sync()` on it.
tgt_net = agents.TargetNetwork(net)


## Agent

In [6]:
# AGENT
# Action selector starts at the configured initial epsilon.
selector = actions.EpsilonGreedyActionSelector(
    epsilon=params["epsilon_start"],
)

# The tracker is given the selector and the full hyperparameter dict; the
# training loop calls its `frame()` method with the current frame index.
epsilon_tracker = logger.EpsilonTracker(selector, params)

# The agent is built from the network and the selector, on the chosen device.
agent = agents.DQNAgent(net, selector, device=device)

## Runner

In [7]:
# RUNNER
# Frame counter; the training loop increments it once per iteration.
frame_idx = 0

# Adam over the network's parameters with the configured learning rate.
optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])

# Experience source over (env, agent), feeding a replay buffer of the
# configured size.
exp_source = runner.ExperienceSourceFirstLast(
    env,
    agent,
    gamma=params["gamma"],
    steps_count=1,
)
buffer = ExperienceReplayBuffer(
    exp_source,
    buffer_size=params["replay_size"],
)

## Training 

In [8]:
# TRAIN
with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
    while True:
        # Advance one frame: add one item to the buffer and pass the new frame
        # index to the epsilon tracker.
        frame_idx = frame_idx + 1
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)

        # Report the first pending episode reward, if any; a truthy return
        # value from the tracker ends training.
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards and reward_tracker.reward(
            new_rewards[0], frame_idx, selector.epsilon
        ):
            break

        # Learn only once the buffer holds at least `replay_initial` items.
        if len(buffer) >= params["replay_initial"]:
            # learning step
            optimizer.zero_grad()
            batch = buffer.sample(params["batch_size"])
            loss_v = agent.calc_loss(
                batch,
                net,
                tgt_net.target_model,
                params["gamma"],
                device,
            )
            loss_v.backward()
            optimizer.step()

            # Sync the target network every `target_net_sync` frames.
            if frame_idx % params["target_net_sync"] == 0:
                tgt_net.sync()


763: done 1 games, mean reward -21.000, speed 328.31 f/s, eps 0.99
1553: done 2 games, mean reward -21.000, speed 301.98 f/s, eps 0.98
2358: done 3 games, mean reward -21.000, speed 330.41 f/s, eps 0.98
3236: done 4 games, mean reward -21.000, speed 319.46 f/s, eps 0.97
4113: done 5 games, mean reward -21.000, speed 311.76 f/s, eps 0.96
4925: done 6 games, mean reward -21.000, speed 317.39 f/s, eps 0.95
5825: done 7 games, mean reward -20.857, speed 326.71 f/s, eps 0.94
6665: done 8 games, mean reward -20.750, speed 327.30 f/s, eps 0.93
7572: done 9 games, mean reward -20.778, speed 325.09 f/s, eps 0.92
8361: done 10 games, mean reward -20.800, speed 326.11 f/s, eps 0.92
9245: done 11 games, mean reward -20.818, speed 326.07 f/s, eps 0.91
10061: done 12 games, mean reward -20.833, speed 82.58 f/s, eps 0.90


KeyboardInterrupt: 