## Bit Flipping with DQN + HER

  This is a plain implementation of DQN + HER from @orrivlin and modification of code for better readibility from udacity drlnd
  
  Changes will be made further
  * Soft Update
  * PER
  * DDQN
 

In [1]:
# Number of bits 
N = 5

# 1. Environment

Bit Flipping Environmnet 
N : Number of bits
S = {state vector, goal vector}

In [2]:
import torch
from copy import deepcopy as dc

In [3]:
device  = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
class BfEnv:
    def __init__(self, N):
        self.N = N    # Number of bits
        self.state_size = self.N*2
        self.action_size = self.N
        
    def reset(self):
        # Input : None
        # Output : state
        # Randomly generates a state tuple 
        state = torch.rand((1,self.N)).round()
        goal = torch.rand((1,self.N)).round()
        done = False
        return torch.cat((state, goal), dim=1), done
    
    def step(self, s, action):
        # Inputs: s, action
        # s : state, action : index number of action to be taken 
        # Output : (next_state, reward, done, dist)
        s[0, action] = 1.0 - s[0, action] #Taking action(a) and getting next_state(s_)
        r = -1.0 # Sparse reward penalty
        done = False
        
        # Calculate distance
        dist = (s[0,0:self.N] - s[0,self.N:]).abs().sum() 
        
        if dist == 0:
            done = True
            r = 0.0 # Sparse reward 
        return s, r, done, dist        

# 2. Model

Defining the Network 

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
class QNetwork(nn.Module):
    """ DQN (Policy) Network """
    def __init__(self, state_size, action_size, seed, fc1_units=128):
        """
        Initialize parameters and build model
        Parameters
        ====
            state_size (int): Dimension of state size
            action_size (int): Dimenstion of action size
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            seed (int) : Random seed
        """ 
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, action_size)
    
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        return x

# 3. HER 
  The Agent sends the experiences to the HER, and HER changes the virtual goal to real goal in the experiences and adds to the replay buffer 

In [7]:
from collections import deque, namedtuple
import numpy as np
from copy import deepcopy
import random
from recordtype import *

In [8]:
class HER:
    """ To modify experiences in a single episode """
    def __init__(self, N):
        self.N = int(N)
        self.experience = recordtype("Experience", field_names=["s", "a", "r", "s_", "done"])
        self.buffer = deque()
        
    def reset(self):
        self.buffer = deque()
        
    def add(self, state, action, reward, next_state, done):
        """ Add a new experience to memory """
        e = self.experience(state, action, reward, next_state, done)
        self.buffer.append(e)
        
    def update(self):
        """
        Updating the virtual goals as real goals in all the experiences of an episode
        """
        her_buffer = dc(self.buffer)
        goal = her_buffer[-1].s_[0,0:self.N] # Taking s_ from last experience 
        for i in range(len(her_buffer)):
            her_buffer[i].s[0,self.N:] = goal  # Modify s
            her_buffer[i].s_[0,self.N:] = goal # Modify s_
            her_buffer[i].r = -1      # Modify r
            her_buffer[i].done = False    # Modify done
            if ((her_buffer[i].s_[0,0:5] - goal).abs().sum() == 0): # S_(state == goal)
                her_buffer[i].done = True
                her_buffer[i].r = 0.0
        return her_buffer

# 4. Replay Buffer 

In [9]:
class ReplayBuffer:
    """ Fixed size buffer to store experience tuple """
    
    def __init__(self, buffer_size, batch_size, seed = 0):
        """
        Params
        ====
            buffer_size(int) : maximum size of a buffer
            batch_size(int) : size of each training batch
            seed (int) : random seed
        """
        self.memory = deque() # deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = recordtype("Experience", field_names=["s", "a", "r", "s_", "done"])
        self.seed = random.seed(seed)
        
    def add(self, s, a, r, s_, done):
        """ Add a new experience to memory """
        e = self.experience(s, a, r, s_, done)
        self.memory.append(e)
    
    def sample(self, K):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, K)

        states = torch.cat([e.s for e in experiences]).to(device)
        actions = torch.cat([e.a for e in es]).view(K,-1).to(device)
        rewards = torch.tensor([e.r for e in experiences]).view(K, -1).to(device)
        next_states = torch.cat([e.s_ for e in experiences]).to(device)
        dones = torch.tensor([e.done for e in experiences]).float().view(K,-1).to(device)
  
        return (states, actions, rewards, next_states, dones)     

    def _len__(self):
        return len(self.memory)

In [10]:
#buffer = ReplayBuffer(buffer_size=int(1e-5), batch_size=N)

#  Logger

In [11]:
class logger:
    """ Using logger will help you to save all the data at one place and pass is as a single variable """
    def __init__(self):
        self.log = dict()
        
    def add_log(self,name):
        self.log[name] = []
        
    def add_item(self,name,x):
        self.log[name].append(x)
        
    def get_log(self,name):
        return self.log[name]
    
    def get_keys(self):
        return self.log.keys()
    
    def get_current(self,name):
        return self.log[name][-1]
    
    def get_latest_log(self, name, latest = 100):
        return self.log[name][(len(self.log(name))-latest):]

# Mean Value generator

In [12]:
class mean_val:
    def __init__(self):
        self.k = 0
        self.val = 0
        self.mean = 0
        
    def append(self,x):
        self.k += 1
        self.val += x
        self.mean = self.val/self.k
        
    def get(self):
        return self.mean

# 5. Agent

In [13]:
import numpy as np
import random
from collections import namedtuple, deque
import torch
import torch.nn.functional as F
import torch.optim as optim

In [15]:
BUFFER_SIZE = int(1e-5) # replay buffer size
BATCH_SIZE = 64         # mini-batch size
GAMMA = 0.99            # discount factor
TAU = 1e-3              # for soft update of target parameters
LR = 0.0001             # learning rate
UPDATE_EVERY = 4        # how often update the network

#device  = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
class Agent:
    """ Interacts with and learns with the environment """
    # DQN + HER    
    def __init__(self, state_size, action_size, seed):
        """
        Params
        ====
            state_size (int) : dimensions of each state
            action_size (int) : dimensions of each action
            seed (int) : random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.N = self.state_size/2
        self.update_target_step = 1000
        self.step_counter = 0
        
        # Q-Network 
        self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device)  # local model
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, seed).to(device) # target model                          # target model
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)                # optimizer

        # Replay Buffer - to store experiences 
        self.buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
        
        # HER -Hindsight Experience Replay Buffer - to modify goal to a virtual goal
        self.her = HER(self.N)
        
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.update_every = 1000  # For hard update
        #self.update_every = 4     # For Soft update
        
        # Epsilon # Need to define in main rather than here
        self.epsilon = 0.1
        self.eps_max = 0.99
        self.eps_min = 0.05
        self.eps = self.eps_max # start eps from eps_max
        
    def act(self, state, eps):
        """ Returns action for given state as per current policy
        Params
        ======
            state (array_like): current state
            eps (float) : epsilon, for epsilon-greedy action selection
        """
        state = state.to(device)
        self.qnetwork_local.eval()  # evaluation mode
        with torch.no_grad():
            Q = self.qnetwork_local(state).to(device)
        self.qnetwork_local.train() # training mode
        
        rand_num = np.random.rand()
        if (rand_num < eps):  # Exploration
            a = torch.randint(0, Q.shape[1], (1,)).type(torch.LongTensor)
        else:                 # Exploitation
            a = torch.argmax(Q, dim=1)
        return a
    
    def step(self, s, a, r, s_, done):
        # Save experience in replay buffer 
        self.buffer.add(s, a, r, s_, done)
        
        # Save experience in her buffer
        self.her.add(s, a, r, s_, done)
        
        if len(self.buffer.memory) > BATCH_SIZE:
            # Learning every time step and update target model every update_every step  
            loss = self.learn()
            return loss
            
    def learn(self):  
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        num = len(self.buffer.memory)    # Check the length of replay buffer
        
        #K = np.min([num, BATCH_SIZE])    # Select the minimum of BATCH_SIZE or len(replay_buffer.memory)
       
        states, actions, rewards, next_states, dones = self.buffer.sample(K)
        
        # Get max predicted Q-values (for next_states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Qtargets for current states
        Q_targets = rewards + (GAMMA*Q_targets_next * (1-dones))
        
        # Get expected Q-values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        # learn
        loss = F.smooth_l1_loss(Q_expected.squeeze(),Q_targets.squeeze())
        #loss = F.mse_loss(Q_expected, Q_targets)
        
        # optimizer
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)  
                
        self.step_counter += 1
        if (self.step_counter > self.update_target_step):
            self.hard_update(self.qnetwork_local, self.qnetwork_target)
            self.step_counter = 0
        return loss
        
        
    def hard_update():    
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def her_update(self):
        her_buffer = self.her.update()
        for e in her_buffer:
            self.buffer.memory.append(e)

In [16]:
# Initialize env
env = BfEnv(5)

# Initialize agent
agent = Agent(10, 5, 0)

# 6. Main

In [17]:
def dqn_her(num_episodes=5000, N=5, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """ Deep Q-Learning + HER
    
    Params
    ======
        num_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    log = logger()
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    log.add_log('scores')
    log.add_log('episodes_loss')
    log.add_log('final_dist')
    mean_loss = mean_val()
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, num_episodes+1):
        state, _ = env.reset()
        score = 0
        min_dist = N
        for t in range(N):
            action = agent.act(state, eps)
            next_state, reward, done, dist = env.step(state, action)
            
            if dist < min_dist:
                min_dist = dist
                
            if (t+1) == N:  # Breaking the episode after N number of time steps
                done = True
                 
            loss = agent.step(state, action, reward, next_state, done)
            print("loss : ", loss)
            break
            mean_loss.append(loss) 
            state = next_state
            score += reward
            
            if done:
                break 
                
        agent.her_update()                # her update the experience trajectory
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        log.add_item('scores', score)     # save most recent score
        log.add_item('episodes_loss', mean_loss.get())
        log.add_item('final_dist', min_dist)
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}\tmin dist: {}'.format(i_episode, np.mean(scores_window), min_dist), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        
    return log

In [18]:
log = dqn_her(num_episodes=5000, N=5, eps_start=1.0, eps_end=0.01, eps_decay=0.995)
num_episodes = 5000

TypeError: '>' not supported between instances of 'collections.deque' and 'int'

In [None]:
def smooth(x,window_len=11,window='hanning'):
    if window_len<3:
        return x

    s=np.r_[x[window_len-1:0:-1],x,x[-2:-window_len-1:-1]]
    #print(len(s))
    if window == 'flat': #moving average
        w=np.ones(window_len,'d')
    else:
        w=eval('np.'+window+'(window_len)')

    y=np.convolve(w/w.sum(),s,mode='valid')
    return y


In [39]:
# Plotting

Y = np.asarray(log.get_log('scores'))
Y2 = smooth(Y)
x = np.linspace(0, len(Y), len(Y))
fig1 = plt.figure()
ax1 = plt.axes()
ax1.plot(x, Y, Y2)
plt.xlabel('episodes')
plt.ylabel('episode return')

Y = np.asarray(log.get_log('episodes_loss'))
Y2 = smooth(Y)
x = np.linspace(0, len(Y), len(Y))
fig2 = plt.figure()
ax2 = plt.axes()
ax2.plot(x, Y, Y2)
plt.xlabel('episodes')
plt.ylabel('average loss')

Y = np.asarray(log.get_log('final_dist'))
Y2 = smooth(Y)
x = np.linspace(0, len(Y), len(Y))
fig3 = plt.figure()
ax3 = plt.axes()
ax3.plot(x, Y, Y2)
plt.xlabel('episodes')
plt.ylabel('minimum distance')

Y = np.asarray(log.get_log('final_dist'))
Y[Y > 1] = 1.0
K = 100
Z = Y.reshape(int(num_epochs/K),K)
T = 1 - np.mean(Z,axis=1)
x = np.linspace(0, len(T), len(T))*K
fig4 = plt.figure()
ax4 = plt.axes()
ax4.plot(x, T)
plt.xlabel('episodes')
plt.ylabel('sucess rate')


NameError: name 'log' is not defined

In [40]:
buffer = ReplayBuffer(1e-5, 64, 0)

In [41]:
def get_e():
    s, _ = env.reset()
    a = torch.randint(low=0, high=5, size=(1,)).type(torch.LongTensor)
    s_, r, done, _ = env.step(s, a)
    return s, a, r, s_, done

In [42]:
for t in range(10):
    
    s, a, r, s_, done = get_e()
    if (t+1 == 5):
        done = True
    buffer.add(s, a, r, s_, done)
    #replay_buffer.append(([dc(s.squeeze(0).numpy()),dc(a),dc(r),dc(s_.squeeze(0).numpy()),dc(done)]))


In [43]:
buffer.memory

deque([Experience(s=tensor([[1., 0., 0., 0., 1., 1., 1., 0., 1., 1.]]), a=tensor([4]), r=-1.0, s_=tensor([[1., 0., 0., 0., 1., 1., 1., 0., 1., 1.]]), done=False),
       Experience(s=tensor([[1., 0., 1., 0., 1., 0., 0., 1., 0., 0.]]), a=tensor([4]), r=-1.0, s_=tensor([[1., 0., 1., 0., 1., 0., 0., 1., 0., 0.]]), done=False),
       Experience(s=tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]]), a=tensor([0]), r=-1.0, s_=tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]]), done=False),
       Experience(s=tensor([[1., 1., 0., 1., 0., 0., 1., 1., 1., 0.]]), a=tensor([0]), r=-1.0, s_=tensor([[1., 1., 0., 1., 0., 0., 1., 1., 1., 0.]]), done=False),
       Experience(s=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 0., 1.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 0., 1.]]), done=True),
       Experience(s=tensor([[0., 1., 0., 1., 0., 1., 1., 1., 1., 0.]]), a=tensor([3]), r=-1.0, s_=tensor([[0., 1., 0., 1., 0., 1., 1., 1., 1., 0.]]), done=False),
       Experience(s=ten

In [44]:
es = random.sample(buffer.memory, k=5)
es

[Experience(s=tensor([[1., 0., 1., 1., 1., 1., 0., 1., 1., 0.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 0., 1., 1., 1., 1., 0., 1., 1., 0.]]), done=False),
 Experience(s=tensor([[0., 0., 1., 0., 1., 1., 0., 0., 1., 1.]]), a=tensor([4]), r=-1.0, s_=tensor([[0., 0., 1., 0., 1., 1., 0., 0., 1., 1.]]), done=False),
 Experience(s=tensor([[1., 0., 0., 0., 1., 1., 1., 0., 1., 1.]]), a=tensor([4]), r=-1.0, s_=tensor([[1., 0., 0., 0., 1., 1., 1., 0., 1., 1.]]), done=False),
 Experience(s=tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]]), a=tensor([0]), r=-1.0, s_=tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 0., 1.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 0., 1.]]), done=True)]

In [45]:
[e.a for e in es]

[tensor([3]), tensor([4]), tensor([4]), tensor([0]), tensor([3])]

In [46]:
torch.tensor([e.a for e in es]).float().view(5,-1).to(device)

tensor([[3.],
        [4.],
        [4.],
        [0.],
        [3.]], device='cuda:0')

In [47]:
torch.cat([e.a for e in es]).view(5,-1).to(device)

tensor([[3],
        [4],
        [4],
        [0],
        [3]], device='cuda:0')

In [55]:
torch.cat([e.a for e in es]).view(5,-1).to(device)

tensor([[3],
        [4],
        [4],
        [0],
        [3]], device='cuda:0')

In [26]:
s = env.reset()
s

(tensor([[1., 0., 0., 0., 1., 0., 0., 0., 0., 1.]]), False)

In [27]:
s = s.to(device)

AttributeError: 'tuple' object has no attribute 'to'

In [61]:
len(buffer.memory)

5

In [73]:
buffer.memory

deque([Experience(s=tensor([[0., 0., 1., 1., 0., 1., 0., 1., 0., 1.]]), a=tensor([3]), r=-1.0, s_=tensor([[0., 0., 1., 1., 0., 1., 0., 1., 0., 1.]]), done=False),
       Experience(s=tensor([[1., 0., 0., 1., 0., 0., 0., 1., 1., 1.]]), a=tensor([2]), r=-1.0, s_=tensor([[1., 0., 0., 1., 0., 0., 0., 1., 1., 1.]]), done=False),
       Experience(s=tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]), a=tensor([2]), r=-1.0, s_=tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]), done=False),
       Experience(s=tensor([[1., 1., 1., 0., 1., 0., 0., 1., 0., 0.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 1., 0., 1., 0., 0., 1., 0., 0.]]), done=False),
       Experience(s=tensor([[1., 0., 1., 1., 0., 1., 0., 0., 0., 1.]]), a=tensor([0]), r=-1.0, s_=tensor([[1., 0., 1., 1., 0., 1., 0., 0., 0., 1.]]), done=True)])

In [16]:
experiences = random.sample(buffer.memory, k = 5)

In [17]:
experiences

[Experience(s=tensor([[0., 0., 1., 0., 1., 1., 1., 1., 0., 0.]]), a=tensor([1]), r=-1.0, s_=tensor([[0., 0., 1., 0., 1., 1., 1., 1., 0., 0.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 1., 0.]]), a=tensor([2]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 1., 0.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 0., 1., 0., 0., 1., 0., 1.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 0., 0., 1., 0., 1.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 1., 1., 0., 0., 1., 1., 1.]]), a=tensor([0]), r=-1.0, s_=tensor([[1., 1., 0., 1., 1., 0., 0., 1., 1., 1.]]), done=False),
 Experience(s=tensor([[1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]]), a=tensor([1]), r=-1.0, s_=tensor([[1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]]), done=False)]

In [19]:
states = torch.from_numpy(np.vstack([e.s for e in experiences if e is not None])).float().to(device)
actions = torch.from_numpy(np.vstack([e.a for e in experiences if e is not None])).long().to(device)
rewards = torch.from_numpy(np.vstack([e.r for e in experiences if e is not None])).float().to(device)
next_states = torch.from_numpy(np.vstack([e.s_ for e in experiences if e is not None])).float().to(device)
dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

In [20]:
states

tensor([[0., 0., 1., 0., 1., 1., 1., 1., 0., 0.],
        [1., 1., 0., 0., 1., 1., 0., 0., 1., 0.],
        [1., 1., 0., 0., 1., 0., 0., 1., 0., 1.],
        [1., 1., 0., 1., 1., 0., 0., 1., 1., 1.],
        [1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]])

In [21]:
actions

tensor([[1],
        [2],
        [3],
        [0],
        [1]])

In [22]:
rewards

tensor([[-1.],
        [-1.],
        [-1.],
        [-1.],
        [-1.]])

In [23]:
dones

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]])

In [24]:
experiences

[Experience(s=tensor([[0., 0., 1., 0., 1., 1., 1., 1., 0., 0.]]), a=tensor([1]), r=-1.0, s_=tensor([[0., 0., 1., 0., 1., 1., 1., 1., 0., 0.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 1., 0.]]), a=tensor([2]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 1., 0., 0., 1., 0.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 0., 1., 0., 0., 1., 0., 1.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 0., 0., 1., 0., 1.]]), done=False),
 Experience(s=tensor([[1., 1., 0., 1., 1., 0., 0., 1., 1., 1.]]), a=tensor([0]), r=-1.0, s_=tensor([[1., 1., 0., 1., 1., 0., 0., 1., 1., 1.]]), done=False),
 Experience(s=tensor([[1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]]), a=tensor([1]), r=-1.0, s_=tensor([[1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]]), done=False)]

In [25]:
buffer.memory

deque([Experience(s=tensor([[1., 1., 0., 0., 1., 0., 0., 1., 0., 1.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 0., 0., 1., 0., 0., 1., 0., 1.]]), done=False),
       Experience(s=tensor([[1., 1., 1., 0., 1., 0., 0., 0., 1., 0.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 1., 0., 1., 0., 0., 0., 1., 0.]]), done=False),
       Experience(s=tensor([[1., 1., 0., 1., 1., 0., 0., 1., 1., 1.]]), a=tensor([0]), r=-1.0, s_=tensor([[1., 1., 0., 1., 1., 0., 0., 1., 1., 1.]]), done=False),
       Experience(s=tensor([[1., 1., 0., 1., 0., 1., 0., 0., 0., 0.]]), a=tensor([3]), r=-1.0, s_=tensor([[1., 1., 0., 1., 0., 1., 0., 0., 0., 0.]]), done=False),
       Experience(s=tensor([[1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]]), a=tensor([1]), r=-1.0, s_=tensor([[1., 0., 0., 0., 1., 0., 1., 1., 1., 0.]]), done=False),
       Experience(s=tensor([[0., 0., 1., 1., 0., 1., 1., 1., 1., 0.]]), a=tensor([0]), r=-1.0, s_=tensor([[0., 0., 1., 1., 0., 1., 1., 1., 1., 0.]]), done=False),
       Experience(s=te

In [26]:
replay_buffer

deque([[array([1., 1., 0., 0., 1., 0., 0., 1., 0., 1.], dtype=float32),
        tensor([3]),
        -1.0,
        array([1., 1., 0., 0., 1., 0., 0., 1., 0., 1.], dtype=float32),
        False],
       [array([1., 1., 1., 0., 1., 0., 0., 0., 1., 0.], dtype=float32),
        tensor([3]),
        -1.0,
        array([1., 1., 1., 0., 1., 0., 0., 0., 1., 0.], dtype=float32),
        False],
       [array([1., 1., 0., 1., 1., 0., 0., 1., 1., 1.], dtype=float32),
        tensor([0]),
        -1.0,
        array([1., 1., 0., 1., 1., 0., 0., 1., 1., 1.], dtype=float32),
        False],
       [array([1., 1., 0., 1., 0., 1., 0., 0., 0., 0.], dtype=float32),
        tensor([3]),
        -1.0,
        array([1., 1., 0., 1., 0., 1., 0., 0., 0., 0.], dtype=float32),
        False],
       [array([1., 0., 0., 0., 1., 0., 1., 1., 1., 0.], dtype=float32),
        tensor([1]),
        -1.0,
        array([1., 0., 0., 0., 1., 0., 1., 1., 1., 0.], dtype=float32),
        False],
       [array([0., 0., 1.

In [42]:
samples = random.sample(replay_buffer, 5)

In [43]:
samples

[[array([1., 0., 0., 0., 1., 1., 0., 1., 0., 0.], dtype=float32),
  tensor([3]),
  -1.0,
  array([1., 0., 0., 0., 1., 1., 0., 1., 0., 0.], dtype=float32),
  False],
 [array([0., 1., 1., 0., 1., 1., 0., 1., 1., 1.], dtype=float32),
  tensor([2]),
  -1.0,
  array([0., 1., 1., 0., 1., 1., 0., 1., 1., 1.], dtype=float32),
  False],
 [array([1., 0., 1., 0., 0., 1., 1., 0., 0., 1.], dtype=float32),
  tensor([2]),
  -1.0,
  array([1., 0., 1., 0., 0., 1., 1., 0., 0., 1.], dtype=float32),
  False],
 [array([1., 0., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32),
  tensor([2]),
  -1.0,
  array([1., 0., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32),
  False],
 [array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0.], dtype=float32),
  tensor([3]),
  -1.0,
  array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0.], dtype=float32),
  False]]

In [44]:
S0, A0, R1, S1, D1 = zip(*samples)

In [45]:
S0

(array([1., 0., 0., 0., 1., 1., 0., 1., 0., 0.], dtype=float32),
 array([0., 1., 1., 0., 1., 1., 0., 1., 1., 1.], dtype=float32),
 array([1., 0., 1., 0., 0., 1., 1., 0., 0., 1.], dtype=float32),
 array([1., 0., 1., 0., 0., 1., 1., 1., 1., 1.], dtype=float32),
 array([1., 0., 1., 0., 0., 1., 0., 0., 1., 0.], dtype=float32))

In [46]:
S0 = torch.tensor(S0, dtype=torch.float)

In [47]:
S0

tensor([[1., 0., 0., 0., 1., 1., 0., 1., 0., 0.],
        [0., 1., 1., 0., 1., 1., 0., 1., 1., 1.],
        [1., 0., 1., 0., 0., 1., 1., 0., 0., 1.],
        [1., 0., 1., 0., 0., 1., 1., 1., 1., 1.],
        [1., 0., 1., 0., 0., 1., 0., 0., 1., 0.]])

In [48]:
A0 

(tensor([3]), tensor([2]), tensor([2]), tensor([2]), tensor([3]))

In [49]:
A0 = torch.tensor(A0, dtype=torch.long).view(5,-1)
A0

tensor([[3],
        [2],
        [2],
        [2],
        [3]])

In [50]:
R1 = torch.tensor(R1, dtype=torch.float).view(5,-1)
R1

tensor([[-1.],
        [-1.],
        [-1.],
        [-1.],
        [-1.]])

In [51]:
S1 = torch.tensor(S1 , dtype=torch.float)
S1

tensor([[1., 0., 0., 0., 1., 1., 0., 1., 0., 0.],
        [0., 1., 1., 0., 1., 1., 0., 1., 1., 1.],
        [1., 0., 1., 0., 0., 1., 1., 0., 0., 1.],
        [1., 0., 1., 0., 0., 1., 1., 1., 1., 1.],
        [1., 0., 1., 0., 0., 1., 0., 0., 1., 0.]])

In [52]:
D1 = torch.tensor(D1, dtype=torch.float)
D1

tensor([0., 0., 0., 0., 0.])

In [15]:
# logger

In [17]:
class logger:
    def __init__(self):
        self.log = dict()
        
    def add_log(self, name):
        self.log[name] = []
        
    def add_item(self, name, x):
        self.log[name].append(x)
        
    def get_log(self, name):
        return self.log[name]
    
    def get_keys(self):
        return self.log.keys()
    
    def get_current(self, name):
        return self.log[name][-1]

In [43]:
log = logger()
log.log

{}

In [59]:
log.add_log('scores_window')
for i in range(200):
    log.add_item('scores_window', i/50)

In [60]:
log.log

{'scores_window': [0.0,
  0.02,
  0.04,
  0.06,
  0.08,
  0.1,
  0.12,
  0.14,
  0.16,
  0.18,
  0.2,
  0.22,
  0.24,
  0.26,
  0.28,
  0.3,
  0.32,
  0.34,
  0.36,
  0.38,
  0.4,
  0.42,
  0.44,
  0.46,
  0.48,
  0.5,
  0.52,
  0.54,
  0.56,
  0.58,
  0.6,
  0.62,
  0.64,
  0.66,
  0.68,
  0.7,
  0.72,
  0.74,
  0.76,
  0.78,
  0.8,
  0.82,
  0.84,
  0.86,
  0.88,
  0.9,
  0.92,
  0.94,
  0.96,
  0.98,
  1.0,
  1.02,
  1.04,
  1.06,
  1.08,
  1.1,
  1.12,
  1.14,
  1.16,
  1.18,
  1.2,
  1.22,
  1.24,
  1.26,
  1.28,
  1.3,
  1.32,
  1.34,
  1.36,
  1.38,
  1.4,
  1.42,
  1.44,
  1.46,
  1.48,
  1.5,
  1.52,
  1.54,
  1.56,
  1.58,
  1.6,
  1.62,
  1.64,
  1.66,
  1.68,
  1.7,
  1.72,
  1.74,
  1.76,
  1.78,
  1.8,
  1.82,
  1.84,
  1.86,
  1.88,
  1.9,
  1.92,
  1.94,
  1.96,
  1.98,
  2.0,
  2.02,
  2.04,
  2.06,
  2.08,
  2.1,
  2.12,
  2.14,
  2.16,
  2.18,
  2.2,
  2.22,
  2.24,
  2.26,
  2.28,
  2.3,
  2.32,
  2.34,
  2.36,
  2.38,
  2.4,
  2.42,
  2.44,
  2.46,
  2.48,
  2.5,
 

In [61]:
log.get_current('scores_window')

3.98

In [62]:
log.get_log('scores_window')[(len(log.get_log('scores_window'))-10):]

[3.8, 3.82, 3.84, 3.86, 3.88, 3.9, 3.92, 3.94, 3.96, 3.98]

In [56]:
len(log.get_log('scores_window'))

200

In [21]:
log.add_log('tot_return')
log.log

{'tot_return': []}

In [22]:
log.add_log('avg_loss')
log.log

{'tot_return': [], 'avg_loss': []}

In [23]:
log.add_log('final_dist')
log.log

{'tot_return': [], 'avg_loss': [], 'final_dist': []}

In [24]:
log.add_item('avg_loss', 5)
log.log

{'tot_return': [], 'avg_loss': [5], 'final_dist': []}

In [26]:
log.add_item('avg_loss', 10)
log.log

{'tot_return': [], 'avg_loss': [5, 10, 10], 'final_dist': []}

In [27]:
log.get_log('avg_loss')

[5, 10, 10]

In [28]:
log.get_log('final_dist')

[]

In [30]:
class mean_log:
    def __init__(self):
        self.num = 0
        self.sum = 0
        self.mean = 0
        
    def append(self, x):
        self.num += 1
        self.sum += x
        self.mean = self.sum / self.num
        
    def get(self):
        return self.mean

In [32]:
log.get_keys()

dict_keys(['tot_return', 'avg_loss', 'final_dist'])

None
