In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.autograd 
import torch.optim as optim 

import numpy as np 
import gym 
from collections import deque 
import random
import os

In [20]:
class Critic(nn.Module):
    """State-action value network Q(s, a) with two fixed hidden layers.

    The state goes through ``fc1 -> LayerNorm -> ReLU -> fc2 -> LayerNorm``.
    The action goes through one linear layer followed by ReLU. The two
    branches are summed, passed through ReLU and projected to a single value
    by ``q``.

    Args:
        input_dim: sequence unpacked as the leading argument(s) of the first
            ``nn.Linear`` (e.g. ``[3]`` for a 3-dimensional state).
        fc1_dim: width of the first hidden layer.
        fc2_dim: width of the second hidden layer.
        n_actions: size of the action vector.
        name: base name of the checkpoint file. When ``None`` no directory is
            created and ``save_checkpoint`` / ``load_checkpoint`` do nothing.
        chkpt: directory that holds the checkpoint file.
    """

    def __init__(self, input_dim, fc1_dim, fc2_dim, n_actions, name=None, chkpt="model"):

        super(Critic, self).__init__()

        self.input_dim = input_dim
        self.fc1_dim = fc1_dim
        self.fc2_dim = fc2_dim
        self.n_actions = n_actions

        # `filename` is always defined so the checkpoint helpers can test it
        # instead of failing with AttributeError on unnamed models.
        self.filename = None
        if name is not None:
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(chkpt, exist_ok=True)
            self.filename = os.path.join(chkpt, name + '_ddpg')

        # Attribute names are kept (including the historical `bn*` names of the
        # LayerNorm modules) so existing state_dict keys stay valid.
        self.fc1 = nn.Linear(*self.input_dim, self.fc1_dim)
        self.bn1 = nn.LayerNorm(self.fc1_dim)
        self.fc2 = nn.Linear(self.fc1_dim, self.fc2_dim)
        self.bn2 = nn.LayerNorm(self.fc2_dim)
        self.action_value = nn.Linear(self.n_actions, fc2_dim)
        self.q = nn.Linear(self.fc2_dim, 1)

    def forward(self, state, action):
        """Return Q(state, action); the last dimension of the result has size 1."""
        # State branch: no ReLU after the second LayerNorm, the non-linearity
        # is applied after the merge with the action branch.
        state_value = F.relu(self.bn1(self.fc1(state)))
        state_value = self.bn2(self.fc2(state_value))

        # Action branch.
        action_value = F.relu(self.action_value(action))

        # Merge the branches and evaluate q(s, a).
        state_action_value = F.relu(torch.add(state_value, action_value))
        return self.q(state_action_value)

    def init_weights(self):
        """Re-initialise fc1, fc2 and q with uniform weights.

        fc1 and fc2 use the bound ``1 / sqrt(fan_in)``. ``nn.Linear`` stores its
        weight as ``(out_features, in_features)``, so the fan-in is
        ``in_features`` (the previous ``weight.size()[0]`` was the fan-out).
        The output layer ``q`` uses the smaller fixed bound 0.003.
        ``action_value`` keeps PyTorch's default initialisation.
        """
        f1 = 1 / np.sqrt(self.fc1.in_features)
        f2 = 1 / np.sqrt(self.fc2.in_features)
        f3 = 0.003

        for layer, bound in ((self.fc1, f1), (self.fc2, f2), (self.q, f3)):
            torch.nn.init.uniform_(layer.weight.data, -bound, bound)
            torch.nn.init.uniform_(layer.bias.data, -bound, bound)

    def save_checkpoint(self):
        """Write the state_dict to ``self.filename``; no-op when the model has no name."""
        if self.filename is None:
            return
        torch.save(self.state_dict(), self.filename)
        print("saving")

    def load_checkpoint(self):
        """Load the state_dict from ``self.filename``; no-op when the model has no name."""
        if self.filename is None:
            return
        self.load_state_dict(torch.load(self.filename))
        
        


In [32]:
# Smoke-test instance of the fixed two-layer critic: 3-d state, hidden widths 4 and 5, 2-d action.
momo = Critic(
    input_dim=[3],
    fc1_dim=4,
    fc2_dim=5,
    n_actions=2,
    name=None,
    chkpt="model",
)

In [28]:
class newCritic(nn.Module):
    """State-action value network Q(s, a) with a configurable stack of hidden layers.

    The state branch is a stack of ``Linear -> LayerNorm -> ReLU`` blocks, one
    per entry of ``hidden_layers_dims``. The action branch is a single linear
    layer followed by ReLU. The two branches are summed, passed through ReLU
    and projected to a single value by ``q``.

    Args:
        input_dim: sequence (list or tuple) holding the state dimension, e.g.
            ``[3]`` or ``(3,)``.
        hidden_layers_dims: sequence with the width of every hidden layer; the
            number of hidden layers is its length.
        n_actions: size of the action vector.
        name: base name of the checkpoint file. When ``None`` no directory is
            created and ``save_checkpoint`` / ``load_checkpoint`` do nothing.
        chkpt: directory that holds the checkpoint file.
    """

    def __init__(self, input_dim, hidden_layers_dims, n_actions, name=None, chkpt="model"):

        super(newCritic, self).__init__()

        # list(...) on both operands so a tuple input_dim (such as a shape
        # tuple) can be concatenated with the list of hidden widths.
        self.hidden_layers_dims = list(input_dim) + list(hidden_layers_dims)
        self.n_actions = n_actions

        # `filename` is always defined so the checkpoint helpers can test it.
        self.filename = None
        if name is not None:
            os.makedirs(chkpt, exist_ok=True)
            self.filename = os.path.join(chkpt, name + '_ddpg')

        # The blocks are wrapped in nn.Sequential: modules kept in a plain
        # Python list are not registered, so their weights would be missing
        # from parameters(), state_dict() and .to(device).
        blocks = []
        for dim_in, dim_out in zip(self.hidden_layers_dims[:-1], self.hidden_layers_dims[1:]):
            blocks.append(nn.Linear(dim_in, dim_out))
            blocks.append(nn.LayerNorm(dim_out))
            blocks.append(nn.ReLU())
        self.hidden_layers = nn.Sequential(*blocks)

        last_hidden_layer_dim = self.hidden_layers_dims[-1]

        # linear layer from the action directly to the width of the last hidden layer
        self.action_value = nn.Linear(self.n_actions, last_hidden_layer_dim)

        # final layer returning the quality function q(s, a)
        self.q = nn.Linear(last_hidden_layer_dim, 1)

    def forward(self, state, action):
        """Return Q(state, action); the last dimension of the result has size 1."""
        # first branch: state through every hidden block (the last block is
        # ReLU-activated as well, before the merge)
        state_value = self.hidden_layers(state)

        # second branch: action to the width of the last hidden layer
        action_value = F.relu(self.action_value(action))

        # merge of the two branches, then evaluation of q(s, a)
        state_action_value = F.relu(torch.add(state_value, action_value))
        return self.q(state_action_value)

    def init_weights(self):
        """Re-initialise only the output layer ``q``, uniformly in [-0.003, 0.003].

        This is smaller than the 1/sqrt(input_dim) rule would suggest; every
        other layer keeps PyTorch's default initialisation.
        """
        init_weights_q = 0.003
        torch.nn.init.uniform_(self.q.weight.data, -init_weights_q, init_weights_q)
        torch.nn.init.uniform_(self.q.bias.data, -init_weights_q, init_weights_q)

    def save_checkpoint(self):
        """Write the state_dict to ``self.filename``; no-op when the model has no name."""
        if self.filename is None:
            return
        torch.save(self.state_dict(), self.filename)
        print("saving")

    def load_checkpoint(self):
        """Load the state_dict from ``self.filename``; no-op when the model has no name."""
        if self.filename is None:
            return
        self.load_state_dict(torch.load(self.filename))
        
        


In [45]:
# Smoke-test instance of the configurable critic: 3-d state, five hidden layers, 2-d action.
newmomo = newCritic(
    input_dim=[3],
    hidden_layers_dims=[4, 5, 10, 2, 4],
    n_actions=2,
    name=None,
    chkpt="model",
)

In [46]:
# dtype of the first critic layer's weights.
# `momo` must still be the Critic instance here: a later cell rebinds `momo` to an Actor, which has no `fc1`.
momo.fc1.weight.dtype

torch.float32

In [47]:
# dtypes of the test state `x` and test action `a`.
# NOTE: both tensors are created in the cell below, so that cell has to run first on a fresh kernel.
(x.dtype, a.dtype)

(torch.float32, torch.float32)

In [48]:
# One un-batched state (3 values) and one action (2 values), evaluated by the fixed-size critic.
x = torch.tensor([0.0, -0.2, 0.3])
a = torch.tensor([1.0, 0.0])
momo(x, a)

tensor([0.0172], grad_fn=<ViewBackward0>)

In [49]:
# Same state/action pair, evaluated by the configurable critic.
newmomo(x, a)

tensor([1.1539], grad_fn=<ViewBackward0>)

# NEW ACTOR

In [34]:


class Actor(nn.Module):
    """Deterministic policy network mu(s) with a configurable stack of hidden layers.

    The state goes through one ``Linear -> LayerNorm -> ReLU`` block per entry
    of ``hidden_layers_dims``, then through a final linear layer of width
    ``n_actions`` whose output is squashed by ``tanh``.

    Args:
        input_dim: sequence (list or tuple) holding the state dimension, e.g.
            ``[3]`` or ``(3,)``.
        hidden_layers_dims: sequence with the width of every hidden layer; the
            number of hidden layers is its length.
        n_actions: size of the action vector.
        name: base name of the checkpoint file. When ``None`` no directory is
            created and ``save_checkpoint`` / ``load_checkpoint`` do nothing.
        chkpt: directory that holds the checkpoint file.
    """

    def __init__(self, input_dim, hidden_layers_dims, n_actions, name=None, chkpt="model"):

        super(Actor, self).__init__()

        # list(...) on both operands so a tuple input_dim (such as a shape
        # tuple) can be concatenated with the list of hidden widths.
        self.hidden_layers_dims = list(input_dim) + list(hidden_layers_dims)
        self.n_actions = n_actions

        # `filename` is always defined so the checkpoint helpers can test it.
        self.filename = None
        if name is not None:
            os.makedirs(chkpt, exist_ok=True)
            self.filename = os.path.join(chkpt, name + '_ddpg')

        # hidden blocks are linear + layernorm + relu
        self.hidden_layers_list = []
        for dim_in, dim_out in zip(self.hidden_layers_dims[:-1], self.hidden_layers_dims[1:]):
            self.hidden_layers_list.extend(
                [nn.Linear(dim_in, dim_out), nn.LayerNorm(dim_out), nn.ReLU()]
            )

        # final linear layer from the last hidden layer to the action
        last_hidden_layer_dim = self.hidden_layers_dims[-1]
        self.hidden_layers_list.append(nn.Linear(last_hidden_layer_dim, self.n_actions))

        # nn.Sequential registers the layers so their parameters are visible
        # to the optimizer, state_dict() and .to(device)
        self.hidden_layers = nn.Sequential(*self.hidden_layers_list)

    def forward(self, state):
        """Return the action mu(state), each component squashed into [-1, 1] by tanh."""
        return torch.tanh(self.hidden_layers(state))

    def init_weights(self):
        """Re-initialise only the final action layer, uniformly in [-0.003, 0.003].

        This is smaller than the 1/sqrt(input_dim) rule would suggest; every
        other layer keeps PyTorch's default initialisation.
        """
        init_weights_mu = 0.003
        # The output layer is the last module of the Sequential.
        # `hidden_layers_dims` only holds integer widths, not layers.
        output_layer = self.hidden_layers[-1]
        torch.nn.init.uniform_(output_layer.weight.data, -init_weights_mu, init_weights_mu)
        torch.nn.init.uniform_(output_layer.bias.data, -init_weights_mu, init_weights_mu)

    def save_checkpoint(self):
        """Write the state_dict to ``self.filename``; no-op when the model has no name."""
        if self.filename is None:
            return
        torch.save(self.state_dict(), self.filename)
        print("saving")

    def load_checkpoint(self):
        """Load the state_dict from ``self.filename``; no-op when the model has no name."""
        if self.filename is None:
            return
        self.load_state_dict(torch.load(self.filename))

In [35]:
# Smoke-test instance of the actor: 3-d state, five hidden layers, 2-d action.
# NOTE: this rebinds `momo`, which an earlier cell bound to a Critic.
momo = Actor(
    input_dim=[3],
    hidden_layers_dims=[4, 5, 10, 2, 4],
    n_actions=2,
    name=None,
    chkpt="model",
)

In [36]:
# One un-batched state (3 values) mapped to an action by the actor.
x = torch.tensor([0.0, -0.2, 0.3])
momo(x)

tensor([-0.5513, -0.5565], grad_fn=<TanhBackward0>)

# New DDPGAgent

In [None]:
class DDPGagent:
    """Agent trained with Deep Deterministic Policy Gradient (DDPG).

    Holds an actor, a critic, one target copy of each, a replay memory and
    the two Adam optimizers.

    Args:
        env: environment; only ``observation_space.shape`` and
            ``action_space.shape[0]`` are read.
        hidden_layers_dims: sequence with the hidden layer widths, shared by
            the actor and critic networks.
        replay_min: minimum number of stored transitions before ``train``
            performs an update.
        replay_size: capacity passed to ``Replay_Memory``.
        critic_lr: learning rate of the critic optimizer.
        actor_lr: learning rate of the actor optimizer.
        tau: interpolation factor of the soft target update done by ``train``.
        gamma: discount factor.
        loss: criterion applied to (critic value, target) pairs.
        batch_size: number of transitions sampled per ``train`` call.
        name_critic: checkpoint base name of the critic; the target critic
            uses the same name with the suffix ``"_target"``.
        name_actor: checkpoint base name of the actor; the target actor uses
            the same name with the suffix ``"_target"``.
        device: torch device the networks and batches are moved to.
        directory: checkpoint directory handed to the networks.

    ``Replay_Memory``, ``newCritic`` and ``Actor`` are not defined inside this
    class and must already be available in the notebook namespace.
    """
    def __init__(self,
                 env,
                 hidden_layers_dims,
                 replay_min=100,
                 replay_size=1000000,
                 critic_lr=0.00015,
                 actor_lr=0.000015,
                 tau =0.001,
                 gamma=0.99,
                 loss=nn.MSELoss(),
                 batch_size=64,
                 name_critic=None,
                 name_actor=None,
                 device = "cpu",
                 directory = "models"):

        # "reads" the environment
        self.env = env
        self.input_dim = env.observation_space.shape
        self.n_actions = env.action_space.shape[0]
        self.tau = tau
        self.device = device
        self.gamma = gamma

        # replay memory and sampling settings
        self.batch_size = batch_size
        self.memory= Replay_Memory(replay_size)
        self.replay_min = replay_min

        # names used for folder creation and checkpoint saves
        self.name_critic = name_critic
        self.name_actor = name_actor
        name_target_critic = None if name_critic is None else name_critic + "_target"
        name_target_actor = None if name_actor is None else name_actor + "_target"

        # The networks concatenate input_dim with the hidden widths, so both
        # are handed over as lists (observation_space.shape is a tuple).
        state_dims = list(self.input_dim)
        layer_dims = list(hidden_layers_dims)

        # critic and target critic; newCritic is the critic class whose
        # signature takes a list of hidden widths
        self.critic = newCritic(state_dims, layer_dims, self.n_actions,
                                name=name_critic, chkpt=directory).to(device)
        self.target_critic = newCritic(state_dims, layer_dims, self.n_actions,
                                       name=name_target_critic, chkpt=directory).to(device)

        # actor and target actor
        self.actor = Actor(state_dims, layer_dims, self.n_actions,
                           name=name_actor, chkpt=directory).to(device)
        self.target_actor = Actor(state_dims, layer_dims, self.n_actions,
                                  name=name_target_actor, chkpt=directory).to(device)

        # initialization of the output-layer weights
        self.critic.init_weights()
        self.actor.init_weights()

        # default tau=1: the targets start as exact copies of the learned networks
        self.update_target_weights()

        self.critic_criterion = loss
        self.actor_criterion = loss

        # initialization of optimizers
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),lr=critic_lr,weight_decay=0.01)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),lr=actor_lr)

    # explicit decay of learning rates
    def update_critic_optimizer(self, learning_rate):
        """Set a new critic learning rate.

        The rate is changed in place, so Adam's moment estimates and the
        configured ``weight_decay`` are kept.
        """
        for group in self.critic_optimizer.param_groups:
            group["lr"] = learning_rate

    def update_actor_optimizer(self, learning_rate):
        """Set a new actor learning rate, keeping Adam's moment estimates."""
        for group in self.actor_optimizer.param_groups:
            group["lr"] = learning_rate

    def update_replay_memory(self,state, action, reward, next_state, done):
        """ Adds single experience to the memory buffer.
        Receives s,a,r,s',done
        """
        self.memory.push(state, action, reward, next_state, done)

    def update_target_weights(self,tau=1):
        """ Moves the target networks towards the learned networks.

        target <- tau * learned + (1 - tau) * target. The default ``tau=1`` is
        a hard copy; ``train`` passes ``self.tau`` for the soft update.
        """
        network_pairs = ((self.target_critic, self.critic),
                         (self.target_actor, self.actor))
        for target_net, net in network_pairs:
            for target_param, param in zip(target_net.parameters(), net.parameters()):
                # the same `tau` weights both terms so the factors sum to 1
                target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

    def get_action(self, observation):
        """ From state (observation) to the deterministic action, as a numpy array.

        No exploration noise is added here.
        """
        self.actor.eval()

        observation = torch.as_tensor(observation, dtype=torch.float).to(self.device)
        # pure inference: no autograd graph is needed
        with torch.no_grad():
            actor_action = self.actor(observation)
        return actor_action.cpu().numpy()

    def train(self):
        """ One critic update, one actor update and one soft target update.

        Does nothing until the memory holds at least ``replay_min`` transitions.
        """
        # training starts only after some sampling has been done
        if len(self.memory) <  self.replay_min:
            return

        # randomly sampled experience from past.
        states, actions, rewards, next_states, not_done = self.memory.sample(self.batch_size)

        states = torch.tensor(np.array(states), dtype = torch.float).to(self.device)
        actions = torch.tensor(np.array(actions), dtype = torch.float).to(self.device)
        next_states = torch.tensor(np.array(next_states), dtype = torch.float).to(self.device)
        # Column vectors (B, 1) matching the critic output. A 1-D reward vector
        # added to a (B, 1) tensor would broadcast to (B, B).
        # NOTE(review): assumes one scalar reward / not_done flag per transition - confirm against Replay_Memory.
        rewards = torch.tensor(np.array(rewards), dtype = torch.float).to(self.device).reshape(-1, 1)
        not_done = torch.tensor(np.array(not_done), dtype = torch.float).to(self.device).reshape(-1, 1)

        self.target_actor.eval()
        self.target_critic.eval()

        # a' = mu_target(s'),  Q^exp = r + gamma * not_done * Q_target(s', a')
        # The targets are constants for the critic loss, so no graph is built
        # through the target networks.
        with torch.no_grad():
            target_actions = self.target_actor(next_states)
            target_critic_value = self.target_critic(next_states, target_actions)
            targets = rewards + self.gamma * not_done * target_critic_value

        # critic step: if MSE, loss = (r + gamma Q(s', a') - Q(s,a))^2
        self.critic.train()
        self.critic_optimizer.zero_grad()
        critic_value = self.critic(states, actions)
        loss = self.critic_criterion(critic_value, targets)
        loss.backward()
        self.critic_optimizer.step()
        self.critic.eval()

        # actor step: loss = -mean Q(s, mu(s)). Gradients that reach the critic
        # here are cleared by critic_optimizer.zero_grad() in the next call.
        self.actor.train()
        self.actor_optimizer.zero_grad()
        mu = self.actor(states)
        actor_loss = torch.mean(-self.critic(states, mu))
        actor_loss.backward()
        self.actor_optimizer.step()

        self.update_target_weights(self.tau)

    def save_model(self):
        """ Saves all models' checkpoints in folder given by names
        """
        self.actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_model(self):
        """ Loads all models' checkpoints in folder given by names
        """
        self.actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic.load_checkpoint()
        
        
        
        

In [15]:
# Read the deck: one card per line of the text file.
# The context manager closes the file handle as soon as the content is read.
with open("../../wavelength_cards.txt", "r") as f:
    file_data = f.read()
list_cards = file_data.splitlines()

In [16]:
import random

@bot.message_handler(commands=['carta'])
def send_welcome(message):
    """Reply to a 'carta' command with a randomly drawn card.

    The drawn card is removed from ``list_cards``, so it is not dealt again.
    When the deck is empty a short notice is sent instead.
    """
    if not list_cards:
        # nothing left to draw: randrange(0) would raise ValueError
        bot.reply_to(message, "Carte finite!")
        return
    # randrange(n) returns an index in [0, n), valid for pop();
    # random.randint needs two bounds and includes the upper one.
    index_card = random.randrange(len(list_cards))
    card = list_cards.pop(index_card)
    bot.reply_to(message, "Eccoti: \n"+card)

TypeError: 'module' object is not callable

In [17]:
# NOTE(review): this rebinds the name `input`, hiding Python's built-in input() for the rest of the kernel session.
input = (3,)

# NEW TD3

In [None]:
class CriticNetwork(nn.Module):
    """Critic Q(s, a) in which the action is concatenated to the input of every hidden layer.

    Each hidden layer is ``Linear(previous_width + n_actions, width)`` followed
    by ReLU; the first layer receives ``[state, action]``. A final linear layer
    ``q1`` maps the last hidden activation to a single value.

    Args:
        input_dims: sequence holding the state dimension, e.g. ``(3,)``.
        hidden_layers_dims: sequence with the width of every hidden layer; the
            number of hidden layers is its length (at least one).
        n_actions: size of the action vector.
        name: base name of the checkpoint file. When ``None`` no directory is
            created and ``save_checkpoint`` / ``load_checkpoint`` do nothing.
        chkpt_dir: directory that holds the checkpoint file.

    Raises:
        ValueError: if ``hidden_layers_dims`` is empty.
    """

    def __init__(self, input_dims, hidden_layers_dims, n_actions, name= None, chkpt_dir = "save_m_2"):

        super(CriticNetwork, self).__init__ ()

        hidden_layers_dims = list(hidden_layers_dims)
        if not hidden_layers_dims:
            raise ValueError("hidden_layers_dims must contain at least one layer width")

        self.name = name

        # path of the checkpoint file; stays None for unnamed models
        self.checkpoint_file = None
        if name is not None:
            os.makedirs(chkpt_dir, exist_ok=True)
            self.checkpoint_file = os.path.join(chkpt_dir, name + '_td3')

        self.input_dims = input_dims
        # number of hidden layers is inferred from the length of the list
        self.hidden_layers_dims = list(input_dims) + hidden_layers_dims
        self.n_actions = n_actions

        # nn.ModuleList registers the layers so their weights reach
        # parameters() and state_dict(). Every layer takes n_actions extra
        # inputs because forward() concatenates the action to its input.
        self.hidden_layers = nn.ModuleList([
            nn.Linear(dim_in + n_actions, dim_out)
            for dim_in, dim_out in zip(self.hidden_layers_dims[:-1], self.hidden_layers_dims[1:])
        ])

        # scalar value of the critic (state-action value)
        self.q1 = nn.Linear(self.hidden_layers_dims[-1], 1)

    def forward(self, state, action):
        """Return Q(state, action).

        The concatenation is done along dim=1, so both inputs are batched
        2-D tensors; the result has one column.
        """
        q1_action_value = state
        for hidden_layer in self.hidden_layers:
            # action re-injected at every layer
            q1_action_value = torch.cat([q1_action_value, action], dim=1)
            q1_action_value = F.relu(hidden_layer(q1_action_value))

        q1 = self.q1(q1_action_value)

        return q1

    def save_checkpoint(self):
        """Write the state_dict to ``self.checkpoint_file``; no-op when the model has no name."""
        if self.name is not None:

            print("...saving...")

            torch.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state_dict from ``self.checkpoint_file``; no-op when the model has no name."""
        if self.name is not None:

            print("..loading...")

            self.load_state_dict(torch.load(self.checkpoint_file))