# Entraînement du réseau de neurones via Transfer Learning de CamemBERT
### Avant de commencer, quelques bonnes pratiques lorsqu'on commence à modéliser un réseau via torch

1. Vérifier que l'on indique bien le suivi des inputs dans nos modèles
2. Vérifier que le gradient vient bien modifier les poids
3. Vérifier que la fonction de perte se re-propage correctement

In [13]:
#!pip install torch
#!pip install sklearn
import torch.nn as nn
import sklearn
import torch
import pickle
import os
from tqdm import tqdm
from joblib import Parallel, delayed
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.optim as optim

torch.cuda.device_count()

2

In [None]:
dico_train=pickle.load(open('dico_train_1.pickle','rb'))

train_input_ids=dico_train['input']
train_mask=dico_train['mask']
clss=dico_train['clss']
train_mask_cls=dico_train['mask_cls']
train_output=dico_train['output']
#clss_index_train=[len(i) for i in dico_train['clss']]

In [None]:
ouais=torch.as_tensor([(train_output[i]!=torch.tensor(0)).nonzero().size()[0] for i in range(len(train_output))])
v=((train_mask_cls.sum(dim=1)>ouais)==True).nonzero()

def correct_mask_cls(input_ids):
    vec=(torch.as_tensor(input_ids)==torch.tensor(5)).nonzero()
    mask=torch.zeros(torch.as_tensor(input_ids).size())
    mask[vec]=1
    return mask

cpu_max=30
train_mask_cls_2=Parallel(cpu_max)(delayed(correct_mask_cls)(train_input_ids[i]) for i in range(len(train_input_ids)))
train_mask_cls_2=torch.stack(train_mask_cls_2)

v=((train_mask_cls_2.sum(dim=1)>ouais)==True).nonzero()

import numpy as np
np.sum([int(train_mask_cls_2[i].sum())==(train_output[i]!=torch.tensor(0)).nonzero().size(0) for i in range(len(train_output))])/len(train_output)

In [None]:
out=torch.zeros(train_mask_cls_2.shape,dtype=torch.float64)

x=(train_output!=torch.tensor(0)).nonzero()
dim_1=torch.unique(torch.stack([x[i][0] for i in range(len(x))]))

x_2=torch.index_select(x,1,torch.tensor(1)).reshape(-1)
x_1=(x_2==0).nonzero()

dim_2=[]
from tqdm import tqdm
for k in tqdm(range(len(x_1))):
    if k<(len(x_1)-1):
        dim_2.append(x_2[x_1[k]:x_1[k+1]])
    else:
        dim_2.append(x_2[x_1[k]:])
    
for k in tqdm(range(len(dim_1))):
    out[k,(train_mask_cls_2[k]!=torch.tensor(0)).nonzero().squeeze(1)]=train_output[dim_1[k],dim_2[k]]

In [None]:
train_dataset = TensorDataset(
    torch.tensor(train_input_ids),
    torch.tensor(train_mask),
    clss,
    train_mask_cls_2,
    out)

pickle.dump(train_dataset,open('train_dataset.pickle','wb'))

In [2]:
train_dataset=pickle.load(open('train_dataset_1.pickle','rb'))

## Modèle :
On crée nos différents modèles

In [3]:
class Simple_Classifier(nn.Module):
    def __init__(self, hidden_size):
        super(Simple_Classifier, self).__init__()
        self.linear1 = nn.Linear(hidden_size, 1)
        self.relu=nn.LeakyReLU(negative_slope= 0.01)

    def forward(self, x, mask_cls=None):
        x.requires_grad_(True)
        h = self.linear1(x).squeeze(-1)
        if mask_cls!=None:
            h=torch.mul(h,mask_cls)
        sent_scores = self.relu(h) #* mask_cls.float()
        return sent_scores.squeeze(-1)

import torch.nn.functional as F

class Multi_Linear_Classifier(nn.Module):
    def __init__(self, hidden_size):
        super(Multi_Linear_Classifier, self).__init__()
        self.linear1 = nn.Linear(hidden_size, int(hidden_size/2))
        self.linear2 = nn.Linear(int(hidden_size/2),int(hidden_size/6))
        self.linear3 = nn.Linear(int(hidden_size/6),1)
        self.Lrelu=nn.LeakyReLU(negative_slope= 0.01)
        self.softmax=nn.Softmax(dim=-1)


    def forward(self, x,mask_cls=None):#, mask_cls):
        x.requires_grad_(True)
        h = self.linear1(x).squeeze(-1)
        if mask_cls!=None:
            h=torch.mul(h,mask_cls)
        h = self.softmax(h)#self.Lrelu(h) #* mask_cls.float()
        h = self.linear2(h)
        h = self.softmax(h)#self.Lrelu(h)
        h = self.linear3(h)
        #h = self.softmax(h)#self.Lrelu(h)
        return h.squeeze(-1)
    

class SMHA_classifier(nn.Module):
    def __init__(self, size,nhead):
        super(SMHA_classifier, self).__init__()
        self.MHA = nn.MultiheadAttention(size[1], nhead)
        self.LReLu=nn.LeakyReLU(negative_slope= 0.01)
        self.sigmoid = nn.Sigmoid()
        self.LN=nn.LayerNorm(size)

    def forward(self, x, mask_cls=None):
        x.requires_grad_(True)
        h,weights = self.MHA(x,x,x)
        if mask_cls!=None:
            h=torch.mul(h,mask_cls)
        normalized_h=self.LN(h)
        sent_scores = self.LReLu(normalized_h) #* mask_cls.float()
        return sent_scores.mean(dim=2)
    
class SMHA_Linear_classifier(nn.Module):
    def __init__(self, size,nhead,hidden_size):
        super(SMHA_Linear_classifier, self).__init__()
        self.MHA = nn.MultiheadAttention(size[1], nhead)
        self.LReLu=nn.LeakyReLU(negative_slope= 0.01)
        self.sigmoid = nn.Sigmoid()
        self.LN=nn.LayerNorm(size)
        self.linear1 = nn.Linear(hidden_size, int(hidden_size/2))
        self.linear2 = nn.Linear(int(hidden_size/2),int(hidden_size/6))

    def forward(self, x, mask_cls=None):
        x.requires_grad_(True)
        h,weights = self.MHA(x,x,x)
        h=self.LN(h)
        h=self.linear1(h)
        h=self.LReLu(h)
        h=self.linear2(h)
        if mask_cls!=None:
            h=torch.mul(h,mask_cls)
        sent_scores = self.LReLu(h) #* mask_cls.float()
        return sent_scores.mean(dim=2) 
    
def select_sent(phrase,clss,K=3):
    index_phrase=torch.topk(phrase,K)[1]
    pred_phrase=torch.zeros(clss.shape)
    index_1=[[clss[k].tolist().index(int(index_phrase[k][i])) 
                for i in range(len(index_phrase[k])) if int(index_phrase[k][i]) in clss[k].tolist()]
                 for k in range(K)]
    index_2=[[i] for i in range(K)]
    pred_phrase[index_2,index_1]=torch.ones(index_phrase.shape)
    return pred_phrase


from sklearn.metrics import confusion_matrix
def confusion_output(sent,output,clss_index):
    a=[confusion_matrix(output[i],sent[i]) for i in range(sent.shape[0])]
    c=[a[i][0][0]+a[i][1][1] for i in range(sent.shape[0])]
    score_total=[round((c[i]-(512-clss_index[i]))/(clss_index[i]),3) for i in range(sent.shape[0])]
    score_1=[(a[i][1][1])/3 for i in range(sent.shape[0])]
    return score_total,score_1


class Summarizer(nn.Module):
    def __init__(self, device,classif):#args, , load_pretrained_bert = False, bert_config = None):
        super(Summarizer, self).__init__()
        self.device = device
        self.bert =CamembertModel(CamembertConfig())#.from_pretrained("camembert-base")
        #BertModel.from_pretrained('bert-base-uncased')
        #Bert(args.temp_dir, load_pretrained_bert, bert_config)
        self.simple_classif = Simple_Classifier(self.bert.config.hidden_size)
        self.multi_classif = Multi_Linear_Classifier(self.bert.config.hidden_size)
        self.select_sent=select_sent
        self.softmax=nn.Softmax()
        self.attclassif=SMHA_classifier(torch.Size([self.bert.config.max_position_embeddings-2,self.bert.config.hidden_size]),8)
        self.loss=nn.functional.binary_cross_entropy
        self.classif_type=classif
        # self.score=confusion_output
        self.to(device)

    def forward(self,x,mask, mask_cls):#,clss,output,k=3):#,segs):#, sentence_range=None): #segs, 
        #x input_ids
        #Segs = Segment pour phrases (0 ou 1), marche pas dans un RoBERTa
        #clss index du début des phrases 
        #mask_cls vecteur pour passer de l'embedding au cls, en gros sélectionne le bon index des vecteurs de l'embedding qu'on va utiliser pour faire la classif
        top_vec= self.bert(x, mask)#, segs)
        # sents_vec=self.sent_vec(last,clss)

        # sents_vec = top_vec[0][torch.arange(top_vec[0].size(0)).unsqueeze(1), clss]
        # sents_vec = sents_vec * mask_cls[:, :, None].float()
        if self.classif_type=='simple_linear':
            sent_scores = self.simple_classif(top_vec.last_hidden_state)#, mask_cls
        elif self.classif_type=='multi_linear':
            sent_scores = self.multi_classif(top_vec.last_hidden_state)
        elif self.classif_type=='attention':
            sent_scores = self.attclassif(top_vec.last_hidden_state)
            sent_scores=sent_scores.mean(dim=2)
        else:
            raise ValueError("Attention, veuillez bien spécifier un type de classifieur.\nSeules les valeurs 'simple_linear', 'multi_linear' ou 'attention' sont acceptées.")
        #sent_scores_masked = torch.mul(sent_scores,mask_cls)
        #sent_scores_masked = self.select_sent(sent_scores_masked,clss,K=k)
        #sent_scores_masked=
        
        # score=self.score(sent_scores,output)
        return sent_scores#_masked#,sent_scores,top_vec.last_hidden_state#,score#, mask_cls

In [4]:
class Summarizer_2(nn.Module):
    def __init__(self, device,classif,mul=False):#args, , load_pretrained_bert = False, bert_config = None):
        super(Summarizer_2, self).__init__()
        self.device = device
#        self.bert =CamembertModel.from_pretrained("camembert-base")
        #BertModel.from_pretrained('bert-base-uncased')
        #Bert(args.temp_dir, load_pretrained_bert, bert_config)
        self.simple_classif = Simple_Classifier(768)
        self.multi_classif = Multi_Linear_Classifier(768)
        self.select_sent=select_sent
        self.softmax=nn.Softmax()
        self.attclassif=SMHA_classifier(torch.Size([512,768]),8)
        self.attlinear=SMHA_Linear_classifier(torch.Size([512,768]),8,768)
        self.loss=nn.functional.binary_cross_entropy
        self.classif_type=classif
        # self.score=confusion_output
        self.to(device)
        self.mul=mul

    def forward(self,top_vec,maks_cls=None):#,clss,output,k=3):#,segs):#, sentence_range=None): #segs, 
        #x input_ids
        #Segs = Segment pour phrases (0 ou 1), marche pas dans un RoBERTa
        #clss index du début des phrases 
        #mask_cls vecteur pour passer de l'embedding au cls, en gros sélectionne le bon index des vecteurs de l'embedding qu'on va utiliser pour faire la classif
        #top_vec= self.bert(x, mask)#, segs)
        # sents_vec=self.sent_vec(last,clss)

        # sents_vec = top_vec[0][torch.arange(top_vec[0].size(0)).unsqueeze(1), clss]
        # sents_vec = sents_vec * mask_cls[:, :, None].float()
        topvec.requires_grad_(True)
        if self.classif_type=='simple_linear':
            if self.mul==True:
                sent_scores = self.simple_classif(top_vec,mask_cls).squeeze(-1)#, mask_cls
            else:
                sent_scores = self.simple_classif(top_vec).squeeze(-1)#, mask_cls
        
        elif self.classif_type=='multi_linear':
            if self.mul==True:
                sent_scores = self.multi_classif(top_vec,mask_cls).squeeze(-1)
            else:
                sent_scores = self.multi_classif(top_vec).squeeze(-1)

        elif self.classif_type=='attention':
            if self.mul==True:
                sent_scores = self.attclassif(top_vec,mask_cls)
                sent_scores=sent_scores.mean(dim=2)
            else:
                sent_scores = self.attclassif(top_vec)
                sent_scores=sent_scores.mean(dim=2)
        elif self.classif_type=='attention_linear':
            if self.mul==True:
                sent_scores = self.attlinear(top_vec,mask_cls)
                sent_scores=sent_scores.mean(dim=2)
            else:
                sent_scores = self.attlinear(top_vec)
                sent_scores=sent_scores.mean(dim=2)
        else:
            raise ValueError("Attention, veuillez bien spécifier un type de classifieur.\nSeules les valeurs 'simple_linear', 'multi_linear' ou 'attention' sont acceptées.")
        #sent_scores_masked = torch.mul(sent_scores,mask_cls)
        #sent_scores_masked = self.select_sent(sent_scores_masked,clss,K=k)
        #sent_scores_masked=
        
        # score=self.score(sent_scores,output)
        return sent_scores#_masked#,sent_scores,top_vec.last_hidden_state#,score#, mask_cls

On installe et charge le modèle Camembert

In [None]:
!pip install transformers

In [5]:
from transformers import CamembertModel,CamembertConfig,AdamW
camem1=CamembertModel(CamembertConfig())
camem2=CamembertModel.from_pretrained("camembert-base")

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Score 
On crée nos fonctions de score : précision, rappel et score F1

In [6]:
class F1_score:
    """
    Class for f1 calculation in Pytorch.
    """

    def __init__(self):#, average: str = 'weighted'):
        """
        Init.

        Args:
            average: averaging method
        """

        #self.average = average
        #if average not in [None, 'micro', 'macro', 'weighted']:
         #   raise ValueError('Wrong value of average parameter')
    @staticmethod
    def true_positive_mean(x,y) -> torch.tensor:
        '''
        Caclul le nombre moyen de vrai positif de la prediction x par rapport aux labels y (binaires).
        '''
        tp=torch.mul(x,y).sum()
        tpm=torch.div(tp,y.shape[0])
        return tpm
    @staticmethod
    def false_positive_mean(x,y) -> torch.tensor:
        '''
        Caclul le nombre moyen de faux négatif de la prediction x par rapport aux labels y (binaires).
        '''
        device=y.device
        fp=torch.sub(x,y)
        fp=torch.max(fp,torch.tensor([0.]).to(device))
        fp=fp.sum().float()
        fpm=torch.div(fp,y.shape[0])
        return fpm
    @staticmethod
    def false_negative_mean(x,y) -> torch.tensor:
        '''
        Caclul le nombre moyen de faux négatif de la prediction x par rapport aux labels y (binaires).
        '''
        fn=torch.sub(y,x)
        device=y.device
        fn=torch.max(fn,torch.tensor([0.]).to(device))
        fn=fn.sum().float()
        fnm=torch.div(fn,y.shape[0])
        return fnm
    #@staticmethod
    def precision(self,x,y) -> torch.tensor:
        device=y.device
        tp=self.true_positive_mean(x,y)
        fp=self.false_positive_mean(x,y)
        if (tp+fp)!=0:
            prec=torch.div(tp,(tp+fp))
            return prec
        else:
            return torch.tensor(0.).to(device)

    def recall(self,x,y) -> torch.tensor:
        tp=self.true_positive_mean(x,y)
        fn=self.false_negative_mean(x,y)
        rec=torch.div(tp,(tp+fn))
        return rec
    def __call__(self,x,y) -> torch.tensor:
        device=y.device
        rec=self.recall(x,y)
        prec=self.precision(x,y)
        f1=torch.mul(rec,prec)
        f1=torch.mul(2,f1)
        f1=torch.div(f1,prec+rec)
        if (prec+rec)!=0:
            return f1#prec,rec,
        else:
            return torch.tensor(0.).to(device)#prec,rec,

        
            

## Fonction de perte
On va maintenant définir notre fonction de perte

In [7]:
class Weighted_Loss:
    '''
    Fonction permettant de calculer la fonction de perte Mean Absolute Error mais pondérée par des poids.
    '''
    def __init__(self,weight,loss_type='L1',binary=True):
        '''
        On initialise notre fonction de perte :
        @weight : les poids que vous voulez pour chaque classe (dim=nombre de classe)
        '''
        self.weights=weight
        self.loss_type=loss_type
        self.binary=binary
        
    def Weighted_L1(self,y_hat,y) -> torch.Tensor:
        '''
        On calcule la fonction :
        @y_hat : les prédictions du modèle
        @y : les vraies valeurs
        
        Attention, dim(y_hat)==dim(y)
        '''
        if y_hat.shape!=y.shape:
            raise ValueError("Attention, les deux inputs n'ont pas la même dimension !")
        #On met les deux tensors sur le même service (ici GPU)
        device_yhat=y_hat.device
        device_y=y.device
        if device_yhat!=device_y:
            y.to(device_yhat)
        
        w=torch.repeat_interleave(torch.tensor(self.weights[0]),y.shape[1])
        w=w.repeat(y.shape[0],1)
                
        if self.binary:
            w[torch.arange(y.shape[0],dtype=torch.long).unsqueeze(1),torch.topk(y,3)[1]]=self.weights[1]
        
        else: #On surpondère les indices qui représentent les phrases, puisque c'est cela que le modèle doit prédire
            x=(y!=torch.tensor(0)).nonzero()
            x_2=torch.index_select(x,1,torch.tensor(1).to(device_yhat)).reshape(-1).to(device_yhat)
            x_1=(x_2==0).nonzero().to(device_yhat)
            sha=torch.arange(y.shape[0],dtype=torch.long).unsqueeze(1).to(device_yhat)
            
            for k in range(len(x_1)):
                if k<(len(x_1)-1):
                    w[sha[k],x_2[x_1[k]:x_1[k+1]]]=self.weights[1]
                else:
                    w[sha[k],x_2[x_1[k]:]]=self.weights[1]
               
        sum_weights=w.sum()
        w=w.to(device)
        sum_weights=sum_weights.to(device)
        errors=torch.sub(y,y_hat)
        errors=torch.abs(errors)
        weighted_errors=torch.mul(w,errors)
        sum_weighted_errors=weighted_errors.sum()
        WMAE=torch.div(sum_weighted_errors,sum_weights)
        #WMAE.requires_grad=True
        return Variable(WMAE,requires_grad=True)#,sum_weighted_errors,sum_weights
    
    def Weighted_Sum(self,y_hat,y) -> torch.Tensor:
        '''
        Calcule la somme pondérée de la différence de la prédiction du modèle et du vecteur cible.
        '''
        if y_hat.shape!=y.shape:
            raise ValueError("Attention, les deux inputs n'ont pas la même dimension !")
        
        #On met les deux tensors sur le même service (ici GPU)
        device_yhat=y_hat.device
        device_y=y.device
        if device_yhat!=device_y:
            y.to(device_yhat)
        
        w=torch.repeat_interleave(torch.tensor(self.weights[0]),y.shape[1])
        w=w.repeat(y.shape[0],1)
        
        if self.binary:
            w[torch.arange(y.shape[0],dtype=torch.long).unsqueeze(1),torch.topk(y,3)[1]]=self.weights[1]
        
        else: #On surpondère les indices qui représentent les phrases, puisque c'est cela que le modèle doit prédire
            x=(y!=torch.tensor(0)).nonzero()
            x_2=torch.index_select(x,1,torch.tensor(1).to(device_yhat)).reshape(-1).to(device_yhat)
            x_1=(x_2==0).nonzero().to(device_yhat)
            sha=torch.arange(y.shape[0],dtype=torch.long).unsqueeze(1).to(device_yhat)
            
            for k in range(len(x_1)):
                if k<(len(x_1)-1):
                    w[sha[k],x_2[x_1[k]:x_1[k+1]]]=self.weights[1]
                else:
                    w[sha[k],x_2[x_1[k]:]]=self.weights[1]
                    
        w=w.to(device_yhat)
        y_diff=torch.abs(torch.sub(y,y_hat))
        y_diff_pond=torch.mul(y_diff,w)
        sum_y_diff_pon=torch.div(torch.sum(y_diff_pond),y_hat.shape[0])
        return Variable(sum_y_diff_pon,requires_grad=True)
    
    def __call__(self,y_hat,y) -> torch.Tensor:
        if self.loss_type=='L1':
            loss=self.Weighted_L1(y_hat,y)
            return loss
        elif self.loss_type=='sum':
            loss=self.Weighted_Sum(y_hat,y)
            return loss
        else:
            raise ValueError("Attention, veuillez bien spécifier un type de perte.\nSeules les valeurs 'L1' ou 'sum' sont acceptées.")
        

In [None]:
x=(out!=torch.tensor(0)).nonzero()
x_2=torch.index_select(x,1,torch.tensor(1)).reshape(-1)
x_1=(x_2==0).nonzero()

from tqdm import tqdm
w=torch.repeat_interleave(torch.tensor(1),512)
w=w.repeat(len(x_1),1)
sha=torch.arange(len(x_1),dtype=torch.long).unsqueeze(1)
for k in tqdm(range(len(x_1))):
    if k<(len(x_1)-1):
        w[sha[k],x_2[x_1[k]:x_1[k+1]]]=4
    else:
        w[sha[k],x_2[x_1[k]:]]=4
w

In [None]:
x=torch.rand(100,14)
y=torch.rand(100,14)
loss=Weighted_Loss(weight=[torch.tensor(0.5),torch.tensor(1000.)],loss_type='sum',binary=False)
print(loss(x,y))
perte=torch.nn.L1Loss()
print(perte(x,y))
print("On voit que la première est plus élevée que la deuxième, c'est ce que l'on cherche dans notre cas !")

In [14]:
x=torch.rand(100,14)
y=torch.rand(100,14)
loss=torch.nn.MSELoss()
loss(x,y).item()

0.16317161917686462

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable

y = Variable(torch.rand(2,10),requires_grad=True)#Variable(torch.rand(5, 3), requires_grad=True)
t = torch.rand(2,10)#Variable(torch.LongTensor(5).random_(0, 2))
m=Weighted_Loss(weight=[torch.tensor(0.5),torch.tensor(1000.)])
#m = nn.L1Loss()#MultiMarginLoss()
loss = m(y, t)
loss.backward()
print(y.grad)

In [None]:
x=torch.rand(100,14)
print(x)
x=x.to(device)
print(x)

## On prépare l'entraînement

On vérifie que du GPU est disponible

In [7]:
torch.cuda.is_available()

True

In [8]:
device = torch.device("cuda")

On charge nos données d'entraînement

In [None]:
#train_dataset=pickle.load(open('train.pickle','rb'))

On crée notre dataloader, indispensable pour créer une boucle d'entraînement sous torch, on fixe notre taille de batch

In [14]:
summa_parallel.to('cpu'),input_id.to('cpu'),mask.to('cpu'),mask_cls.to('cpu'),output.to('cpu')
del summa_parallel,input_id,mask,mask_cls,output,dataloader_2
torch.cuda.empty_cache()

In [44]:
batch_size=int(1024/64)
print(batch_size)

dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size)
len(dataloader)

16


8513

In [9]:
output_=torch.stack([train_dataset[i][-1] for i in range(len(train_dataset))])
mask_cls_=torch.stack([train_dataset[i][-2] for i in range(len(train_dataset))])
output_2=torch.mul(torch.div(output_-torch.min(output_),torch.max(output_)-torch.min(output_)),mask_cls_)
output_2

K=2000
train_2=TensorDataset(torch.stack([train_dataset[i][0] for i in range(K)]),
                      torch.stack([train_dataset[i][1] for i in range(K)]),
                      torch.stack([train_dataset[i][2] for i in range(K)]),
                      torch.stack([train_dataset[i][3] for i in range(K)]),
                      output_2[:K])

batch_size=int(1024/8/8)
print(batch_size)

dataloader_2 = DataLoader(
            train_2,
            sampler = RandomSampler(train_2),
            batch_size = batch_size)

16


A ne lancer que si on veut ré-initialiser la mémoire sur GPU

In [18]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0) 
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
f/1024/1024/1024


12.718575477600098

On place le modèle sur GPU, on défini notre algorithme d'apprentissage, notre score et notre fonction de perte

In [108]:
training_stats = []
epochs=5

summa=Summarizer_2(device=device,classif='multi_linear')
summa_parallel=nn.DataParallel(summa) # On va distribuer sur plusieurs GPU, 2 ici pour avoir plus de mémoire

#optimizer = AdamW(summa_parallel.parameters(),
             #     lr = 2e-4, # Learning Rate
              #    eps = 1e-3)# Epsilon
optimizer = optim.SGD(summa_parallel.parameters(), lr=0.00001, momentum=0.9)
score=F1_score()

alpha=0.95
#weights=torch.Tensor([(1/(1-alpha))*1/((512-3)/512),(1/alpha)*1/((3)/512)])
#weights=torch.Tensor([(1/(1-alpha))*1/((1)/512),(1/alpha)*1/((512-1)/512)])
weights=torch.Tensor([1,1])
loss_3=Weighted_Loss(weight=weights,loss_type='sum',binary=True)
loss_2=Weighted_Loss(weight=weights,loss_type='L1',binary=True)
loss=nn.MSELoss()#nn.L1Loss()

In [11]:
score=F1_score()

alpha=0.5
weights=torch.Tensor([(1/(1-alpha))*1/((512-20)/512),(1/alpha)*1/((20)/512)])
#weights=torch.Tensor([(1/(1-alpha))*1/((1)/512),(1/alpha)*1/((512-1)/512)])
#weights=torch.Tensor([1,1])
loss_3=Weighted_Loss(weight=weights,loss_type='sum',binary=False)
loss_2=Weighted_Loss(weight=weights,loss_type='L1',binary=False)
loss=nn.MSELoss()#nn.L1Loss()

In [263]:
m = nn.Conv1d(16, 16, kernel_size=3, stride=2)
input = torch.randn(20, 16, 50)
output = m(input)
print(input.shape,'\n',output.shape)
m=nn.MaxPool1d(2,2)
output=m(output)
print(output.shape)
m = nn.Conv1d(16, 16, kernel_size=3, stride=2)
output=m(output)
print(output.shape)
m = nn.Conv1d(16, 16, kernel_size=4, stride=2)
output=m(output)
print(output.shape)

torch.Size([20, 16, 50]) 
 torch.Size([20, 16, 24])
torch.Size([20, 16, 12])
torch.Size([20, 16, 5])
torch.Size([20, 16, 1])


In [10]:

class Net(nn.Module):
    def __init__(self,k1,k2,k3,s1,s2,s3):
        super().__init__()
        self.conv1 = nn.Conv1d(512, 512, kernel_size=k1,stride=s1)
        self.pool = nn.MaxPool1d(k2, s2)
        self.conv2 = nn.Conv1d(512, 512, kernel_size=k3,stride=s3)
        self.dim=int((768-k1)/s1)+1
        self.dim=int((self.dim-(k2-1)-1)/s2+1)
        self.dim=int((self.dim-k3)/s3)+1
        self.fc1 = nn.Linear(self.dim, int(self.dim/2))
        self.fc2 = nn.Linear(int(self.dim/2), int(self.dim/8))
        self.fc3 = nn.Linear(int(self.dim/8), 1)
        self.LReLu=nn.LeakyReLU(negative_slope= 0.01)
        self.softmax=nn.Softmax(dim=-1)

    def forward(self, x):
        x.requires_grad_(True)
        x = self.pool(self.LReLu(self.conv1(x)))
        x =self.LReLu(self.conv2(x))
        #x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.LReLu(self.fc1(x))
        x = self.LReLu(self.fc2(x))
        x = self.fc3(x)
        #x=self.softmax(x)
        return x.flatten(1)


#net = Net(2**8,2**6,2,2,2,2).to(device)
#import torch.optim as optim
#optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.09)


In [14]:
mlc=Multi_Linear_Classifier(camem2.config.hidden_size)
mlc_optimizer=optim.SGD(mlc.parameters(), lr=0.001, momentum=0.09)

slc=Simple_Classifier(camem2.config.hidden_size)
slc_optimizer=optim.SGD(slc.parameters(), lr=0.001, momentum=0.09)

att_c=SMHA_classifier(torch.Size([512,768]),8)
att_c_optimizer=optim.SGD(att_c.parameters(), lr=0.001, momentum=0.09)

att_lin_c=SMHA_Linear_classifier(torch.Size([512,768]),8,768)
att_lin_c_optimizer=optim.SGD(att_lin_c.parameters(), lr=0.001, momentum=0.09)

convnet=Net(2**8,2**6,2,2,2,2)
convnet_optimizer=optim.SGD(convnet.parameters(), lr=0.001, momentum=0.09)

In [None]:
input=torch.rand(16,512,768).to(device)
output=torch.rand(16,512).to(device)

for m,o in zip([mlc,slc,att_c,att_lin_c,convnet],[mlc_optimizer,slc_optimizer,att_c_optimizer,att_lin_c_optimizer,convnet_optimizer]):
    print("\n Modèle :",str(m).split('(')[0],'\n')
    
    model=m.to(device)
    param1=list(model.parameters())[0].clone()

    sortie=model(input)

    for l in [loss,loss_2,loss_3]:
        ouais=l(sortie,output)
        o.zero_grad()
        ouais.backward(retain_graph=True)
        o.step()
        
        param2=list(model.parameters())[0].clone()
        print("For loss,",l,"Did the grad updated the weights ?",bool(1-torch.equal(param1.data,param2.data)))

## Boucle d'entraînement
On passe maintenant à l'entraînement !

In [None]:
def make_output_topk(x,k=3):
    out=torch.zeros(x.shape)
    ind=torch.topk(x,k=k)[1]
    ind1=torch.arange(x.shape[0]).unsqueeze(1)
    out[ind1,ind]=1
    return out

output2=make_output_topk(output)
pred2=make_output_topk(pred[0])

LCE=torch.nn.CrossEntropyLoss()
LCE(pred2,output2)

In [None]:
#Pour enregistrer les informations de l'entraînement
training_stats = []
score_stat=[]
#loss.requires_grad=True
pred_output={}
# Boucle d'entrainement
data=dataloader_2
camem2.to('cpu')
epochs=5
model=convnet
optimizer=convnet_optimizer
for epoch in range(0, epochs):
     
    #print("")
    #print(f'########## Epoch {epoch+1} / {epochs} ##########')
    #print('Training...')
 
 
    # On initialise la loss pour cette epoque
    total_train_loss = 0
    total_train_loss_2 = 0
    total_train_loss_3 = 0
    score_e=0
    pred=[]
    # On met le modele en mode 'training'
    # Dans ce mode certaines couches du modele agissent differement
    model.train()
    model.to(device)
    #net.train()
    # Pour chaque batch
    for step, batch in enumerate(tqdm(data)):
 
        # On fait un print chaque 40 batchs
      # if step % 150 == 0 and not step == 0:
       #     print(f'  Batch {step}  of {len(data)}.')
         
        # On recupere les donnees du batch
        input_id = batch[0]#.to(device)
        mask = batch[1]#.to(device)
        clss = batch[2].float().to(device)
        mask_cls=batch[3]#.to(device)
        output=batch[4].float().to(device)
 
        # On met le gradient a 0
        optimizer.zero_grad()#summa_parallel.zero_grad()        
 
        # On passe la donnee au model et on recupere la loss et le logits (sortie avant fonction d'activation)
        topvec=camem2(input_id,mask)
        topvec=topvec.last_hidden_state.to(device)
        #topvec=topvec.mul(mask_cls.unsqueeze(2)).to(device)
        sortie=model(topvec)
        #(x=input_id,mask=mask,mask_cls=mask_cls)#,clss=clss,output=output)
        
        #On indique qu'on souhaite tracer le gradient de la sortie dans la fonction de perte
        #sortie=Variable(sortie,requires_grad=True)
        pred.append(sortie)
        #On calcule et garde le score pour information
        score_e+=score(sortie,output)
        #print(score_stat[step])
        #output2=make_output_topk(output,k=1).long().to(device)
        loss_train=loss(sortie,output)
        loss_train_2=loss_2(sortie,output)
        loss_train_3=loss_3(sortie,output)

        #print(loss_train)
        # On incremente la loss totale
        # .item() donne la valeur numerique de la loss
        total_train_loss += loss_train.item()
        total_train_loss_2 += loss_train_2.item()
        total_train_loss_3 += loss_train_3.item()

        # Backpropagtion
        loss_train_3.backward()
 
        # On actualise les paramètres grace a l'optimizer
        optimizer.step()
    score_stat=score_e/len(data)
    # On calcule la  loss moyenne sur toute l'epoque
    avg_train_loss = total_train_loss / len(data)   
    avg_train_loss_2 = total_train_loss_2 / len(data)   
    avg_train_loss_3 = total_train_loss_3 / len(data)   
    pred_output[epoch]=pred
    print("")
    print("  Average training loss MLML: {0:.4f}".format(avg_train_loss),
         "  Average training loss L1: {0:.4f}".format(avg_train_loss_2),
         "  Average training loss sum: {0:.4f}".format(avg_train_loss_3),"  Average f1 score: {0:.4f}".format(score_stat))  
     
    # Enregistrement des stats de l'epoque
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss MSE': avg_train_loss,
            'Training Loss L1': avg_train_loss_2,
            'Training Loss sum': avg_train_loss_3,
        }
    )

print("Model saved!")
torch.save(model.state_dict(), "model_essai.pt")


  w=torch.repeat_interleave(torch.tensor(self.weights[0]),y.shape[1])
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  x=(y!=torch.tensor(0)).nonzero()
  w=torch.repeat_interleave(torch.tensor(self.weights[0]),y.shape[1])
  7%|▋         | 9/125 [00:19<04:31,  2.34s/it]

In [325]:
summa=Multi_Linear_Classifier(768)#Summarizer_2(device=device,classif='multi_linear')
#summa_parallel=nn.DataParallel(summa) # On va distribuer sur plusieurs GPU, 2 ici pour avoir plus de mémoire

import torch.optim as optim
optimizer = optim.SGD(summa.parameters(), lr=0.001, momentum=0.09)
score=F1_score()

alpha=0.95
#weights=torch.Tensor([(1/(1-alpha))*1/((512-3)/512),(1/alpha)*1/((3)/512)])
#weights=torch.Tensor([(1/(1-alpha))*1/((1)/512),(1/alpha)*1/((512-1)/512)])
weights=torch.Tensor([1,1])
loss_3=Weighted_Loss(weight=weights,loss_type='sum',binary=True)
loss_2=Weighted_Loss(weight=weights,loss_type='L1',binary=True)
loss=nn.MSELoss()

In [306]:
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(768, 1)
        self.LReLu=nn.LeakyReLU(negative_slope= 0.01)

    def forward(self, x):
        x = self.LReLu(self.fc1(x.requires_grad_(True)))
        return x.flatten(1)


net = Net()#.to(device)
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [330]:
sortie.squeeze(2).shape

torch.Size([16, 512])

In [None]:
model=summa.to(device)
param1=list(model.parameters())[0].clone()

#input=torch.rand(16,512,768).to(device)
#output=torch.rand(16,512).to(device)

sortie=model(topvec).squeeze(2)#topvec.requires_grad_(True))
#sortie=sortie.requires_grad_(True)
ouais=loss(sortie,output)
# Visiblement les loss faîtes mains ne sont pas optimisables pour le moment ?
# Ah bah si pour le ConvNet ???
optimizer.zero_grad()
ouais.backward(retain_graph=True)
optimizer.step()
param2=list(model.parameters())[0].clone()
torch.equal(param1.data,param2.data)

In [None]:
print(optimizer)
optimizer.zero_grad()
print(optimizer)
ouais=loss_3(pred[0],output2)
ouais.backward()
optimizer.step()
print(optimizer)

In [None]:
for par in summa_parallel.parameters():
    print(par.grad)

In [257]:
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F

class TEMP(nn.Module):

    # Whole architecture
    def __init__(self):
        super(TEMP, self).__init__()
        self.input = nn.Parameter(torch.ones(1,requires_grad = True)) # <----wrap it like this


    def forward(self,x):
        wt = self.input
        y = wt*x 
        return y

model = TEMP()
param1=list(model.parameters())[0].clone()

optimizer = optim.Adam(model.parameters(), lr=0.001)
x = torch.randn(100)
y = 5*x
loss_ex = torch.sum((y - model(x)).pow(2))
optimizer.zero_grad()
loss_ex.backward()

optimizer.step()
param2=list(model.parameters())[0].clone()
print(param1==param2)
print(model.input)

tensor([False])
Parameter containing:
tensor([1.0010], requires_grad=True)


In [240]:
input=torch.rand(16,512,768)
input.shape

torch.Size([16, 512, 768])

In [260]:
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(768, 1)
        self.LReLu=nn.LeakyReLU(negative_slope= 0.01)
        #self.weights_model = nn.Parameter(torch.ones(1,requires_grad = True)) # <----wrap it like this


    def forward(self, x):
        x = self.LReLu(self.fc1(x.requires_grad_(True)))
        return x.flatten(1)


net = Net()#.to(device)
import torch.optim as optim
#optimizer = optim.SGD(net.parameters(), lr=0.001)
optimizer = optim.Adam(net.parameters(), lr=0.001)

input=torch.rand(16,512,768)
output=torch.rand(16,512)


param1=list(net.parameters())[0].clone()
sortie=net(input)
#sortie=sortie.requires_grad_(True)
ouais=loss(sortie,output)

optimizer.zero_grad()
ouais.backward(retain_graph=True)
optimizer.step()

param2=list(net.parameters())[0].clone()
torch.equal(param1.data,param2.data)

False

In [None]:
from transformers import CamembertModel,BertModel,RobertaModel,AdamW,BertConfig
bert=BertModel(BertConfig())
top_vec=bert(input_id,attention_mask=mask)
top_vec=top_vec[0]
top_vec

In [14]:
clss=clss.to('cpu')
clss

tensor([[  0.,  36., 100.,  ...,   0.,   0.,   0.],
        [  0.,  50.,  83.,  ...,   0.,   0.,   0.],
        [  0.,  40.,  68.,  ...,   0.,   0.,   0.],
        ...,
        [  0.,  22.,  37.,  ...,   0.,   0.,   0.],
        [  0.,  18.,  35.,  ...,   0.,   0.,   0.],
        [  0.,   0.,   0.,  ...,   0.,   0.,   0.]])

In [18]:
clss[torch.arange(clss.size(0)).unsqueeze(1),:(clss.topk(k=1)[1]+1)]

TypeError: only integer tensors of a single element can be converted to an index

In [310]:
sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), clss]
sents_vec = sents_vec * mask_cls[:, :, None].float()
sent_scores = self.ext_layer(sents_vec, mask_cls).squeeze(-1)

tensor([-0.3635, -0.3541, -0.3638, -0.3740, -0.3593, -0.3593, -0.3632, -0.3635,
        -0.3593, -0.3698, -0.3641, -0.3642, -0.3590, -0.3635, -0.3595, -0.3830,
        -0.3593, -0.3593, -0.3593, -0.3593, -0.3632, -0.3629, -0.3640, -0.3646,
        -0.3593, -0.3645, -0.3597, -0.3690, -0.3593, -0.3641, -0.3664, -0.3592,
        -0.3508, -0.3585, -0.3593, -0.3593, -0.3591, -0.3593, -0.3593, -0.3674,
        -0.3593, -0.3593, -0.3654, -0.3560, -0.3601, -0.3593, -0.3593, -0.3626,
        -0.3637, -0.3603, -0.3654, -0.3593, -0.3593, -0.3650, -0.3597, -0.3593,
        -0.3593, -0.3575, -0.3642, -0.3584, -0.3667, -0.3571, -0.3594, -0.3592,
        -0.3602, -0.3593, -0.3639, -0.3639, -0.3638, -0.3593, -0.3603, -0.3644,
        -0.3618, -0.3644, -0.3622, -0.3594, -0.3576, -0.3603, -0.3593, -0.3653,
        -0.3639, -0.3593, -0.3593, -0.3644, -0.3595, -0.3631, -0.3677, -0.3633,
        -0.3645, -0.3593, -0.3638, -0.3593, -0.3701, -0.3597, -0.3593, -0.3651,
        -0.3706, -0.3610, -0.3635, -0.35

In [35]:
train_loss=[training_stats[i]['Training Loss'] for i in range(epochs)]

In [36]:
score_par_epoch=[torch.mean(torch.as_tensor(score_stat[x:x+57])) for x in range(0,len(score_stat),57)]

In [None]:
import matplotlib.pyplot as plt
fig,ax=plt.subplots(2,figsize=(18,14))
ax[0].plot([i for i in range(epochs)],train_loss)
ax[1].plot([i for i in range(epochs)],score_par_epoch)
ax[1].set(xlabel="époques",ylabel="score loss",title="score par époques")

In [None]:
#CEL=nn.CrossEntropyLoss()
#CEL(pred_output[0][0][0],output[0])
[torch.sum(-output[i]+pred_output[0][i][0]) for i in range(output.shape[0])]

In [None]:
output.shape

In [None]:
pickle.dump(pred_output,open('pred_output.pickle','wb'))

In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)

In [None]:
score_stat

In [None]:
avg_train_loss

In [None]:
validation=pickle.load(open('validation.pickle','rb'))
valid_dataloader = DataLoader(
            validation,
            sampler = RandomSampler(validation),
            batch_size = 8)

In [None]:
#summa=torch.load("model_essai.pt")
device = torch.device("cuda")
summa=Summarizer(device=device)
summa.load_state_dict(torch.load("model_essai.pt"))
#summa

In [None]:
summa.eval()
for step,batch in enumerate(valid_dataloader):
    if step==0:
        input_id_valid = batch[0].to(device)
        mask_valid = batch[1].to(device)
        clss_valid = batch[2].to(device)
        mask_cls_valid=batch[3].to(device)
        output_valid=batch[4].to(device)
        with torch.no_grad():
            pred=summa(x=input_id_valid,mask=mask_valid,clss=clss_valid,mask_cls=mask_cls_valid,output=output_valid)
            print(pred)
    #valid.append(batch)
    else:
        break



In [None]:
f1=F1_score()
f1(pred,output_valid)

In [None]:
import numpy as np
torch.tensor([1/(3/512),1/((512-3)/512)])


In [None]:
CEL=torch.nn.CrossEntropyLoss(weight=torch.tensor([1/(3/512),1/((512-3)/512)]))
CEL()

In [None]:
loss = nn.CrossEntropyLoss()
input = pred[0]#torch.randn(3, 5, requires_grad=True)
target = output_valid[0]#torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output

## Essai

In [158]:
loss = nn.MSELoss()
loss(pred[0],output[0])

  return F.mse_loss(input, target, reduction=self.reduction)


tensor(0.0072, device='cuda:0', grad_fn=<MseLossBackward>)

In [155]:
pred[0].shape,output.shape

(torch.Size([16, 512]), torch.Size([16, 512]))

In [130]:
topvec=camem(input_id,mask)

In [172]:
score=F1_score()
alpha=0.5
weights=torch.Tensor([(1/(1-alpha))*1/((512-20)/512),(1/alpha)*1/((20)/512)])
loss=Weighted_Loss(weight=torch.Tensor([1,1]),loss_type='L1',binary=True)#reduction = 'batchmean')
loss_4=Weighted_Loss(weight=torch.Tensor([1,1]),loss_type='L1',binary=False)#reduction = 'batchmean')
loss_5=Weighted_Loss(weight=torch.Tensor([1,1]),loss_type='sum',binary=False)#reduction = 'batchmean')
loss_2=Weighted_Loss(weight=weights,loss_type='L1',binary=False)
loss_3=Weighted_Loss(weight=weights,loss_type='sum',binary=False)
loss_2_=Weighted_Loss(weight=weights,loss_type='L1',binary=True)
loss_3_=Weighted_Loss(weight=weights,loss_type='sum',binary=True)
last=topvec.last_hidden_state
#SMHA=SMHA_Linear_classifier(torch.Size([512,768]),8,768)
SMHA=SMHA_classifier(torch.Size([512,768]),8)
sortie=SMHA(last).mean(dim=2).to(device)
print("Loss poids égaux, type L1, binary=True :",loss(sortie,output))
print("Loss poids égaux, type L1, binary=False :",loss_4(sortie,output))
print("Loss poids égaux, type sum, binary=False :",loss_5(sortie,output))
print("Loss poids différents, type L1, binary=False :",loss_2(sortie,output))
print("Loss poids différents, type sum, binary=False :",loss_3(sortie,output))
print("Loss poids différents, type L1, binary=True :",loss_2_(sortie,output))
print("Loss poids différents, type sum, binary=True :",loss_3_(sortie,output))
#,loss_2(sortie,output),loss_3(sortie,output),loss_4(sortie,output)

Loss poids égaux, type L1, binary=True : tensor(0.3859, device='cuda:0', requires_grad=True)
Loss poids égaux, type L1, binary=False : tensor(0.3859, device='cuda:0', requires_grad=True)
Loss poids égaux, type sum, binary=False : tensor(197.5638, device='cuda:0', requires_grad=True)
Loss poids différents, type L1, binary=False : tensor(0.2720, device='cuda:0', requires_grad=True)
Loss poids différents, type sum, binary=False : tensor(451.0387, device='cuda:0', requires_grad=True)
Loss poids différents, type L1, binary=True : tensor(0.3513, device='cuda:0', requires_grad=True)
Loss poids différents, type sum, binary=True : tensor(426.0840, device='cuda:0', requires_grad=True)


  w=torch.repeat_interleave(torch.tensor(self.weights[0]),y.shape[1])
  w=torch.repeat_interleave(torch.tensor(self.weights[0]),y.shape[1])


In [174]:
weights

tensor([ 2.0813, 51.2000])