In [None]:
import torch
from torch import nn
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm
from transformers import GPT2LMHeadModel
from transformers import GPT2Config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("../data/tokenizers_character_level/moses_ClearSMILES_corrected")
vocab = tokenizer.get_vocab()

In [4]:
vocab

{'(': 8,
 '2': 9,
 ')': 0,
 '<unk>': 11,
 'S': 7,
 'C': 5,
 'Cl': 15,
 '1': 16,
 'O': 6,
 '=': 13,
 '<pad>': 12,
 'F': 14,
 '<eos>': 4,
 '<bos>': 3,
 'Br': 10,
 '#': 2,
 'N': 1}

In [3]:
abvoc = {}
for k in vocab.keys():
    abvoc[vocab[k]] = k
abvoc

{11: '<unk>',
 4: '<eos>',
 5: 'C',
 12: '<pad>',
 13: '=',
 8: '(',
 7: 'S',
 3: '<bos>',
 16: '1',
 9: '2',
 1: 'N',
 6: 'O',
 15: 'Cl',
 14: 'F',
 2: '#',
 10: 'Br',
 0: ')'}

In [None]:
moses_dataset = pd.read_csv("../data/training_data/moses_ClearSMILES.csv")
#D = len(moses_dataset[moses_dataset["SPLIT"] =="train"]["SMILES"])


In [None]:
idf =[0]*len(tokenizer) 
D = len(moses_dataset["SMILES"])
for k in vocab.keys():
    for l in moses_dataset["SMILES"]:
        if k in l:
            idf[vocab[k]] +=1
idf = [np.log(D/k) if k!=0 else 0 for k in idf]


In [30]:
idf


[0.0031287468135231685,
 0.009568700773703875,
 2.5602130118820057,
 0,
 0,
 0.0,
 0.0357366804309882,
 1.1437764470791782,
 0.0031287468135231685,
 1.0860727903801972,
 3.413334466828707,
 0,
 0,
 0.00017998683182921782,
 1.666313076800451,
 2.249299868646848,
 0.00038393511170831525]

In [31]:
vocab

{'<unk>': 11,
 '<eos>': 4,
 'C': 5,
 '<pad>': 12,
 '=': 13,
 '(': 8,
 'S': 7,
 '<bos>': 3,
 '1': 16,
 '2': 9,
 'N': 1,
 'O': 6,
 'Cl': 15,
 'F': 14,
 '#': 2,
 'Br': 10,
 ')': 0}

In [None]:
model = GPT2LMHeadModel.from_pretrained("../models/trained_moses_ClearSMILES_corrected_character_level/2/final_model")
model.resize_token_embeddings(len(tokenizer))
loss_list = []
weights = [ idf[k] for k in range(len(idf))]
criterion_weighted = nn.CrossEntropyLoss(weight=torch.tensor(weights))
for k in tqdm(moses_dataset["SMILES"][1000:2000]):
    x = tokenizer(k,return_tensors='pt')
    inputs = x["input_ids"].squeeze()
    model = model.double()

    y = model(x["input_ids"].squeeze()).logits

    loss_weighted = criterion_weighted(y,inputs)
    loss_list.append(loss_weighted)
m = 0
for k in loss_list:
    m+=k/len(loss_list)
m

100%|██████████| 1000/1000 [00:14<00:00, 69.41it/s]


tensor(11.3874, dtype=torch.float64, grad_fn=<AddBackward0>)

In [None]:
model = GPT2LMHeadModel.from_pretrained("../models/trained_moses_ClearSMILES_corrected_character_level/2/final_model")
model.resize_token_embeddings(len(tokenizer))
loss_list = []

criterion_weighted = nn.CrossEntropyLoss()

In [7]:
k = moses_dataset["SMILES"][0]
x = tokenizer(k,return_tensors='pt')
inputs = x["input_ids"].squeeze()
model = model.double()

y = model(x["input_ids"].squeeze()).logits

loss_weighted = criterion_weighted(y,inputs)

In [37]:
x

{'input_ids': tensor([[ 3, 14,  5, 16, 13,  5,  5, 13,  5,  8, 14,  0,  5, 13,  5, 16,  1,  7,
          8, 13,  6,  0,  8, 13,  6,  0,  5, 16, 13,  5,  5, 13,  5,  8, 14,  0,
          5,  8, 14,  0, 13,  5, 16, 14,  4]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [36]:
inputs.shape

torch.Size([45])

In [19]:
inputs[l].item()

4

In [165]:
min_loss = criterion_weighted(true_y.double(), y.double())
min_loss


tensor(1.9295, dtype=torch.float64)

In [None]:
k = moses_dataset["SMILES"][0]
x = tokenizer(k,return_tensors='pt')
inputs = x["input_ids"].squeeze()
y = model(x["input_ids"].squeeze()).logits

In [174]:
k = moses_dataset["SMILES"][0]
x = tokenizer(k,return_tensors='pt')
inputs = x["input_ids"].squeeze()
y = model(x["input_ids"].squeeze()).logits
weights = [ idf[k] for k in range(len(idf))]
nn.functional.cross_entropy(y.double(),target=inputs,weight=torch.tensor(weights),reduction="mean")

tensor(13.8270, dtype=torch.float64, grad_fn=<NllLossBackward0>)

In [170]:
nn.functional.cross_entropy(true_y.double(),target=y.double(),reduction="mean")

tensor(1.9295, dtype=torch.float64)

In [104]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.randint(5, (3,), dtype=torch.int64)
loss = nn.functional.cross_entropy(input, target)
print(loss)
loss.backward()

tensor(1.9678, grad_fn=<NllLossBackward0>)


In [105]:
loss.float()

tensor(1.9678, grad_fn=<NllLossBackward0>)

In [9]:
config =   GPT2Config(
            vocab_size=len(tokenizer),  # 10,000 tokens( pour BEP )
            n_positions=tokenizer.model_max_length , # ça ne génèrera que des smiles de la même taille
            n_ctx=tokenizer.model_max_length,  # ça ne génèrera que des smiles de la même taille
            n_embd=256,
            n_layer=8,
            n_head=8,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1
        )

model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))

Embedding(17, 256)

In [182]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

# ==================== OPTION 1: Classe Loss personnalisée ====================
class WeightedLanguageModelLoss(nn.Module):
    def __init__(self, vocab_weights=None, ignore_index=-100, label_smoothing=0.0):
        """
        Loss function pour un language model avec poids personnalisés
        
        Args:
            vocab_weights: Tensor de poids pour chaque token du vocabulaire [vocab_size]
            ignore_index: Index à ignorer dans le calcul (padding tokens)
            label_smoothing: Lissage des labels (0.0 = pas de lissage)
        """
        super().__init__()
        self.vocab_weights = vocab_weights
        self.ignore_index = ignore_index
        self.label_smoothing = label_smoothing
    
    def forward(self, logits, labels):
        """
        Args:
            logits: [batch_size, seq_len, vocab_size] ou [batch_size*seq_len, vocab_size]
            labels: [batch_size, seq_len] ou [batch_size*seq_len]
        """
        # Si les logits ont 3 dimensions, on fait le shift pour LM
        if logits.dim() == 3:
            # Shift pour prédiction du token suivant
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            
            # Reshape pour cross_entropy
            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
            shift_labels = shift_labels.view(-1)
        else:
            shift_logits = logits
            shift_labels = labels
        
        # Calcul de la cross entropy
        loss = F.cross_entropy(
            shift_logits,
            shift_labels,
            weight=self.vocab_weights,
            ignore_index=self.ignore_index,
            label_smoothing=self.label_smoothing,
            reduction='mean'
        )
        
        return loss

# ==================== OPTION 2: Fonction Loss simple ====================
def compute_language_model_loss(model_output, input_ids, vocab_weights=None):
    """
    Fonction simple pour calculer la loss d'un language model
    """
    logits = model_output.logits  # [batch_size, seq_len, vocab_size]
    
    # Shift pour prédiction du token suivant
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous()
    
    # Reshape
    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
    shift_labels = shift_labels.view(-1)
    
    # Cross entropy
    loss = F.cross_entropy(
        shift_logits,
        shift_labels,
        weight=vocab_weights,
        reduction='mean'
    )
    
    return loss

# ==================== EXEMPLE D'USAGE DANS UNE BOUCLE D'ENTRAÎNEMENT ====================

# Préparation des poids IDF
vocab_size = len(tokenizer.vocab)
idf_weights = torch.tensor(weights, dtype=torch.float32)

# Option 1: Avec classe Loss
criterion = WeightedLanguageModelLoss(vocab_weights=idf_weights)








In [225]:
true_y = []
for k in range(len(inputs)):
    mmmm = [1]*len(tokenizer)
    mmmm[inputs[k]] = 5
    true_y.append(mmmm)
true_y = torch.tensor(true_y)

y = []
for k in range(len(inputs)):
    mmmm = [0]*len(tokenizer)
    mmmm[inputs[k]] = 9000
    y.append(mmmm)
y = torch.tensor(y)

In [228]:
criterion(y.float(), inputs)

tensor(0.)

In [226]:
criterion(true_y.float(), inputs)

tensor(0.2570)