In [1]:
import torch
from torch import nn
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm
from transformers import GPT2LMHeadModel
from transformers import GPT2Config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("../data/tokenizers_character_level/moses_ClearSMILES_corrected")
vocab = tokenizer.get_vocab()

In [3]:
vocab

{'o': 23,
 '3': 19,
 'F': 5,
 'C': 22,
 'n': 12,
 '6': 10,
 '=': 18,
 's': 6,
 ')': 11,
 '2': 1,
 'Br': 14,
 '4': 0,
 'c': 24,
 '<eos>': 16,
 'Cl': 17,
 '-': 3,
 '#': 7,
 'O': 26,
 '1': 13,
 '5': 8,
 '<bos>': 20,
 '(': 9,
 '<unk>': 21,
 'N': 2,
 'S': 4,
 '<pad>': 25,
 '[nH]': 15}

In [3]:
abvoc = {}
for k in vocab.keys():
    abvoc[vocab[k]] = k
abvoc

{15: 'Cl',
 6: 'O',
 4: '<eos>',
 2: '#',
 16: '1',
 0: ')',
 3: '<bos>',
 12: '<pad>',
 13: '=',
 9: '2',
 10: 'Br',
 5: 'C',
 14: 'F',
 1: 'N',
 8: '(',
 7: 'S',
 11: '<unk>'}

In [4]:
moses_dataset = pd.read_csv("../data/training_data/moses_ClearSMILES.csv")
#D = len(moses_dataset[moses_dataset["SPLIT"] =="train"]["SMILES"])
D = len(moses_dataset["SMILES"])

In [5]:
idf =[0]*len(tokenizer) 
tf_dict = {}
for k in vocab.keys():
    for l in moses_dataset["SMILES"]:
        if k in l:
            idf[vocab[k]] +=1
        
idf = [np.log(D/k) if k!=0 else 0 for k in idf]


In [6]:
model = GPT2LMHeadModel.from_pretrained("../models/trained_moses_ClearSMILES_corrected_character_level/2/final_model")
model.resize_token_embeddings(len(tokenizer))
loss_list = []
weights = [ idf[k] for k in range(len(idf))]
criterion_weighted = nn.CrossEntropyLoss(weight=torch.tensor(weights))
for k in tqdm(moses_dataset["SMILES"][1000:2000]):
    x = tokenizer(k,return_tensors='pt')
    inputs = x["input_ids"].squeeze()
    model = model.double()

    y = model(x["input_ids"].squeeze()).logits

    loss_weighted = criterion_weighted(y,x["input_ids"].squeeze())
    loss_list.append(loss_weighted)
m = 0
for k in loss_list:
    m+=k/len(loss_list)
m

100%|██████████| 1000/1000 [00:14<00:00, 70.10it/s]


tensor(11.3874, dtype=torch.float64, grad_fn=<AddBackward0>)

In [8]:
for l in tqdm(moses_dataset["SMILES"]):
    tff = [0] * len(tokenizer)
    splitted = tokenizer.tokenize(l)
    for k in splitted:
        tff[vocab[k]]+=1
    tf_dict[l] = tff

  2%|▏         | 32495/1936962 [00:02<02:33, 12380.69it/s]


KeyboardInterrupt: 

In [9]:
config =   GPT2Config(
            vocab_size=len(tokenizer),  # 10,000 tokens( pour BEP )
            n_positions=tokenizer.model_max_length , # ça ne génèrera que des smiles de la même taille
            n_ctx=tokenizer.model_max_length,  # ça ne génèrera que des smiles de la même taille
            n_embd=256,
            n_layer=8,
            n_head=8,
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1
        )

model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))

Embedding(27, 256)

100%|██████████| 1000/1000 [00:16<00:00, 60.90it/s]


In [None]:
from transformers import AutoModel



Embedding(27, 256)

tensor(2.3600, dtype=torch.float64, grad_fn=<AddBackward0>)

In [17]:
y.shape

torch.Size([38, 27])

CrossEntropyLoss()

In [96]:
y

tensor([[-8.5122e-02, -3.4772e-01,  4.5261e-01, -8.1870e-02, -1.3203e-01,
          1.2907e-01,  3.2802e-01, -9.0391e-02, -4.1565e-02, -8.8349e-02,
          5.6873e-01, -8.8659e-02, -1.2906e-01,  2.7534e-01, -7.7410e-02,
         -2.0390e-01, -2.3121e-02,  1.5054e-01,  5.2306e-01, -7.0487e-01,
          1.4264e+00,  5.3454e-01,  3.1366e-01,  4.2304e-03,  3.2647e-02,
         -4.9451e-01,  1.5823e-01],
        [-4.6750e-02, -1.7585e-02,  6.6924e-01, -2.0200e-01,  2.9417e-01,
          1.7454e-01,  2.9806e-01,  5.5484e-01, -2.6314e-02, -3.7302e-02,
          3.2599e-01, -3.2211e-01, -3.1896e-01,  7.0991e-01,  1.3410e-01,
         -1.7377e-01, -7.0795e-02, -2.6614e-02,  3.2698e-01, -4.6389e-01,
          6.8026e-01,  1.1262e-01,  1.5179e+00, -1.5062e-02,  4.1233e-02,
          4.0037e-04, -2.3126e-01],
        [-1.2703e-01, -4.3914e-01,  6.5779e-01,  1.6012e-01, -2.4623e-01,
          1.0343e-01,  1.5174e-02,  4.5682e-01, -9.1406e-02, -1.9279e-01,
          4.5841e-01, -1.7988e-01, -7.52

In [None]:




"""
criterion_weighted_manual = nn.CrossEntropyLoss(weight=weights, reduction='none')
loss_weighted_manual = criterion_weighted_manual(x, target)
loss_weighted_manual = loss_weighted_manual.sum() / weights[target].sum()

print(loss_weighted == loss_weighted_manual)"""

"\ncriterion_weighted_manual = nn.CrossEntropyLoss(weight=weights, reduction='none')\nloss_weighted_manual = criterion_weighted_manual(x, target)\nloss_weighted_manual = loss_weighted_manual.sum() / weights[target].sum()\n\nprint(loss_weighted == loss_weighted_manual)"

tensor(2.3480, dtype=torch.float64, grad_fn=<NllLossBackward0>)

In [102]:
x

{'input_ids': tensor([[20, 22, 22, 22,  4,  9, 18, 26, 11, 24, 13, 24, 24, 24,  1, 15, 24,  9,
         18,  2, 22,  9, 18, 26, 11, 26, 22, 11, 15, 24,  1, 24, 13, 16]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [7]:
target

tensor([2, 0, 2, 1, 3, 0, 1, 3, 1, 1])

In [19]:
M= torch.randn((int(1.6*10**6),16))
xx = torch.randn(16)

In [27]:
M.shape

torch.Size([1600000, 16])

In [28]:
torch.matmul(M,xx).shape

torch.Size([1600000])

In [8]:
criterion_weighted

CrossEntropyLoss()

In [9]:
loss_weighted 

tensor(1.7757)

In [10]:
loss_weighted_manual

tensor(1.7757)

In [11]:
loss_weighted_manual

tensor(1.7757)

In [12]:
criterion_weighted_manual(x, target)

tensor([8.4030, 1.7097, 3.8246, 3.3353, 9.3614, 1.9125, 2.3251, 7.5274, 1.8636,
        2.3529])