In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk import sent_tokenize

In [2]:
# Read the validation and training splits from CSV files under ../data.
validation_df = pd.read_csv(filepath_or_buffer='../data/validation.csv')
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [3]:
def _split_sentences_fr(text):
    """Split one text into sentences with NLTK's `sent_tokenize` in French mode."""
    return sent_tokenize(text, language='french')


# Add a `text_sents` column to both splits, derived from their `text` column.
for _frame in (train_df, validation_df):
    _frame['text_sents'] = _frame['text'].apply(_split_sentences_fr)

In [4]:
# Use the GPU when PyTorch reports one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
from sentence_transformers import SentenceTransformer
# Sentence encoder used to turn each sentence of a text into a vector.
ENCODER_CHECKPOINT = "dangvantuan/sentence-camembert-base"
encoder_model = SentenceTransformer(ENCODER_CHECKPOINT)
encoder_model = encoder_model.to(device)

2024-03-11 21:18:10.244814: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-11 21:18:10.244834: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-11 21:18:10.245465: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-11 21:18:10.249526: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# Embedding of the literal string "<pad>"; `pad` uses it as the filler row.
_pad_vector = encoder_model.encode(["<pad>"])
PAD = torch.tensor(_pad_vector, device=device)
def pad(embedding, max_sents=10):
    """Return exactly ``max_sents`` embedding rows, truncating or PAD-filling.

    The result is always a freshly allocated tensor on the same device and
    with the same dtype as the module-level ``PAD`` row, so it never aliases
    the caller's ``embedding`` and both the "pad" and "truncate" paths behave
    the same way.

    Args:
        embedding: 2-D tensor of shape ``(n_sentences, dim)``. An empty tensor
            (no sentences at all) is accepted and yields an all-PAD result.
        max_sents: number of rows in the returned tensor.

    Returns:
        Tensor of shape ``(max_sents, dim)``: the first
        ``min(n_sentences, max_sents)`` rows come from ``embedding``, any
        remaining rows are copies of ``PAD``.

    Raises:
        ValueError: if ``embedding`` is non-empty and not 2-D.
    """
    padded = PAD.repeat(max_sents, 1)

    # A text with no sentences produces an empty embedding; return pure
    # padding instead of failing while unpacking the shape.
    if embedding.numel() == 0:
        return padded

    if embedding.dim() != 2:
        raise ValueError(
            f"expected a 2-D (n_sentences, dim) embedding, got shape {tuple(embedding.shape)}"
        )

    n = min(embedding.shape[0], max_sents)
    # Copy (rather than return a view) so the output lives with PAD.
    padded[:n] = embedding[:n].to(device=padded.device, dtype=padded.dtype)
    return padded

In [7]:
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import AdamW
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [8]:
# Load the pretrained GPT-2 checkpoint: language-model head and its tokenizer.
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-text token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [9]:
# Quick look at the token ids the tokenizer produces for a short string.
sample_token_ids = tokenizer.encode("hey there")
sample_token_ids

[20342, 612]

In [17]:

class CustomDataset(Dataset):
    """Pairs the padded sentence embeddings of a row with its tokenized target.

    Each item is ``(x, y)`` where ``x`` is the flattened output of ``pad`` for
    the encoded ``features`` column and ``y`` is the flattened token-id tensor
    of the ``target`` column. Both tensors are moved to ``device``.

    Args:
        dataframe: frame with one example per row.
        features: column name passed to ``encoder_model.encode``.
        target: column name passed to ``tokenizer.encode``.
        max_length: token positions each target is truncated/padded to
            (default 128, the previously hard-coded value).
        max_sents: number of sentence rows kept by ``pad`` (default 10,
            matching ``pad``'s own default).
    """

    def __init__(self, dataframe, features, target, max_length=128, max_sents=10):
        self.data = dataframe
        self.features = features
        self.target = target
        self.max_length = max_length
        self.max_sents = max_sents

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode and return the ``(x, y)`` pair for position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Look the row up once; both the features and the target come from it.
        row = self.data.iloc[idx]

        x = torch.tensor(encoder_model.encode(row[self.features]))
        x = pad(x, max_sents=self.max_sents).to(device).view(-1)

        y = tokenizer.encode(
            row[self.target],
            add_special_tokens=True,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        ).to(device).view(-1)
        return x, y
    
# Both splits read sentences from `text_sents` and targets from `titles`.
_dataset_columns = dict(features='text_sents', target='titles')
train_dataset = CustomDataset(train_df, **_dataset_columns)
validate_dataset = CustomDataset(validation_df, **_dataset_columns)

# Training uses shuffled batches of four; validation keeps the frame order
# and relies on the DataLoader's default batch size.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
validate_data_loader = DataLoader(dataset=validate_dataset, shuffle=False)

In [11]:
# Pull a single batch to confirm tensor devices and shapes before training.
# The names avoid `input`, which would shadow the Python builtin for every
# later cell in this kernel session.
batch_inputs, batch_targets = next(iter(train_data_loader))
print(batch_inputs.device, batch_targets.device)
print(batch_inputs.shape, batch_targets.shape)

cuda:0 cuda:0
torch.Size([4, 7680]) torch.Size([4, 128])


In [12]:
class CustomGPTModel(torch.nn.Module):
    """Wrapper that feeds a flat feature vector to a GPT model as embeddings.

    A single linear layer expands each input vector into ``out_dim`` rows of
    width ``gpt_model.config.n_embd``; those rows are passed to the wrapped
    model through its ``inputs_embeds`` argument.
    """

    def __init__(self, gpt_model, input_dim, out_dim):
        """Build the projection layer and keep a reference to ``gpt_model``.

        Args:
            gpt_model: model exposing ``config.n_embd`` and callable with
                ``inputs_embeds`` and ``labels`` keyword arguments.
            input_dim: width of the flat input vector.
            out_dim: number of embedding rows produced per input vector.
        """
        super().__init__()
        hidden_size = gpt_model.config.n_embd
        self.out_dim = out_dim
        self.input_layer = nn.Linear(
            in_features=input_dim,
            out_features=hidden_size * out_dim,
        )
        self.gpt_model = gpt_model

    def forward(self, tensor_input, labels=None):
        """Project ``tensor_input`` and return the wrapped model's output.

        Args:
            tensor_input: input accepted by the linear projection layer.
            labels: forwarded unchanged to the wrapped model.
        """
        hidden_size = self.gpt_model.config.n_embd
        projected = self.input_layer(tensor_input)
        # Reshape the flat projection into (batch, out_dim, hidden_size).
        inputs_embeds = projected.view(-1, self.out_dim, hidden_size)
        return self.gpt_model(inputs_embeds=inputs_embeds, labels=labels)


In [14]:
# Flattened input width: 10 sentence rows of 768 values each (matches the
# 7680-wide batches produced by the data loader).
input_dim = 10 * 768
# 128 output positions, one per token position of the tokenized targets.
meta_model = CustomGPTModel(model, input_dim, 128).to(device)
# `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
# weight_decay=0.0 and eps=1e-6 reproduce the defaults of the old class so
# the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(
    meta_model.parameters(), lr=5e-5, weight_decay=0.0, eps=1e-6
)



In [15]:
num_epochs = 1
for epoch in range(num_epochs):

    # Put the whole wrapper (projection layer + GPT-2) in training mode,
    # not only the inner GPT-2 module.
    meta_model.train()
    for tensor_inputs, labels in train_data_loader:
        # Clear gradients *before* the forward/backward pass so that an
        # interrupted previous run cannot leak stale gradients into this step.
        optimizer.zero_grad()
        outputs = meta_model(tensor_inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss.item()}")

Loss: 9.258627891540527
Loss: 11.886560440063477
Loss: 4.670105934143066
Loss: 7.480152606964111
Loss: 4.1384100914001465
Loss: 5.203574180603027
Loss: 5.157444000244141
Loss: 5.10637903213501
Loss: 4.794711589813232
Loss: 5.479889392852783
Loss: 3.6968765258789062
Loss: 4.523988246917725
Loss: 5.093047142028809
Loss: 4.776523590087891
Loss: 4.867101669311523
Loss: 4.45011043548584
Loss: 4.2588982582092285
Loss: 4.104259490966797
Loss: 4.405608654022217
Loss: 3.807468891143799
Loss: 6.391457557678223
Loss: 4.0104217529296875
Loss: 4.139003276824951
Loss: 5.075193405151367
Loss: 4.883889198303223
Loss: 4.921179294586182
Loss: 5.529850006103516
Loss: 4.821939468383789
Loss: 4.362810134887695
Loss: 3.916121244430542
Loss: 5.106486797332764
Loss: 3.1093239784240723
Loss: 4.352606296539307
Loss: 3.880260705947876
Loss: 3.8918681144714355
Loss: 4.9801554679870605
Loss: 2.9027678966522217
Loss: 3.7312123775482178
Loss: 3.1446173191070557
Loss: 3.320918560028076
Loss: 4.670928478240967
Loss: 4

KeyboardInterrupt: 

In [26]:
num_epochs = 1
# Switch the whole wrapper to evaluation mode (disables dropout in GPT-2).
meta_model.eval()
outs = []
# No gradients are needed for decoding; skipping autograd saves memory.
with torch.no_grad():
    for tensor_inputs, labels in validate_data_loader:
        outputs = meta_model(tensor_inputs, labels=labels)
        # Greedy choice: most likely token at every output position.
        toks = torch.argmax(outputs.logits, dim=-1)
        # Keep every sequence of the batch, not only the first one, so the
        # loop stays correct if the loader's batch size is ever raised.
        for decoded in tokenizer.batch_decode(toks):
            outs.append(decoded.replace("<|endoftext|>", ""))


In [27]:
# Show only a handful of decoded predictions; dumping the full list bloats
# the notebook and hides the narrative.
outs[:10]

["' de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de",
 "' de de de deéééééééééééééééééé deééé deééé de de de de",
 "' de de de de'''''''''''''''''''''' de'''' de de de de' de de'",
 "' de de de deéééééééé de de de de de de de de de de de de de de de de de de de de",
 "'é de de de'éé de de'éééééé'ééé deé' de'''é''' de' de de' de",
 "' de de de de''' de de'''''''''' de' de' de de'''''' de' de''",
 "'ist de de deéé deéé'''' de de de'' de'' de' de de de de de de de de de de' de' de de de' de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de",
 "' de de de de'''''''''''''''''''''' de''' de de''''' de",
 "'é deéééééééééééééééééééééééééééééééééé",
 "' de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de",
 "' de de de de'é'éé''''''é''é'éé'é de de'é' de de de' de'''",
 "' de de de de de de de de de de de d