In [1]:
import torch
import torch.nn as nn
import nltk
from nltk import sent_tokenize
from titlegen.score import avg_score
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import DataLoader
from transformers import BatchEncoding
from transformers import CamembertModel, CamembertTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /users/eleves-b/2021/guilherme.vieira-
[nltk_data]     manhaes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
validation_df = pd.read_csv('../data/validation.csv')
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test_text.csv')

In [4]:
train_df['text_sents'] = train_df['text'].apply(lambda x: sent_tokenize(x, language='french'))
validation_df['text_sents'] = validation_df['text'].apply(lambda x: sent_tokenize(x, language='french'))
test_df['text_sents'] = test_df['text'].apply(lambda x: sent_tokenize(x, language='french'))

In [5]:
train_df['titles_sents'] = train_df['titles'].apply(lambda x: sent_tokenize(x, language='french'))
validation_df['titles_sents'] = validation_df['titles'].apply(lambda x: sent_tokenize(x, language='french'))

In [6]:
validation_df.head()

Unnamed: 0,text,titles,text_sents,titles_sents
0,"Sur les réseaux sociaux, les images sont impre...","Le bateau de croisière, long de 275 m, a percu...","[Sur les réseaux sociaux, les images sont impr...","[Le bateau de croisière, long de 275 m, a perc..."
1,La vidéo est devenue virale. Elle montre un po...,Le parquet de Paris a annoncé vendredi avoir o...,"[La vidéo est devenue virale., Elle montre un ...",[Le parquet de Paris a annoncé vendredi avoir ...
2,"Depuis la présidentielle, il est parfois un pe...","À Trappes (Yvelines), c'est désormais la star....","[Depuis la présidentielle, il est parfois un p...","[À Trappes (Yvelines), c'est désormais la star..."
3,"Routes endommagées, trains toujours perturbés,...",Un homme de 44 ans est porté disparu depuis sa...,"[Routes endommagées, trains toujours perturbés...",[Un homme de 44 ans est porté disparu depuis s...
4,Une enquête menée par le journal l'Obs.La nuit...,Son nom n'avait jusque-là jamais été cité dans...,[Une enquête menée par le journal l'Obs.La nui...,[Son nom n'avait jusque-là jamais été cité dan...


In [7]:
model_id="camembert/camembert-large"

In [8]:
class CamembertModelAdapted(CamembertModel):
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        **_,
    ):
        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask
            )

In [9]:
tokenizer = CamembertTokenizer.from_pretrained(model_id)
camembert = CamembertModelAdapted.from_pretrained(model_id).to(device)

In [10]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value", "key", "dense"],
    lora_dropout=0.01,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

camembert = get_peft_model(camembert, config)
camembert.print_trainable_parameters()

trainable params: 3,555,328 || all params: 340,216,832 || trainable%: 1.0450182547111602


In [11]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, encoder, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        input_sentence = self.dataframe.iloc[idx]['text']
        output_sentence = self.dataframe.iloc[idx]['titles']

        input_tokens = self.tokenizer(
            input_sentence, 
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(device)
        input_tokens = BatchEncoding(
            {key: val.squeeze(0) for key, val in input_tokens.items()})
        
        output_tokens = self.tokenizer(
            output_sentence, 
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            output_encoding = self.encoder.base_model.model.embeddings(output_tokens.input_ids)
            output_encoding = output_encoding.mean(axis=1).view(-1)
            
        return input_tokens, output_encoding

class EvalDataset(Dataset):
    def __init__(self, dataframe, tokenizer, encoder, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.encoder = encoder
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        input_sentences = self.dataframe.iloc[idx]['text_sents']
        input_sentence = self.dataframe.iloc[idx]['text']
        output_sentence = self.dataframe.iloc[idx]['titles']

        input_tokens = self.tokenizer(
            input_sentences, 
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            input_encodings=self.encoder.base_model.model.embeddings(input_tokens.input_ids)
            input_encodings=input_encodings.mean(axis=1)

        input_tokens_virgin = self.tokenizer(
            input_sentence, 
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        ).to(device)


        return input_sentences, output_sentence, input_encodings, input_tokens_virgin

In [12]:
batch_size = 8
train_df_sample = train_df.sample(5000).reset_index()
val_df_sample = validation_df.sample(500).reset_index()

train_dataloader = DataLoader(
    TextDataset(train_df_sample, tokenizer, camembert), 
    batch_size=batch_size, shuffle=True)

val_dataloader = DataLoader(
    TextDataset(val_df_sample, tokenizer, camembert), 
    batch_size=batch_size, shuffle=True)

validation_dataset = EvalDataset(val_df_sample, tokenizer, camembert)


In [13]:
for input_toks, outs in train_dataloader:
    print(input_toks.input_ids.shape,outs.shape)
    break
for input_toks, outs in val_dataloader:
    print(input_toks.input_ids.shape,outs.shape)
    break

for in_sents, out_sent, in_encoding, in_tokens in validation_dataset:
    print(len(in_sents), len(out_sent), in_encoding.shape, in_tokens.input_ids.shape)
    break


torch.Size([8, 512]) torch.Size([8, 1024])
torch.Size([8, 512]) torch.Size([8, 1024])
16 213 torch.Size([16, 1024]) torch.Size([1, 512])


In [14]:
class SentenceEmbedder(nn.Module):
    def __init__(self, inner_model, embedding_dim=1024):
        super(SentenceEmbedder, self).__init__()
        self.inner_model = inner_model
        self.pooling = nn.AdaptiveAvgPool1d(1)

        self.fc1 = nn.Linear(embedding_dim, embedding_dim)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, input_ids, attention_mask):
        outputs = self.inner_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        pooled_output = self.pooling(last_hidden_state.transpose(1, 2)).squeeze(-1)
        x = self.fc1(pooled_output)
        x = self.gelu(x)
        x = self.fc2(x)
        return x

In [15]:
model = SentenceEmbedder(camembert)
model.to(device) 

SentenceEmbedder(
  (inner_model): PeftModelForSeq2SeqLM(
    (base_model): LoraModel(
      (model): CamembertModelAdapted(
        (embeddings): CamembertEmbeddings(
          (word_embeddings): Embedding(32005, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): CamembertEncoder(
          (layer): ModuleList(
            (0-23): 24 x CamembertLayer(
              (attention): CamembertAttention(
                (self): CamembertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.01, inplace=False)
                    )
                    (lora_A): Module

In [16]:
def count_trainable_parameters(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [17]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.MSELoss()

def validate():
    sents = []
    titles = []
    l = 0
    model.eval()
    with torch.no_grad():
        for in_sents, out_sent, in_encoding, in_tokens in validation_dataset:
            out = model(**in_tokens).view(-1)
            inner =  in_encoding @ out
            best=in_sents[inner.argmax().item()]
            sents.append(best)
            titles.append(out_sent)

        for inputs, outs in val_dataloader:
            outputs = model(**inputs)
            loss = loss_fn(outputs, outs)
            l += loss.item()
    
    model.train()
    return avg_score(sents, titles), l/len(val_dataloader)

def train():
    num_epochs = 5
    log_steps = 100
    losses = []
    model.train()

    for epoch in range(num_epochs):
        epoch_loss = 0
        for step, (inputs, outs) in enumerate(train_dataloader):
            outputs = model(**inputs)
            loss = loss_fn(outputs, outs)
            loss.backward()
            epoch_loss += loss.item()
            optimizer.step()
            optimizer.zero_grad()
            if step % log_steps == 0:
                print(f"epoch {epoch+1}/{num_epochs}, step {step+1}/{len(train_dataloader)}, loss={loss.item():.4f}")
        
        score, val_loss = validate()
        losses.append(epoch_loss)
        print(f"""
        epoch {epoch+1}/{num_epochs}, 
            valloss={val_loss:.4f}, 
            trainloss={epoch_loss/len(train_dataloader):.4f}, 
            score={score:.4f}""")

In [18]:
validate()

(0.08938369009210473, 0.017157526923313973)

In [19]:
train()

epoch 1/5, step 1/625, loss=0.0189
epoch 1/5, step 101/625, loss=0.0081
epoch 1/5, step 201/625, loss=0.0040
epoch 1/5, step 301/625, loss=0.0080
epoch 1/5, step 401/625, loss=0.0051
epoch 1/5, step 501/625, loss=0.0045
epoch 1/5, step 601/625, loss=0.0040

        epoch 1/5, 
            valloss=0.0046, 
            trainloss=0.0064, 
            score=0.1450
epoch 2/5, step 1/625, loss=0.0042
epoch 2/5, step 101/625, loss=0.0047
epoch 2/5, step 201/625, loss=0.0050
epoch 2/5, step 301/625, loss=0.0052
epoch 2/5, step 401/625, loss=0.0053
epoch 2/5, step 501/625, loss=0.0057
epoch 2/5, step 601/625, loss=0.0058

        epoch 2/5, 
            valloss=0.0046, 
            trainloss=0.0051, 
            score=0.1448
epoch 3/5, step 1/625, loss=0.0046
epoch 3/5, step 101/625, loss=0.0048
epoch 3/5, step 201/625, loss=0.0050
epoch 3/5, step 301/625, loss=0.0046
epoch 3/5, step 401/625, loss=0.0056
epoch 3/5, step 501/625, loss=0.0051
epoch 3/5, step 601/625, loss=0.0051

        epoch 3/

In [22]:
val_df_sample = validation_df

val_dataloader = DataLoader(
    TextDataset(val_df_sample, tokenizer, camembert), 
    batch_size=batch_size, shuffle=True)

validation_dataset = EvalDataset(val_df_sample, tokenizer, camembert)
validate()

(0.14857079528089628, 0.00442312584763908)