In [None]:
# pip install torch transformers pytorch-lightning datasets seaborn plotly pandas

In [7]:
from pprint import pprint
import functools
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm
import pandas as pd
from datasets import Dataset
import numpy as np
import random

# Set the random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [2]:
dataset = "data/PAWS-C-FR/"
data = {
    "train": dataset + "translated_train.tsv",
    "dev": dataset + "dev_2k.tsv",
    "test": dataset + "test_2k.tsv"
}


def load_process():
    train = pd.read_csv(data['train'], delimiter='\t', on_bad_lines='skip')
    dev = pd.read_csv(data['dev'], delimiter='\t', on_bad_lines='skip')
    test = pd.read_csv (data['test'], delimiter='\t', on_bad_lines='skip')

    train.drop(columns=['id'], inplace=True)
    dev.drop(columns=['id'], inplace=True)
    test.drop(columns=['id'], inplace=True)

    train.dropna(inplace=True)
    dev.dropna(inplace=True)
    test.dropna(inplace=True)

    train['label'] = train['label'].astype(int)
    dev['label'] = dev['label'].astype(int)
    test['label'] = test['label'].astype(int)

    return train, dev, test


train, dev, test = load_process()

# Shape of the data
print(f"Total train samples : {train.shape[0]}")
print(f"Total validation samples: {dev.shape[0]}")
print(f"Total test samples: {test.shape[0]}")

train_dataset = Dataset.from_pandas(train)
dev_dataset = Dataset.from_pandas(dev)
test_dataset = Dataset.from_pandas(test)

Total train samples : 49127
Total validation samples: 1988
Total test samples: 2000


In [3]:
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

def tokenize_batch(samples, tokenizer):
    sentence_1 = [sample['sentence1'] for sample in samples]
    sentence_2 = [sample['sentence2'] for sample in samples]
    labels = torch.tensor([sample["label"] for sample in samples])
    str_labels = [sample["label"] for sample in samples]
    text = [[str(x), str(y)] for x,y in zip(sentence_1, sentence_2)]
    tokens = tokenizer(text, return_tensors="pt", padding='max_length', max_length = 128, truncation=True)

    return {"input_ids": tokens.input_ids, "attention_mask": tokens.attention_mask, "labels": labels, "str_labels": str_labels, "sentences": text}


In [4]:
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=16, 
    shuffle=True, 
    num_workers=12,
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer)
)
val_dataloader = DataLoader(
    dev_dataset, 
    batch_size=16, 
    shuffle=False, 
    num_workers=12,
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer)
)

test_dataloader = DataLoader(
    test_dataset, 
    batch_size=16, 
    shuffle=False, 
    num_workers=12,
    pin_memory=True,
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer)
)

In [5]:
class LightningModel(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr, weight_decay, from_scratch=False):
        super().__init__()
        self.save_hyperparameters()
        if from_scratch:
            config = AutoConfig.from_pretrained(
                model_name, num_labels=num_labels
            )
            self.model = AutoModelForSequenceClassification.from_config(config)
        else:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
        self.lr = lr
        self.weight_decay = weight_decay
        self.num_labels = self.model.num_labels

    def forward(self, batch):
        return self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    def training_step(self, batch):
        out = self.forward(batch)

        logits = out.logits
        # -------- MASKED --------
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, self.num_labels), batch["labels"].view(-1))

        # ------ END MASKED ------

        self.log("train/loss", loss)

        return loss

    def validation_step(self, batch, batch_index):
        labels = batch["labels"]
        out = self.forward(batch)

        preds = torch.max(out.logits, -1).indices
        # -------- MASKED --------
        acc = (batch["labels"] == preds).float().mean()
        # ------ END MASKED ------
        self.log("valid/acc", acc)

        f1 = f1_score(batch["labels"].cpu().tolist(), preds.cpu().tolist(), average="macro")
        self.log("valid/f1", f1)

    def predict_step(self, batch, batch_idx):
        out = self.forward(batch)

        return torch.max(out.logits, -1).indices

    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
    


lightning_model = LightningModel("camembert-base", 2, lr=3e-5, weight_decay=0.)
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="valid/acc", mode="max")

camembert_trainer = pl.Trainer(
    max_epochs=1,
    accelerator="gpu", devices="auto",
    callbacks=[
        pl.callbacks.EarlyStopping(monitor="valid/acc", patience=4, mode="max"),
        model_checkpoint,
    ]
)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

In [6]:
camembert_trainer.fit(lightning_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                               | Params
-------------------------------------------------------------
0 | model | CamembertForSequenceClassification | 110 M 
-------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.494   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]



Epoch 0: 100%|██████████| 3071/3071 [33:17<00:00,  1.54it/s, v_num=2]      

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 3071/3071 [33:19<00:00,  1.54it/s, v_num=2]


In [115]:
def eval(model,trainer,test_set):
    preds = trainer.predict(model,dataloaders=test_set)
    preds = torch.cat(preds, -1)
    preds = preds.cpu().numpy()
    preds = preds.tolist()
    test['preds'] = preds
    test['preds'] = test['preds'].astype(int)
    test['label'] = test['label'].astype(int)
    print(f"Accuracy: {sum(test['preds'] == test['label'])/len(test)}")
    print(f"F1 score: {f1_score(test['preds'], test['label'], average='macro')}")


In [116]:
lightning_model = LightningModel.load_from_checkpoint(checkpoint_path=model_checkpoint.best_model_path)

eval(lightning_model,camembert_trainer,test_dataloader)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

Predicting DataLoader 0: 100%|██████████| 125/125 [00:28<00:00,  4.42it/s]
Accuracy: 0.906
F1 score: 0.9056935985015315


In [107]:
# Save the state dictionary of the model
torch.save(lightning_model.state_dict(), "camembert-prefinetuned.pt")

In [108]:
# Load the saved model
loaded_model = LightningModel("camembert-base", 2, lr=3e-5, weight_decay=0.)
loaded_state_dict = torch.load("camembert-prefinetuned.pt")
loaded_model.load_state_dict(loaded_state_dict)
loaded_model.to('cuda')
loaded_model.eval()

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

LightningModel(
  (model): CamembertForSequenceClassification(
    (roberta): CamembertModel(
      (embeddings): CamembertEmbeddings(
        (word_embeddings): Embedding(32005, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): CamembertEncoder(
        (layer): ModuleList(
          (0-11): 12 x CamembertLayer(
            (attention): CamembertAttention(
              (self): CamembertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): CamembertSelfOutput(
        

In [117]:
eval(loaded_model,camembert_trainer,test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 125/125 [00:28<00:00,  4.38it/s]
Accuracy: 0.906
F1 score: 0.9056935985015315


In [110]:
# Quantize model using dynamic quantization

quantized_model = torch.quantization.quantize_dynamic(
    loaded_model, {torch.nn.Linear}, dtype=torch.qint8
)
torch.save(quantized_model.state_dict(), 'models/camembert-prefinetuned-quantized.pt')

In [111]:
import os
def print_size_of_model(model):
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')

print_size_of_model(loaded_model)
print_size_of_model(quantized_model)

Size (MB): 442.567241
Size (MB): 186.048485


In [112]:
# Load the saved model
device = torch.device("cuda")
loaded_quantized_model = LightningModel("camembert-base", 2, lr=3e-5, weight_decay=0.)
loaded_quantized_state_dict = torch.load("models/camembert-prefinetuned-quantized.pt",map_location=device)
loaded_quantized_model.load_state_dict(loaded_quantized_state_dict, strict=False)
loaded_quantized_model.to('cuda')

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

LightningModel(
  (model): CamembertForSequenceClassification(
    (roberta): CamembertModel(
      (embeddings): CamembertEmbeddings(
        (word_embeddings): Embedding(32005, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): CamembertEncoder(
        (layer): ModuleList(
          (0-11): 12 x CamembertLayer(
            (attention): CamembertAttention(
              (self): CamembertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): CamembertSelfOutput(
        

In [118]:
# Set the model to evaluation mode
loaded_quantized_model.eval()
eval(loaded_quantized_model,camembert_trainer,test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 125/125 [00:28<00:00,  4.38it/s]
Accuracy: 0.453
F1 score: 0.3253368690450495
