In [1]:
# !git clone https://github.com/guilevieiram/title-generation.git
# !pip install datasets
# !pip install git+https://github.com/guilevieiram/title-generation.git
# !pip install transformers[torch]
# !pip install evaluate
# !pip install trl
# !pip install peft

In [1]:
from datasets import load_dataset

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import evaluate

from peft import LoraConfig, get_peft_model

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
nltk.download('stopwords')

2024-03-14 20:34:16.768691: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 20:34:16.768729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 20:34:16.961711: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-14 20:34:17.359185: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

True

In [2]:
# --- Hardware -----------------------------------------------------------
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Base model and metric ----------------------------------------------
model_checkpoint = "moussaKam/mbarthez"   # checkpoint id passed to from_pretrained
rouge_score = evaluate.load("rouge")      # metric object used by compute_metrics

# --- LoRA ---------------------------------------------------------------
finetune_att = True   # add the attention projections to the LoRA targets
finetune_lin = True   # add the discovered Linear layers to the LoRA targets
r = 32                # passed to LoraConfig as `r`

# --- Data ---------------------------------------------------------------
crop = 0.3                # intended subsampling fraction of the dataset (confirm usage)
max_input_length = 1024   # token cap applied when tokenizing article text
max_target_length = 256   # token cap for titles and for generated output

# --- Training -----------------------------------------------------------
batch_size = 2            # per-device batch size for training and evaluation
num_train_epochs = 5

In [4]:
# Load the train/validation CSV files, then keep a random `crop` fraction of
# each split to shorten training and evaluation.
dataset = load_dataset('csv', data_files={'train': '../data/train.csv', 'validation': '../data/validation.csv'})

# Fixed seed so the same subset is selected on every run of the notebook;
# an unseeded shuffle would draw a different sample each time.
shuffle_seed = 42

# Sizes come from the `crop` setting of the configuration cell instead of a
# repeated literal, so changing `crop` actually changes the sample.
train_sample_size = int(crop * len(dataset['train']))
validation_sample_size = int(crop * len(dataset['validation']))

dataset['train'] = dataset['train'].shuffle(seed=shuffle_seed).select(range(train_sample_size))
dataset['validation'] = dataset['validation'].shuffle(seed=shuffle_seed).select(range(validation_sample_size))


In [5]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Reads ``model.named_parameters()`` once, totals the element count of
    every parameter and of those whose ``requires_grad`` flag is set, and
    prints both totals together with the trainable percentage.
    """
    # Materialise (size, flag) pairs so the iterator is consumed only once.
    sizes_and_flags = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in sizes_and_flags)
    trainable_params = sum(size for size, is_trainable in sizes_and_flags if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


In [6]:
# Load the pretrained tokenizer and seq2seq model named by `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Parameter count of the plain model, before any adapter is attached.
print_trainable_parameters(model)
# Move the model to the selected device; being the cell's last expression,
# this also displays the model's module tree.
model.to(device)

trainable params: 139221504 || all params: 139221504 || trainable%: 100.0


MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(50002, 768, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(50002, 768, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

In [7]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference titles.

    ``examples["text"]`` is tokenized and truncated to ``max_input_length``
    tokens; ``examples["titles"]`` is tokenized and truncated to
    ``max_target_length`` tokens. The title token ids are attached to the
    article encoding under the ``"labels"`` key, and that encoding is returned.
    """
    # Encoder side: the article text.
    model_inputs = tokenizer(examples["text"], max_length=max_input_length, truncation=True)
    # Decoder side: the reference titles, tokenized with the same tokenizer.
    target_encoding = tokenizer(examples["titles"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs
# Apply preprocess_function to every split of `dataset`, in batched mode.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/6420 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

In [8]:
# Names of the attention projection layers (see the model printout above).
att_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
import re  # not needed by this cell any more; kept so code expecting `re` to be imported here still works

# Leaf names of every Linear layer, read from the module tree itself.
# Walking named_modules() and checking the type does not depend on how the
# model's repr happens to be formatted (the previous approach ran a regex
# over the string form of the bound method `model.modules`).
linear_layers = [
    module_name.split(".")[-1]
    for module_name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
]
# Sorted so the list is identical from run to run (plain set order is not).
lin_target_modules = sorted(set(linear_layers))

# Assemble the LoRA targets from the two configuration switches.
target_modules = []
if finetune_att:
    target_modules.extend(att_target_modules)
if finetune_lin:
    target_modules.extend(lin_target_modules)
# The attention projections are Linear layers too, so with both switches on
# they would appear twice; drop duplicates while keeping first-seen order.
target_modules = list(dict.fromkeys(target_modules))

lora_config = LoraConfig(
    r=r,
    target_modules=target_modules,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Wrap the base model with the LoRA adapters and report what is trainable now.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 8112704 || all params: 147334208 || trainable%: 5.50632749184765


In [9]:
# Short model name: the part of the checkpoint id after the last "/".
model_name = model_checkpoint.split("/")[-1]


In [10]:
def compute_metrics(eval_pred):
    """Decode generated ids and reference labels, then score them with ROUGE.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels)``. ``predictions`` is an array of token ids,
        or a tuple whose first element is that array. ``labels`` is an array
        of reference token ids. In both arrays ``-100`` marks padded or
        ignored positions.

    Returns
    -------
    dict
        The scores returned by ``rouge_score.compute``, rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple; the token ids are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is not a vocabulary id and cannot be decoded, so swap it for the
    # pad token on both sides (predictions can carry it too when batches of
    # different generated length are gathered).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line, which is the layout the rougeLsum score works on.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return {k: round(v, 4) for k, v in result.items()}

# Training configuration. Checkpoints go to "<model_name>-finetuned".
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
)

# Trainer over the tokenized train/validation splits. The seq2seq collator
# pads each batch; compute_metrics adds ROUGE scores to every evaluation.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# NOTE: the `DataLoaderConfiguration(...)` assignment that used to end this
# cell was removed. The name is never imported in this notebook, so the line
# raised NameError on a fresh kernel, and its result was not used anywhere.


In [11]:
# Evaluate on the validation split before any training step has run, to get a baseline.
trainer.evaluate()



{'eval_loss': 2.162757396697998,
 'eval_rouge1': 0.177,
 'eval_rouge2': 0.0506,
 'eval_rougeL': 0.1352,
 'eval_rougeLsum': 0.1433,
 'eval_runtime': 293.4573,
 'eval_samples_per_second': 1.533,
 'eval_steps_per_second': 0.767}

In [12]:
# Run fine-tuning with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss




In [None]:
def predict(text, model):
    """Generate a title for a single article.

    The article is tokenized the same way as the training inputs in
    ``preprocess_function``: the raw text with no prompt prefix, truncated to
    ``max_input_length`` tokens. Feeding a prefix the model never saw during
    fine-tuning, or an input longer than the training cap, would make
    inference inconsistent with training.

    Parameters
    ----------
    text : str
        Article text.
    model
        Object exposing ``generate(**inputs, max_new_tokens=...)``.

    Returns
    -------
    str
        Decoded text of the first generated sequence, special tokens removed.
    """
    tokens = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors='pt',
    ).to(device)
    output_tokens = model.generate(**tokens, max_new_tokens=max_target_length)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

def print_summary(idx, model):
    """Print one validation article, its reference title and the generated title.

    ``idx`` selects the row of ``dataset["validation"]``; ``model`` is passed
    on to ``predict`` to produce the generated title.
    """
    # Fetch the row once and read both fields from it.
    example = dataset["validation"][idx]
    review, title = example["text"], example["titles"]
    summary = predict(review, model)

    print(f"'>>> Text: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> predicted: {summary}'")

In [None]:
# Show the article, reference title and generated title for validation example 1.
print_summary(1, model)

'>>> Text: De retour de Nouakchott dans la nuit, le chef de l'Etat poursuit ses consultations en vue du remaniement, sans laisser filtrer ses intentions, même auprès de son Premier ministre Edouard Philippe dont le sort est en suspens. Sa décision n'est pas encore prise, selon ses proches, et son verdict devrait tomber d'ici le début de la semaine prochaine, peut-être dès ce jeudi ou ce vendredi.En annonçant lundi de nouvelles ambitions pour la transition écologique, après avoir promis mi-juin une politique plus sociale ou en suggérant d'appliquer à d'autre sujets la méthode de la Convention citoyenne avec des citoyens tirés au sort, le chef de l'Etat a dessiné un virage qui pourrait être incompatible avec Edouard Philippe. Ce dernier bénéficie d'une popularité en hausse, d'une nette victoire aux municipales au Havre et continue à jouer un rôle-clé dans la conquête de l'électorat de centre-droit. Mais les sujets de dissensions avec le chef de l'Etat demeurent, notamment les aspects fin

In [None]:
# Save the trained model; the prediction section below reloads it from this same path.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
trainer.save_model("/Data/barthez-finetune2")



# Making predictions

In [5]:
# Reload the model and tokenizer from the directory written by trainer.save_model.
# NOTE(review): this cell relies on `device` from the configuration cell having been run.
model = AutoModelForSeq2SeqLM.from_pretrained("/Data/barthez-finetune2")
tokenizer = AutoTokenizer.from_pretrained("/Data/barthez-finetune2")
# Move to the selected device; as the last expression this also displays the module tree.
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(50002, 768, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(50002, 768, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=32, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=32, out_features=768, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
            )
            (v_proj): lo

In [6]:
# The test articles come from a single CSV, exposed under the split name "data".
csv_file_path = '../data/test_text.csv'
dataset = load_dataset('csv', data_files={"data": csv_file_path})


def preprocess(examples):
    """Tokenize a batch of test articles to a fixed length.

    Each entry of ``examples["text"]`` is truncated and padded to exactly
    ``max_input_length`` tokens, so every row has the same number of ids.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": max_input_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)


tokenized_datasets = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [7]:
# Batch size for test-set generation. It has its own name so that the
# training `batch_size` from the configuration cell is not overwritten
# (re-running the training cells afterwards would silently pick up 8).
predict_batch_size = 8
data_loader = DataLoader(tokenized_datasets['data'], batch_size=predict_batch_size)
num_batches = len(data_loader)

preds = []
model.eval()
with torch.no_grad():
    for bidx, batch in enumerate(data_loader):
        print(f"batch {bidx+1}/{num_batches}")
        # Each token column arrives as a list of tensors rather than one
        # tensor; stacking and transposing puts the examples on the first
        # axis, which is what generate() expects.
        # NOTE(review): presumably one tensor per token position — confirm.
        input_ids = torch.stack(batch['input_ids']).T.to(device)
        attention_mask = torch.stack(batch['attention_mask']).T.to(device)

        output_tokens = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_target_length)

        # Decode the whole batch at once and pair each title with its example ID.
        titles = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
        for example_id, title in zip(batch['ID'], titles):
            preds.append({'ID': example_id.item(), "titles": title})

batch 1/188
batch 2/188
batch 3/188
batch 4/188
batch 5/188
batch 6/188
batch 7/188
batch 8/188
batch 9/188
batch 10/188
batch 11/188
batch 12/188
batch 13/188
batch 14/188
batch 15/188
batch 16/188
batch 17/188
batch 18/188
batch 19/188
batch 20/188
batch 21/188
batch 22/188
batch 23/188
batch 24/188
batch 25/188
batch 26/188
batch 27/188
batch 28/188
batch 29/188
batch 30/188
batch 31/188
batch 32/188
batch 33/188
batch 34/188
batch 35/188
batch 36/188
batch 37/188
batch 38/188
batch 39/188
batch 40/188
batch 41/188
batch 42/188
batch 43/188
batch 44/188
batch 45/188
batch 46/188
batch 47/188
batch 48/188
batch 49/188
batch 50/188
batch 51/188
batch 52/188
batch 53/188
batch 54/188
batch 55/188
batch 56/188
batch 57/188
batch 58/188
batch 59/188
batch 60/188
batch 61/188
batch 62/188
batch 63/188
batch 64/188
batch 65/188
batch 66/188
batch 67/188
batch 68/188
batch 69/188
batch 70/188
batch 71/188
batch 72/188
batch 73/188
batch 74/188
batch 75/188
batch 76/188
batch 77/188
batch 78

In [9]:
# Turn the list of prediction dicts (one per test example) into a DataFrame.
test_df=pd.DataFrame(preds)
# Write only the ID and titles columns, without the DataFrame index.
test_df.to_csv("./submission.csv", columns=['ID', 'titles'], index=False)