In [1]:
# !git clone https://github.com/guilevieiram/title-generation.git
# !pip install datasets
# !pip install git+https://github.com/guilevieiram/title-generation.git
# !pip install transformers[torch]
# !pip install evaluate
# !pip install trl
# !pip install peft

In [2]:
import os
# Point the Hugging Face cache at a local data directory. This cell runs
# before the cell that imports `transformers` (presumably so the setting is
# already in place when the library is first loaded).
os.environ["TRANSFORMERS_CACHE"] = "/Data/hfcache"

In [3]:
from datasets import load_dataset

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers import BitsAndBytesConfig

import evaluate

from peft import LoraConfig, get_peft_model

import nltk
from nltk.tokenize import sent_tokenize

# Fetch NLTK resources up front. `sent_tokenize` (imported above) is used by
# compute_metrics; presumably it relies on the "punkt" data.
# NOTE(review): "stopwords" is not referenced anywhere else in this notebook.
nltk.download("punkt")
nltk.download('stopwords')

2024-03-19 09:49:21.139583: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 09:49:21.139611: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 09:49:21.140239: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-19 09:49:21.144133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to
[nltk_data] 

True

In [4]:
# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Toggles for which module-name groups are added to the LoRA target list.
finetune_att=True
finetune_lin=True


# Fraction of each dataset split to keep (1 keeps everything).
crop=1

# LoRA rank, and number of training epochs.
r=128
num_train_epochs = 10

# Per-device train/eval batch size; also reused for the test DataLoader.
batch_size = 8
model_checkpoint = "bigscience/mt0-xl"

# Truncation limits for the tokenized prompt and the tokenized title;
# max_target_length also caps the generation length.
max_input_length = 1024
max_target_length = 64
# Run tag built from the checkpoint name and the hyper-parameters above; used
# for the output directory and for the submission file name.
model_name = f"{model_checkpoint.split('/')[1]}-r{r}-e{num_train_epochs}-c{crop}-quantune"
out_dir = f"/Data/{model_name}"
# ROUGE metric object used by compute_metrics.
rouge_score = evaluate.load("rouge")


def prompt(text):
    """Wrap `text` in the French instruction that asks the model for a title."""
    template = "Creez un titre en français pour le texte suivant: {}"
    return template.format(text)

In [5]:
# Load the train / validation CSV splits.
dataset = load_dataset('csv', data_files={'train': '../data/train.csv', 'validation': '../data/validation.csv'})
# Keep a `crop` fraction of each split.
train_sample_size = int(crop * len(dataset['train']))
validation_sample_size = int(crop * len(dataset['validation']))
# Shuffle with a fixed seed so that the row order (and, for crop < 1, the
# selected subset) is the same on every run. Later cells index the validation
# split by position and resume training from a checkpoint, both of which
# assume the data does not change between kernel restarts.
dataset['train'] = dataset['train'].shuffle(seed=42).select(range(train_sample_size))
dataset['validation'] = dataset['validation'].shuffle(seed=42).select(range(validation_sample_size))


In [6]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Walks ``model.named_parameters()`` once and prints the number of trainable
    elements, the total number of elements, and the trainable share in percent.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}")


In [7]:
# 8-bit quantization for the base model weights (the loaded model shows
# Linear8bitLt layers). No `bnb_4bit_*` options are set here: they only take
# effect together with `load_in_4bit=True`, so passing them next to
# `load_in_8bit=True` had no effect and misrepresented the setup.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [8]:
# Load the base seq2seq checkpoint with the quantization config defined in the
# previous cell, together with its tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Baseline parameter count, before any adapter is attached.
print_trainable_parameters(model)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 1528027136 || all params: 3742619648 || trainable%: 40.82774312416585


In [9]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference titles.

    Each entry of ``examples["text"]`` is wrapped with ``prompt`` and tokenized
    with truncation at ``max_input_length``; ``examples["titles"]`` is tokenized
    with truncation at ``max_target_length`` and its ``input_ids`` are attached
    to the returned encoding under ``"labels"``.
    """
    prompts = list(map(prompt, examples["text"]))
    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)
    targets = tokenizer(examples["titles"], max_length=max_target_length, truncation=True)
    encoded["labels"] = targets["input_ids"]
    return encoded

# Apply the tokenization to every split, passing examples to the function in batches.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/21401 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [10]:
# Attention projection module names.
att_target_modules = ["q", "k", "v", "o"]

# Leaf names of every linear layer in the model. Walking named_modules() and
# checking the module type replaces a regex over `str(model.modules)`, which
# only worked because the repr of that bound method happens to embed the
# model's printout. Sorting makes the result deterministic.
# NOTE(review): assumes the quantized layers subclass torch.nn.Linear, as
# bitsandbytes' Linear8bitLt does — confirm if the quantization backend changes.
lin_target_modules = sorted(
    {
        module_name.rsplit(".", 1)[-1]
        for module_name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }
)

# Assemble the LoRA targets according to the toggles from the config cell.
target_modules = []
if finetune_att: target_modules.extend(att_target_modules)
if finetune_lin: target_modules.extend(lin_target_modules)

lora_config = LoraConfig(
    r=r,
    target_modules = target_modules,
    # NOTE(review): alpha is much smaller than r here; confirm the resulting
    # adapter scaling is intended.
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
# Last expression: display the resolved configuration.
lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_2_SEQ_LM', inference_mode=False, r=128, target_modules={'o', 'wi_1', 'wo', 'k', 'wi_0', 'v', 'lm_head', 'q'}, lora_alpha=8, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False)

In [11]:
from peft import prepare_model_for_kbit_training
# Prepare the quantized base model for training, then wrap it with the LoRA
# adapters described by lora_config.
prepared_model=prepare_model_for_kbit_training(model)
peft_model=get_peft_model(prepared_model, lora_config)
# Parameter count after the adapters are attached.
print_trainable_parameters(peft_model)
# Last expression: display the wrapped module tree.
peft_model

trainable params: 315392000 || all params: 4058011648 || trainable%: 7.772082176142635


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): MT5ForConditionalGeneration(
      (shared): Embedding(250112, 2048)
      (encoder): MT5Stack(
        (embed_tokens): Embedding(250112, 2048)
        (block): ModuleList(
          (0): MT5Block(
            (layer): ModuleList(
              (0): MT5LayerSelfAttention(
                (SelfAttention): MT5Attention(
                  (q): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2048, out_features=128, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=128, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): P

In [12]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores for a (predictions, labels) pair of token-id arrays.

    Entries equal to -100 are replaced by the tokenizer's pad id before
    decoding. Decoded texts are stripped, split into sentences and re-joined
    with newlines before being handed to the ROUGE metric (with stemming).
    The scores, rounded to 4 decimals, are printed and returned as a dict.
    """
    def _to_rouge_text(token_ids):
        # -100 is a masking value, not a vocabulary id: swap in the pad id.
        cleaned = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        texts = tokenizer.batch_decode(cleaned, skip_special_tokens=True)
        return ["\n".join(sent_tokenize(entry.strip())) for entry in texts]

    predictions, labels = eval_pred
    scores = rouge_score.compute(
        predictions=_to_rouge_text(predictions),
        references=_to_rouge_text(labels),
        use_stemmer=True,
    )
    rounded = {name: round(value, 4) for name, value in scores.items()}
    print(rounded)
    return rounded

# Training configuration: evaluate once per epoch with generation enabled so
# that compute_metrics receives generated token ids.
args = Seq2SeqTrainingArguments(
    output_dir=out_dir,
    evaluation_strategy="epoch",
    # logging_steps=1000,
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    generation_max_length=max_target_length,
    max_grad_norm=1.0,
)

trainer = Seq2SeqTrainer(
    peft_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Labels are padded with the collator's default label_pad_token_id (-100)
    # so that padded positions are excluded from the loss. Padding them with
    # the tokenizer's pad id instead would make the model train on padding.
    # compute_metrics already maps -100 back to the pad id before decoding.
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=peft_model),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [13]:
# trainer.evaluate()

In [14]:
# Resume only when out_dir already contains a checkpoint. Passing
# resume_from_checkpoint=True unconditionally fails on a first run, because
# there is nothing to resume from yet; None starts training from scratch.
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = get_last_checkpoint(out_dir) if os.path.isdir(out_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [15]:
# Run evaluation on the trainer's eval dataset (the validation split); the
# returned metrics are displayed as the cell output.
trainer.evaluate()

KeyboardInterrupt: 

In [16]:
# Switch the model to evaluation mode before generating; as the last
# expression of the cell this also displays the module tree.
peft_model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): MT5ForConditionalGeneration(
      (shared): Embedding(250112, 2048)
      (encoder): MT5Stack(
        (embed_tokens): Embedding(250112, 2048)
        (block): ModuleList(
          (0): MT5Block(
            (layer): ModuleList(
              (0): MT5LayerSelfAttention(
                (SelfAttention): MT5Attention(
                  (q): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=2048, out_features=2048, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2048, out_features=128, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=128, out_features=2048, bias=False)
                    )
                    (lora_embedding_A): P

In [17]:
def predict(text):
    """Generate a title for a single, already prompt-wrapped input string.

    The input is truncated to ``max_input_length`` tokens, matching the
    truncation applied when the training and test sets are tokenized, so
    over-long articles are handled the same way here as everywhere else.

    Returns the decoded text of the first generated sequence, with special
    tokens removed.
    """
    tokens = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_input_length,
    ).to(device)
    output_tokens = peft_model.generate(**tokens, max_new_tokens=max_target_length)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

def print_summary(idx):
    """Print the article, the reference title and the generated title for
    row ``idx`` of the validation split."""
    row = dataset["validation"][idx]
    review, title = row["text"], row["titles"]
    summary = predict(prompt(review))
    print(f"'>>> Text: {review}'")
    print(f"\n>>> Title: {title}")
    print(f"\n>>> predicted: {summary}")

In [18]:
# Spot-check one validation example (row 10).
print_summary(10)

'>>> Text: Mission: décentralisation. La Cour des comptes vient d'émettre ses préconisations et elle recommande notamment le rattachement du mythique "36" à la Direction centrale de la police. La fin de la mainmise de la préfecture de police de Paris (PP) sur sa police judiciaire, anciennement au 36 quai des Orfèvres et aujourd'hui dans l'ouest de Paris, est l'un des objectifs affichés par le ministère de l'Intérieur depuis plusieurs mois, mais elle fait l'objet d'âpres débats.Dans un rapport publié lundi, la Cour des comptes souligne un "chevauchement des compétences" entre les services parisiens et leurs homologues de la place Beauvau et "une concurrence entre les services". Conséquence: "L'existence parallèle de ces deux directions laisserait subsister une organisation peu efficiente", assène-t-elle.Le patron de la Direction centrale de la police judiciaire (DCPJ), Jérôme Bonet, a déjà exprimé publiquement le souhait de cette fusion en mai devant la commission d'enquête sur les moye

In [19]:
# Persist the trainer's model to the run's output directory.
trainer.save_model(out_dir)



# Making predictions

In [25]:
# Load the test CSV as a single split named "data".
csv_file_path = '../data/test_text.csv'
test_dataset = load_dataset('csv', data_files={"data":csv_file_path})

def preprocess(examples):
    """Tokenize a batch of test articles after wrapping each one with ``prompt``.

    Every row is truncated and padded to exactly ``max_input_length`` tokens.
    Padding to a fixed length (rather than to the longest row of the current
    map batch) guarantees that all rows of the dataset have the same length,
    so a DataLoader batch can be collated regardless of how its rows were
    grouped during ``map``.
    """
    return tokenizer(
        [prompt(text) for text in examples["text"]],
        truncation=True,
        padding="max_length",
        max_length=max_input_length
      )

# Tokenize the test split, passing examples to the function in batches.
tokenized_test_datasets = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [26]:
data_loader = DataLoader(tokenized_test_datasets['data'], batch_size=batch_size)

# Collect one {'ID', 'titles'} record per test row.
preds = []
peft_model.eval()
total_batches = len(data_loader)
with torch.no_grad():
    for step, batch in enumerate(data_loader, start=1):
        print(f"batch {step}/{total_batches}")
        # The collated columns arrive as a sequence of tensors; stacking and
        # transposing them yields one row per example (presumably one tensor
        # per token position — verify against the DataLoader's collation).
        input_ids = torch.stack(batch['input_ids']).T.to(device)
        attention_mask = torch.stack(batch['attention_mask']).T.to(device)

        generated = peft_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_target_length,
        )

        titles = tokenizer.batch_decode(generated, skip_special_tokens=True)
        preds.extend(
            {'ID': row_id.item(), "titles": title}
            for row_id, title in zip(batch['ID'], titles)
        )

batch 1/188




batch 2/188
batch 3/188
batch 4/188
batch 5/188
batch 6/188
batch 7/188
batch 8/188
batch 9/188
batch 10/188
batch 11/188
batch 12/188
batch 13/188
batch 14/188
batch 15/188
batch 16/188
batch 17/188
batch 18/188
batch 19/188
batch 20/188
batch 21/188
batch 22/188
batch 23/188
batch 24/188
batch 25/188
batch 26/188
batch 27/188
batch 28/188
batch 29/188
batch 30/188
batch 31/188
batch 32/188
batch 33/188
batch 34/188
batch 35/188
batch 36/188
batch 37/188
batch 38/188
batch 39/188
batch 40/188
batch 41/188
batch 42/188
batch 43/188
batch 44/188
batch 45/188
batch 46/188
batch 47/188
batch 48/188
batch 49/188
batch 50/188
batch 51/188
batch 52/188
batch 53/188
batch 54/188
batch 55/188
batch 56/188
batch 57/188
batch 58/188
batch 59/188
batch 60/188
batch 61/188
batch 62/188
batch 63/188
batch 64/188
batch 65/188
batch 66/188
batch 67/188
batch 68/188
batch 69/188
batch 70/188
batch 71/188
batch 72/188
batch 73/188
batch 74/188
batch 75/188
batch 76/188
batch 77/188
batch 78/188
batch 7

In [27]:
# One row per prediction record, with the columns 'ID' and 'titles'.
test_df=pd.DataFrame(preds)


In [28]:
import csv
# The submission file is named after the run tag.
filename=f"./submission-{model_name}.csv"
# QUOTE_NONNUMERIC quotes every non-numeric field; the next cell rewrites the
# first line of the file to an unquoted header.
test_df.to_csv(filename, columns=['ID', 'titles'], index=False, quoting=csv.QUOTE_NONNUMERIC)

In [29]:
# Unquoted header expected in the submission file.
new_first_line = 'ID,titles\n'

# Read the contents of the CSV file. The encoding is pinned to UTF-8 (the
# pandas to_csv default) instead of the platform locale, so accented
# characters in the French titles survive the round trip on any machine.
with open(filename, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Substitute the first line
lines[0] = new_first_line

# Write the modified content back to the file, using the same encoding
with open(filename, 'w', encoding='utf-8') as file:
    file.writelines(lines)