# V 1.0

### Continue training from the last revision of https://huggingface.co/doublecringe123/bardt-large-cnn-dialoguesum-booksum-lora

In [1]:
class Config:
    """Settings shared by the notebook cells (checkpoint, limits, schedule, columns)."""

    # --- adapter checkpoint to continue from ---
    model_preset = "doublecringe123/bardt-large-cnn-dialoguesum-booksum-lora"
    model_revision = "977b188e7970f1c88f167d30656f32a63787e68b"
    model_last_epoch = 6  # added to the epoch count written into the push commit message

    # --- `max_length` values handed to the tokenizer ---
    max_length = 1024  # for the input text
    target_max_length = 512  # for the target text

    # --- training schedule ---
    epochs = 6
    batch_size = 8  # used for both the per-device train and eval batch size
    save_frecuency = 2  # (sic) epochs per train/save/push round; name kept for the cells that read it

    # --- unified dataset column names ---
    inp = 'input_content'
    target = 'target'


cfg = Config()

# First, let's load the datasets



In [2]:
! pip install -q --upgrade pip
! pip install -q transformers[torch]
! pip install -q -U transformers==4.38.2 datasets==2.18.0 evaluate rouge_score

import os
from kaggle_secrets import UserSecretsClient

os.environ["HF_TOKEN"] = UserSecretsClient().get_secret("HF_TOKEN")

try: 
    import wandb
    wandb.init(mode='disabled')
except: 
    ...

This is my code from the [GitHub repo](https://github.com/goin2crazy/multy-dataset/blob/main/main.py).

In [3]:
# Download the helper script and store it as `mds.py` (imported as `mds` in a later cell).
# NOTE(review): the URL points at the `main` branch, so the downloaded code can change
# between runs — consider pinning it to a commit.
! wget -O "mds.py" "https://raw.githubusercontent.com/goin2crazy/multy-dataset/main/main.py" 

--2024-04-14 09:46:10--  https://raw.githubusercontent.com/goin2crazy/multy-dataset/main/main.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2333 (2.3K) [text/plain]
Saving to: 'mds.py'


2024-04-14 09:46:10 (21.5 MB/s) - 'mds.py' saved [2333/2333]



# Prepare Dataset

In [4]:
from mds import NewDataset

# Dataset id -> pair of column names in that dataset
# (presumably (input column, target column) — verify against mds.py).
dataset_params = dict([
    ("knkarthick/dialogsum", ("dialogue", "summary")),
    ("doublecringe123/dialoguesum-npc-dialoguesum-stemmed-augmented", ('inp', 'target')),
])

# Build the combined dataset; the shared column names come from the config.
dataset = NewDataset(
    dataset_params,
    input_col_name=cfg.inp,
    target_col_name=cfg.target,
)

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 11.3M/11.3M [00:00<00:00, 52.0MB/s]
Downloading data: 100%|██████████| 442k/442k [00:00<00:00, 1.28MB/s]
Downloading data: 100%|██████████| 1.35M/1.35M [00:00<00:00, 2.62MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 23.5M/23.5M [00:01<00:00, 15.0MB/s]
Downloading data: 100%|██████████| 1.18M/1.18M [00:00<00:00, 2.81MB/s]
Downloading data: 100%|██████████| 2.28M/2.28M [00:00<00:00, 6.94MB/s]


Generating train split:   0%|          | 0/59070 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7000 [00:00<?, ? examples/s]

# Loading PEFT Model (LoRA)


In [5]:
# NOTE(review): peft is installed without a version pin, unlike transformers/datasets
# in the earlier install cell — consider pinning it for reproducible runs.
! pip install -q peft

In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel, PeftConfig

config = PeftConfig.from_pretrained(cfg.model_preset)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

lora_model = PeftModel.from_pretrained(model, cfg.model_preset, is_trainable=True)

lora_model.print_trainable_parameters() 

adapter_config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

trainable params: 1,769,472 || all params: 408,059,904 || trainable%: 0.4336304504938569


In [7]:
from torch import nn
import torch 

# NOTE(review): `device` is not referenced again in the visible cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Wrap the object reachable as `lora_model.model` in DataParallel and assign it back.
# NOTE(review): transformers' Trainer already applies DataParallel on its own when
# several GPUs are visible, so this manual wrap may be redundant — confirm that it is
# needed and that the assignment lands on the module actually used in the forward pass.
lora_model.model = nn.DataParallel(lora_model.model)

# Tokenize Dataset

In [8]:
def preprocess_function(examples):
    """Tokenize one batch of examples for sequence-to-sequence training.

    Args:
        examples: batch mapping of column name -> list of values; it must contain
            the ``cfg.inp`` and ``cfg.target`` columns.

    Returns:
        The tokenizer output for the inputs (truncated to ``cfg.max_length``) with an
        added ``"labels"`` entry holding the token ids of the targets (truncated to
        ``cfg.target_max_length``).

    Raises:
        TypeError: re-raised after printing the error and the offending inputs.
            Previously the error was swallowed and ``None`` was returned, which
            hid the real cause from the calling ``map``.
    """
    try:
        inputs = list(examples[cfg.inp])
        model_inputs = tokenizer(inputs, max_length=cfg.max_length, truncation=True)

        labels = tokenizer(text_target=examples[cfg.target], max_length=cfg.target_max_length, truncation=True)
    except TypeError as e:
        # Keep the diagnostic output, but do not continue with a half-processed batch.
        print(e)
        print(examples[cfg.inp])
        raise

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply `preprocess_function` batch-wise through the dataset's `map`, then unpack
# `dataset.splits` into the train / validation / test variables used by the trainer.
# NOTE(review): `dataset` is rebound to the mapped result, so re-running this cell maps
# the already-mapped object again.
dataset = dataset.map(preprocess_function, batched = True)
tokenized_train, tokenized_val, tokenized_test = dataset.splits

Map:   0%|          | 0/71530 [00:00<?, ? examples/s]

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/8500 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

# `model` has to be the model object: the collator looks for its
# `prepare_decoder_input_ids_from_labels` method to build decoder inputs from the labels.
# The repo-id string passed before has no such method, so the argument had no effect.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=lora_model)

2024-04-14 09:47:32.574607: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 09:47:32.574725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 09:47:32.685442: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Define Metrics

In [10]:
import evaluate

# Load the ROUGE metric once; `compute_metrics` calls `rouge.compute` on the decoded text.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
import numpy as np

def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple, in which case its first element is used.

    Returns:
        dict: every score returned by ``rouge.compute`` plus ``"gen_len"`` (mean number
        of non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored / padded positions and is not a valid token id, so it cannot be
    # decoded. Map it to the pad id in both arrays; this also keeps it out of `gen_len`.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

# Define training arguments

In [12]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [13]:
# Train in rounds of `cfg.save_frecuency` epochs. After every round the adapter is saved
# locally and pushed to the Hub.
# NOTE: the integer division drops the remainder when `cfg.epochs` is not a multiple of
# `cfg.save_frecuency`.
eps = cfg.epochs // cfg.save_frecuency

# Single name for the local output directory / Hub repo used below.
output_dir = "bardt-large-cnn-dialoguesum-booksum-lora"

for i in range(1, eps + 1):  # 1-based round counter, used in the logs and commit message
    print(f"{i}/{eps} Training Initiallization...")
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy='no',  # no Trainer checkpoints; the adapter is saved explicitly below
        learning_rate=2e-5,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        weight_decay=0.01,
        num_train_epochs=cfg.save_frecuency,
        predict_with_generate=True,
        # Mixed precision is only requested when a CUDA device is present.
        fp16=torch.cuda.is_available(),
    )

    # A new trainer is built for every round.
    # NOTE(review): this presumably restarts the optimizer state and learning-rate
    # schedule each round — confirm this is intended.
    trainer = Seq2SeqTrainer(
        model=lora_model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print(f"{i}/{eps} start Training...")
    trainer.train()

    print(f"{i}/{eps} Saving model...")
    lora_model.save_pretrained(output_dir)
    # Epoch count in the commit message = epochs of the loaded checkpoint + epochs done so far.
    lora_model.push_to_hub(output_dir, commit_message = f"Original+Augmented+Stemmed Dataset, {cfg.model_last_epoch + i * cfg.save_frecuency} epochs")

1/3 Training Initiallization...
1/3 start Training...


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.1787,1.304797,0.4015,0.194,0.3095,0.3096,61.0974
2,1.1683,1.291916,0.4032,0.1962,0.311,0.311,60.9871


1/3 Saving model...


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

2/3 Training Initiallization...
2/3 start Training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.1518,1.294231,0.403,0.1962,0.3113,0.3113,60.87
2,1.1479,1.286063,0.4034,0.1967,0.3123,0.3124,60.9043


2/3 Saving model...


adapter_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

3/3 Training Initiallization...
3/3 start Training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.1291,1.289427,0.4031,0.196,0.3112,0.3112,60.9411
2,1.1311,1.282118,0.404,0.1977,0.3126,0.3125,60.8971


3/3 Saving model...


adapter_model.safetensors:   0%|          | 0.00/14.2M [00:00<?, ?B/s]