# **Text Summarizer with LLaMA.**

In [None]:
!pip install evaluate
!pip install rouge_score

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 guardrail-ml==0.0.12 tensorboard
!apt-get -qq install poppler-utils tesseract-ocr
!pip install -q unstructured["local-inference"]==0.7.4 pillow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import re
import sys
import json
import torch
import wandb
import torch
import itertools
import numpy as np

from collections import Counter
from datasets import Dataset, DatasetDict

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

from peft import LoraConfig

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# **Load Dataset.**

In [None]:
def json_to_dataset(dataset_path):
    """
    Read a UTF-8 JSON file and wrap its contents in a HuggingFace Dataset.

    The parsed JSON is handed directly to ``Dataset.from_list``, so the file
    is presumably a JSON array of record objects (verify against the data).
    """
    with open(dataset_path, encoding="utf-8") as handle:
        records = json.load(handle)
    return Dataset.from_list(records)


# Locations of the three JSON splits.
train_path = "/content/train.json"
valid_path = "/content/val.json"
test_path = "/content/test.json"

# Load every split in order (train, valid, test).
train_dataset, valid_dataset, test_dataset = map(
    json_to_dataset, (train_path, valid_path, test_path)
)

# Bundle the splits into one DatasetDict keyed by split name.
dataset = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset,
    "test": test_dataset,
})

# **Preprocess Dataset.**

In [None]:
def remove_short_texts(dataset, dataset_label, min_length=2):
    """
    Drop samples whose dialogue or summary has fewer than ``min_length`` words.

    Args:
        dataset: Mapping of split name to a dataset exposing the "id",
            "dialogue" and "summary" columns.
        dataset_label: Name of the split to filter (e.g. "train").
        min_length: Minimum number of whitespace-separated words that BOTH the
            dialogue and the summary must contain for a sample to be kept.

    Returns:
        A new Dataset holding only the "id", "dialogue" and "summary" columns
        of the samples that were kept.
    """
    data = dataset[dataset_label]

    # Keep a sample only when both texts have at least `min_length` words;
    # walking the three columns in lockstep avoids index bookkeeping.
    kept_rows = [
        (sample_id, dialogue, summary)
        for sample_id, dialogue, summary in zip(data["id"], data["dialogue"], data["summary"])
        if len(dialogue.split()) >= min_length and len(summary.split()) >= min_length
    ]

    # Rebuild column-wise lists (per-column comprehensions also handle the
    # case where nothing survives the filter).
    filtered_ids = [row[0] for row in kept_rows]
    filtered_dialogues = [row[1] for row in kept_rows]
    filtered_summaries = [row[2] for row in kept_rows]

    filtered_data = Dataset.from_dict({
        "id": filtered_ids,
        "dialogue": filtered_dialogues,
        "summary": filtered_summaries,
    })
    print(f"Filtered '{dataset_label}' dataset contains {len(filtered_dialogues)} samples.")
    return filtered_data


# Drop samples whose dialogue or summary has fewer than 2 words (the default
# min_length), one split at a time in train/valid/test order.
filtered_train_dataset, filtered_valid_dataset, filtered_test_dataset = (
    remove_short_texts(dataset, split_name)
    for split_name in ("train", "valid", "test")
)

# Re-assemble the filtered splits under the same keys.
filtered_dataset = DatasetDict({
    "train": filtered_train_dataset,
    "valid": filtered_valid_dataset,
    "test": filtered_test_dataset,
})

Filtered 'train' dataset contains 14729 samples.
Filtered 'valid' dataset contains 818 samples.
Filtered 'test' dataset contains 819 samples.


In [None]:
# Runs of any whitespace, collapsed to a single space during normalization.
_WHITESPACE_RE = re.compile(r"\s+")


def _normalize_dialogue(dialogue):
    """Mark speaker turns, drop carriage returns, squeeze spaces, lowercase."""
    marked = dialogue.replace("\n", " <speaker> ").replace("\r", "")
    return _WHITESPACE_RE.sub(" ", marked).strip().lower()


def preprocess_texts(dataset, dataset_label):
    r"""
    Normalize the dialogue and summary texts of one split.

    Dialogues:
    - Replace \n with <speaker> to mark changes in speaker.
    - Replace \r with '' to remove carriage returns.
    - Collapse runs of whitespace into a single space.
    - Strip leading/trailing spaces.
    - Convert to lowercase.

    Summaries:
    - Strip leading/trailing spaces.
    - Convert to lowercase.

    Args:
        dataset: Mapping of split name to a dataset exposing the "id",
            "dialogue" and "summary" columns.
        dataset_label: Name of the split to preprocess (e.g. "train").

    Returns:
        A new Dataset with the original ids and the normalized texts.
    """
    data = dataset[dataset_label]
    ids = data["id"]
    processed_dialogues = [_normalize_dialogue(dialogue) for dialogue in data["dialogue"]]
    processed_summaries = [summary.strip().lower() for summary in data["summary"]]

    # Build a new dataset from the preprocessed columns (no samples are dropped here).
    processed_data = Dataset.from_dict({
        "id": ids,
        "dialogue": processed_dialogues,
        "summary": processed_summaries,
    })
    print(f"Preprocessing completed for '{dataset_label}' dataset.")
    return processed_data


# Normalize the texts of every split (train, valid, test — in that order).
# All three calls read the not-yet-rebound `filtered_dataset`.
filtered_train_dataset, filtered_valid_dataset, filtered_test_dataset = (
    preprocess_texts(filtered_dataset, split_name)
    for split_name in ("train", "valid", "test")
)

# Replace the DatasetDict with the preprocessed splits.
filtered_dataset = DatasetDict({
    "train": filtered_train_dataset,
    "valid": filtered_valid_dataset,
    "test": filtered_test_dataset,
})

Preprocessing completed for 'train' dataset.
Preprocessing completed for 'valid' dataset.
Preprocessing completed for 'test' dataset.


# **LLaMA**

## Base LLaMA.

In [None]:
# LoRA hyperparameters (forwarded to LoraConfig inside load_model).
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# Not referenced by the cells shown here; kept for later cells that may use it.
max_seq_length = None

# Device placement handed to from_pretrained() by load_model(). Without this
# definition load_model() fails with a NameError on a fresh kernel.
# {"": 0} places the whole model on GPU 0; adjust for multi-GPU setups.
device_map = {"": 0}

# The model that you want to train from the Hugging Face hub
model_name = "guardrail/llama-2-7b-guanaco-instruct-sharded"

# Fine-tuned model name
new_model = "llama-2-7b-guanaco-dialogue-summary"

# Activate 4-bit precision base model loading
use_4bit = True

# Activate nested quantization for 4-bit base models
use_nested_quant = False

# Compute dtype for 4-bit base models (name of a torch dtype attribute)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

In [None]:
def load_model(model_name):
    """
    Build the quantized model, its tokenizer and the LoRA configuration.

    Besides ``model_name`` this reads the module-level settings
    ``use_4bit``, ``use_nested_quant``, ``bnb_4bit_compute_dtype``,
    ``bnb_4bit_quant_type``, ``device_map`` and the ``lora_*`` values.

    Returns:
        Tuple ``(model, tokenizer, peft_config)``.
    """
    # Resolve the dtype name (e.g. "float16") to the torch dtype object.
    dtype = getattr(torch, bnb_4bit_compute_dtype)

    quant_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    # Hint at bf16 when running fp16 4-bit on a GPU with compute capability >= 8.
    if dtype == torch.float16 and use_4bit:
        capability_major = torch.cuda.get_device_capability()[0]
        if capability_major >= 8:
            banner = "=" * 80
            print(banner)
            print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
            print(banner)

    llm = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=quant_config,
    )
    llm.config.use_cache = False
    llm.config.pretraining_tp = 1

    # LoRA adapter settings for causal language modelling.
    lora_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Tokenizer: reuse EOS as the padding token and pad on the right.
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tok.pad_token = tok.eos_token
    tok.padding_side = "right"

    return llm, tok, lora_config

# Instantiate the base model, tokenizer and LoRA config for `model_name`.
model, tokenizer, peft_config = load_model(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

model-00001-of-00014.safetensors:   0%|          | 0.00/1.96G [00:00<?, ?B/s]

model-00002-of-00014.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00003-of-00014.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00004-of-00014.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00014.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00006-of-00014.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00014.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00008-of-00014.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00009-of-00014.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00010-of-00014.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00011-of-00014.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00012-of-00014.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00013-of-00014.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

model-00014-of-00014.safetensors:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

In [None]:
def create_prompt(dialogue):
    """
    Build the summarization instruction for a single dialogue.

    The returned text asks for an overall, non-dialogue summary of at most
    two sentences and embeds the dialogue in single quotes at the end.
    """
    instruction = (
        "Summarize the following dialogue by giving me a text of what happens OVERALL "
        "without following the dialogue format. It must NOT exceed 2 sentences: "
    )
    return f"{instruction}'{dialogue}'"

In [None]:
# Text-generation pipeline wrapping the loaded model and tokenizer;
# generate_summary() calls this module-level `pipe`.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# A courtesy lead-in such as "Sure! Here's the summary:" — text from the start
# up to the first colon, containing no period, where the colon is followed by
# whitespace or the end of the text. Colons inside content ("5:00", "http://")
# are followed by a non-space character and therefore never match.
_PREAMBLE_RE = re.compile(r"\A[^.:]*:(?=\s|\Z)")


def _strip_prompt(generated_text):
    """Remove the echoed ``<s>[INST] ... [/INST]`` prompt span, if present."""
    start_idx = generated_text.find("<s>[INST]")
    if start_idx == -1:
        return generated_text
    # Look for the closing tag only after the opening one.
    close_idx = generated_text.find("[/INST]", start_idx)
    if close_idx == -1:
        return generated_text
    end_idx = close_idx + len("[/INST]")
    return generated_text[:start_idx] + generated_text[end_idx:]


def _strip_preamble(text):
    """Trim whitespace and drop a leading courtesy remark ending in ':'."""
    return _PREAMBLE_RE.sub("", text.strip(), count=1).strip()


def generate_summary(dialogue):
    """
    Generate a summary for ``dialogue`` and clean the model's response.

    - Builds the prompt with ``create_prompt`` and runs it through ``pipe``.
    - Deletes the prompt fed to the model from the returned text.
    - Deletes a leading courtesy remark such as "Sure! Here's the summary:".
      Only a lead-in at the very start is removed; colons that are part of the
      summary itself (e.g. a time like "5:00") are left untouched.
    - Always strips surrounding whitespace.

    Returns:
        The cleaned summary string.
    """
    prompt = create_prompt(dialogue)
    # NOTE(review): max_length presumably counts prompt tokens as well;
    # consider max_new_tokens — confirm against the pipeline documentation.
    result = pipe(f"<s>[INST] {prompt} [/INST]", max_length=1500, do_sample=True, top_p=0.9, temperature=0.7)
    generated_text = result[0]['generated_text']

    return _strip_preamble(_strip_prompt(generated_text))

In [None]:
# Load the ROUGE metric through the `evaluate` library; `rouge_metric` is
# used by the evaluation cells below to score predictions against references.
import evaluate
rouge_metric = evaluate.load("rouge")

In [None]:
# Evaluate the model on the first 50 validation samples.
val_list = list(filtered_dataset["valid"])

# Summaries are generated in dataset order; references are the gold summaries.
val_predictions = [generate_summary(sample["dialogue"]) for sample in val_list[:50]]
val_references = [sample["summary"] for sample in val_list[:50]]

results = rouge_metric.compute(predictions=val_predictions, references=val_references)

# Report every score returned by the metric, rounded to two decimals.
for metric_name, score in results.items():
    print(f"Valid {metric_name}: {score:.2f}")



Valid rouge1: 0.33
Valid rouge2: 0.13
Valid rougeL: 0.26
Valid rougeLsum: 0.26


In [None]:
# Evaluate the model on the first 50 test samples.
test_list = list(filtered_dataset["test"])

# Summaries are generated in dataset order; references are the gold summaries.
test_predictions = [generate_summary(sample["dialogue"]) for sample in test_list[:50]]
test_references = [sample["summary"] for sample in test_list[:50]]

results = rouge_metric.compute(predictions=test_predictions, references=test_references)

# Report every score returned by the metric, rounded to two decimals.
for metric_name, score in results.items():
    print(f"Test {metric_name}: {score:.2f}")

Test rouge1: 0.34
Test rouge2: 0.13
Test rougeL: 0.26
Test rougeLsum: 0.26


In [None]:
# Display the validation predictions to illustrate what the cleaned
# model responses look like.
val_predictions

["Tom and his friend are discussing Tom's plan to get a puppy for his son. Tom's friend is hesitant at first, but eventually agrees to go with Tom to the animal shelter to help him choose a puppy that will be a good fit for his son.",
 "Emma is excited about an advent calendar she's found and wants to get one for her kids. Lauren reveals that advent calendars have evolved over the years and now contain a variety of items, including small toys, Christmas decorations, and notes asking children to do something nice for someone else.",
 "Overall, Jackie reveals that Madison is pregnant, but she doesn't want to talk about it, and Iggy is worried about the potential challenges and responsibilities that come with parenthood. Meanwhile, Jackie and Iggy reflect on their own experiences with immaturity and the difficulties they faced in their own relationships.",
 "Marla finds a pair of male underwear under her bed, and she and her friends try to figure out who it belongs to. They speculate abou

In [None]:
# Display the test predictions to illustrate what the cleaned
# model responses look like.
test_predictions

["Hannah asked Amanda for Betty's phone number, but Amanda couldn't find it. Amanda suggested Hannah ask Larry, who had called Betty last time they were at the park together.",
 "  Eric and Rob are discussing a stand-up comedy routine by a Russian comedian, Rob finds it funny and Eric agrees, they also discuss if it's the comedian's only stand-up and decide to watch some of his other performances on YouTube.",
 "Lenny asked Bob for help picking a pair of trousers, and Bob suggested sending photos of the options. Lenny showed Bob three photos, and Bob liked the first pair the most, but Lenny was unsure because he already had purple trousers and didn't want to have two pairs of the same color.",
 "Emma doesn't want Will to worry about dinner, as she's not feeling well and doesn't have an appetite. Will offers to pick her up and Emma declines, saying she'll be home soon and will let him know when she arrives.",
 "Ollie and Jane have a conversation where they make plans for lunch, check ea