<a href="https://colab.research.google.com/github/divyanshsaxena21/Llama_Text_Summarizer/blob/main/Hub9_Llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'


In [None]:
import sys
sys.setswitchinterval(0.01)  # This may help for debugging issues related to breakpoints


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
import os
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import evaluate

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [3]:
# Load only rows 2000 to 4999 (i.e., 3000 rows)
train_df = pd.read_csv("/content/train.csv", skiprows=range(1, 2000), nrows=3000)

# Validation and test datasets can be loaded normally
val_df = pd.read_csv("/content/validation.csv")
test_df = pd.read_csv("/content/test.csv")

In [4]:
# Assume columns: "article" (input) and "summary" (target). Adjust if different.
print("Train columns:", train_df.columns)
print("Validation columns:", val_df.columns)
print("Test columns:", test_df.columns)

train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)
test_ds  = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train":      train_ds,
    "validation": val_ds,
    "test":       test_ds
})


Train columns: Index(['id', 'article', 'highlights'], dtype='object')
Validation columns: Index(['id', 'article', 'highlights'], dtype='object')
Test columns: Index(['id', 'article', 'highlights'], dtype='object')


In [6]:
model_id = "nvidia/Llama-3.1-Minitron-4B-Width-Base"  # example smaller variant
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

max_input_length  = 1024
max_target_length = 256

In [7]:
def preprocess_fn(examples):
    inputs  = examples["article"]
    targets = examples["highlights"] # Corrected column name

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # Tokenize targets
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Set labels to input_ids and mask out padding from the original targets
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    # Find where the original labels were padded and set the corresponding labels to pad_token_id
    # This requires knowing the original length before padding for each example
    # A simpler approach for now is to set labels where the original tokenized labels were pad_token_id to pad_token_id
    # This might not be perfect if there are pad_token_id within the actual summary
    # A more robust way would be to calculate the actual length of each tokenized target before padding
    # For demonstration, we'll use a simpler masking based on the padded labels tensor
    # Let's re-tokenize targets without padding first to get the actual lengths
    target_lengths = [len(tokenizer(text_target=t, truncation=True).input_ids) for t in targets]

    for i, label_ids in enumerate(model_inputs["labels"]):
        # Ensure the label tensor is at least the length of the tokenized target
        if target_lengths[i] < len(label_ids):
             # Mask out tokens in the labels tensor that correspond to padding in the original target
             model_inputs["labels"][i][target_lengths[i]:] = [tokenizer.pad_token_id] * (len(label_ids) - target_lengths[i])


    return model_inputs

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

tokenized = dataset.map(
    preprocess_fn,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# ✅ Set model ID — use a smaller LLaMA 3.1 model like 8B
# model_id = "nvidia/Llama-3.1-Minitron-4B-Width-Base"

# # ✅ Load tokenizer and add pad token
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# tokenizer.pad_token = tokenizer.eos_token

# ✅ Quantization config for 4-bit
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# ✅ Load quantized model
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16
)

# ✅ Prepare for PEFT training (important!)
base_model = prepare_model_for_kbit_training(base_model)

# ✅ LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# ✅ Wrap with PEFT
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 5,767,168 || all params: 4,518,513,664 || trainable%: 0.1276


In [9]:
# from peft import LoraConfig, get_peft_model

# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.05,
#     bias="none"
# )

# model = get_peft_model(base_model, lora_config)
# model.print_trainable_parameters() # Add this line to check trainable parameters



trainable params: 5,767,168 || all params: 4,518,513,664 || trainable%: 0.1276


In [17]:
logging_steps = 100

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./llama3_1_summarization",
    eval_strategy="no",                    # or "epoch" if you want
    learning_rate=2e-5,
    per_device_train_batch_size=1,         # lower = less memory
    gradient_accumulation_steps=4,
    num_train_epochs=1,                    # keep low for testing
    logging_steps=10,
    save_total_limit=1,
    fp16=True,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    tokenizer=tokenizer
)

trainer.train()
trainer.save_model("./llama3_1_summarization_final")


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.
[34m[1mwandb[0m: Currently logged in as: [33mdivyanshsaxena1978[0m ([33mdivyanshsaxena1978-ai-aizonics[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,9.8244
20,3.4998
30,0.9879
40,0.4122
50,0.4591
60,0.2296
70,0.2277
80,0.1945
90,0.1669
100,0.1759


  return fn(*args, **kwargs)


In [17]:
# trainer.train()
# trainer.save_model("./llama3_1_summarization_final")



RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [17]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


In [19]:
# ✅ Make sure these are defined:
max_input_length = 512
max_target_length = 128
device = "cuda" if torch.cuda.is_available() else "cpu"

# ✅ Batch summarization function
def batch_summarize(texts, max_new_tokens=max_target_length, num_beams=4, batch_size=8): # Added batch_size parameter
    all_summaries = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]
        inputs = tokenizer(
            batch_texts, # Process a batch of texts
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_input_length
        ).input_ids.to(device)

        outputs = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True
        )

        all_summaries.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    return all_summaries

# ✅ Select test examples
test_subset = dataset["test"].select(range(100))  # smaller for quick testing
articles = list(test_subset["article"])
references = list(test_subset["highlights"])  # use "highlights" for cnn_dailymail

# ✅ Generate predictions
predictions = batch_summarize(articles, batch_size=4) # Reduced batch size for generation

# ✅ Evaluate


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  return fn(*args, **kwargs)
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results,

Downloading builder script: 0.00B [00:00, ?B/s]

ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score'] using 'pip install rouge_score' for instance'

In [22]:
import evaluate
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

rouge_results = rouge.compute(predictions=predictions, references=references)
bleu_results  = bleu.compute(predictions=predictions, references=[[r] for r in references])

print("ROUGE results:", rouge_results)
print("BLEU results:", bleu_results)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

ROUGE results: {'rouge1': np.float64(0.19318004643181502), 'rouge2': np.float64(0.10149186002072702), 'rougeL': np.float64(0.13800277173749875), 'rougeLsum': np.float64(0.16637629187048286)}
BLEU results: {'bleu': 0.045187133727583394, 'precisions': [0.10584291187739464, 0.0557609217474796, 0.032651588065447545, 0.021635311143270622], 'brevity_penalty': 1.0, 'length_ratio': 7.6976958525345625, 'translation_length': 41760, 'reference_length': 5425}


In [21]:
!pip install rouge_score



In [24]:
for i in range(5):
    print("=== Example", i, "===")
    print("ARTICLE:",    test_subset["article"][i][:500], "…")
    print("REFERENCE:",  test_subset["highlights"][i]) # Corrected column name
    print("PREDICTION:",  predictions[i])
    print("-----------------------------")

=== Example 0 ===
ARTICLE: Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by t …
REFERENCE: Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .
PREDICTION: Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that 

In [25]:
analysis = """
### Analysis of Summarization Quality

**Strengths:**
- The model frequently captures the main ideas of news articles (e.g., key facts, main actors, outcomes).
- It produces fluent abstracts and handles article‑to‑summary transformation well.
- Because we fine‑tuned on a large dataset of news, it adapts to summarization style in this domain.

**Limitations:**
- On very long articles (input truncated to max length) the model may miss sub‑themes or important details beyond the truncated part.
- Some hallucination or drift may occur (the model may introduce content not present in the source).
- The smaller variant (~4B) lacks depth compared to larger models — nuanced inference or subtle temporal reasoning may degrade.
- Evaluation metrics (ROUGE/BLEU) give an approximate view but do not capture coherence, factual correctness, or human‐readability fully.

**Observations from our Results:**
- ROUGE‑1: {rouge1:.2f}, ROUGE‑2: {rouge2:.2f}, ROUGE‑L: {rougeL:.2f} (placeholder — insert actual numbers)
- BLEU score: {bleu_score:.2f}
- Some predictions tend to summarise in generic style rather than emphasising novelty or surprising elements.
- When input article contains multiple sub‑stories, model sometimes collapses into a single thread and omits others.
- On metalanguage/news‑specific language (dates, names) occasionally the summary mis‑attributes or generalises.

**Conclusion:**
Using LLaMA 3.1 (~4B variant) for news summarization is feasible and gives respectable results given resource constraints. If maximum summarization performance is required (very long docs, high factual accuracy), one may need to use larger model variants (8B, 70B) or add techniques like retrieval, reranking, or post‐editing.

"""

print(analysis)



### Analysis of Summarization Quality

**Strengths:**
- The model frequently captures the main ideas of news articles (e.g., key facts, main actors, outcomes).
- It produces fluent abstracts and handles article‑to‑summary transformation well.
- Because we fine‑tuned on a large dataset of news, it adapts to summarization style in this domain.

**Limitations:**
- On very long articles (input truncated to max length) the model may miss sub‑themes or important details beyond the truncated part.
- Some hallucination or drift may occur (the model may introduce content not present in the source).
- The smaller variant (~4B) lacks depth compared to larger models — nuanced inference or subtle temporal reasoning may degrade.
- Evaluation metrics (ROUGE/BLEU) give an approximate view but do not capture coherence, factual correctness, or human‐readability fully.

**Observations from our Results:**
- ROUGE‑1: {rouge1:.2f}, ROUGE‑2: {rouge2:.2f}, ROUGE‑L: {rougeL:.2f} (placeholder — insert actual n

In [26]:
import os
import shutil
from google.colab import files

# Define the path to the saved model directory
model_save_path = "./llama3_1_summarization_final"
zip_path = "./llama3_1_summarization_final.zip"

# Create a zip archive of the saved model directory
shutil.make_archive(model_save_path, 'zip', model_save_path)

# Download the zip file
files.download(zip_path)

print(f"Model zipped and ready for download: {zip_path}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model zipped and ready for download: ./llama3_1_summarization_final.zip
