<a href="https://colab.research.google.com/github/evinracher/3008410-intelligent-systems/blob/main/week2/exercise1/Post_Training_Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
# Reinstall transformers/tokenizers through the kernel's own interpreter
# (sys.executable) so the pinned 4.40.2 release is what this notebook imports.
# NOTE: the captured pip output reports a conflict with sentence-transformers,
# which requires transformers>=4.41.0.
!{sys.executable} -m pip uninstall -y transformers tokenizers
!{sys.executable} -m pip install "transformers==4.40.2" "accelerate>=0.21.0" datasets evaluate sentencepiece sacrebleu

# Remove any existing peft installation and pin it to 0.10.0.
!{sys.executable} -m pip uninstall -y peft
!{sys.executable} -m pip install "peft==0.10.0"



Found existing installation: transformers 4.40.2
Uninstalling transformers-4.40.2:
  Successfully uninstalled transformers-4.40.2
Found existing installation: tokenizers 0.19.1
Uninstalling tokenizers-0.19.1:
  Successfully uninstalled tokenizers-0.19.1
Collecting transformers==4.40.2
  Using cached transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.40.2-py3-none-any.whl (9.0 MB)
Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.2.3 requires transformers<6.0.0,>=4.41.0, but you have

In [2]:
import transformers
# Confirm which transformers version the kernel actually imports after the reinstall.
print(transformers.__version__)


4.40.2


In [3]:
from transformers import Seq2SeqTrainer
# Smoke test: reaching this line means the import above succeeded.
print("✅ Seq2SeqTrainer import OK")


✅ Seq2SeqTrainer import OK


In [None]:
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from datasets import Dataset
import evaluate
import numpy as np
import os
import pandas as pd

# --- Load dataset ---
folder_path = "/content"
dataset_name = "eng_small.csv"
path = os.path.join(folder_path, dataset_name)
data = Dataset.from_csv(path, encoding='utf-8')
# Hold out 10% for evaluation. The shuffle seed is fixed so that every
# Restart & Run All trains and evaluates on the same examples; without it the
# BLEU numbers are not comparable between runs.
data = data.train_test_split(test_size=0.1, seed=42)

# --- Tokenizer & Model ---
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")  # PyTorch version

# --- Preprocessing ---
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate: "


def preprocess_function(examples):
    """Tokenize a batch of source/target sentence pairs.

    Args:
        examples: batch mapping with an "engl" column (source sentences) and a
            "spa" column (target sentences).

    Returns:
        The tokenizer output for the prefixed sources, with the target token
        ids stored under "labels".
    """
    prompts = []
    for sentence in examples["engl"]:
        prompts.append(prefix + sentence)

    # Sources are truncated to 512 tokens, targets to 128.
    model_inputs = tokenizer(prompts, max_length=512, truncation=True)
    target_encoding = tokenizer(
        text_target=examples["spa"],
        max_length=128,
        truncation=True,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# Tokenize both splits batch-wise and drop the raw text columns so that only
# the fields produced by preprocess_function remain.
tokenized_data = data.map(preprocess_function, batched=True, remove_columns=["engl", "spa"])

# --- Data Collator ---
# Collator handed to the trainer; it is configured with the same tokenizer and
# model used everywhere else in this cell.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# --- Metrics (BLEU) ---
# BLEU implementation from the `evaluate` library, used by compute_metrics.
bleu = evaluate.load("bleu")

def compute_metrics(eval_preds):
    """Compute corpus-level BLEU for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair. ``predictions`` is either an
            array of token ids, a 3-D array of logits (reduced with argmax over
            the last axis), or a tuple whose first element is one of those.
            ``labels`` is an array of token ids in which -100 marks positions
            to ignore.

    Returns:
        dict: ``{"bleu": score}``. The score is ``0.0`` when no pair with both
        a non-empty prediction and a non-empty reference remains.
    """
    preds, labels = eval_preds

    # Unwrap tuple outputs BEFORE touching array attributes. Previously
    # ``preds.shape`` was read first, so a tuple raised AttributeError and this
    # branch could never run.
    if isinstance(preds, tuple):
        preds = preds[0]

    print("Preds shape:", preds.shape)
    print("Labels shape:", labels.shape)

    # Logits -> token ids.
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # -100 is not a valid token id; map it to the pad token so decoding works.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Deliberate best-effort: if the predictions cannot be decoded, report the
    # error and fall through to the "no valid predictions" path instead of
    # aborting the whole training run.
    try:
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    except Exception as e:
        print("Decoding error:", e)
        decoded_preds = [""] * len(preds)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Remove empty predictions or references
    filtered_preds = []
    filtered_refs = []
    for pred, ref in zip(decoded_preds, decoded_labels):
        if pred.strip() and ref.strip():
            filtered_preds.append(pred)
            filtered_refs.append([ref])  # BLEU expects a list of references per prediction

    if not filtered_preds:
        print("No valid predictions to evaluate.")
        return {"bleu": 0.0}

    result = bleu.compute(predictions=filtered_preds, references=filtered_refs)
    return {"bleu": result["bleu"]}


# --- Training arguments ---
# Hyperparameters and bookkeeping options for the Seq2SeqTrainer built below.
training_args = Seq2SeqTrainingArguments(
    # Directory the trainer writes its checkpoints to.
    output_dir="./results",
    # Run evaluation at the end of every epoch.
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    # Cap on the number of checkpoints kept in output_dir.
    save_total_limit=1,
    num_train_epochs=3,
    # Evaluate with generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    generation_max_length=128,  # ← Added to avoid generation warnings
    logging_dir="./logs",
    # Do not report to any experiment-tracking integration.
    report_to="none"
)

# --- Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# --- Train ---
train_output = trainer.train()

# --- Save ---
# The quantization, size-comparison, inference and FP16 cells all load from
# "./results/trained_model", but nothing wrote that directory before. Persist
# the fine-tuned model there (the trainer also saves the tokenizer it was given).
trainer.save_model("./results/trained_model")

# Keep the training summary as the cell's displayed result.
train_output


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/18096 [00:00<?, ? examples/s]

Map:   0%|          | 0/2011 [00:00<?, ? examples/s]

  super().__init__(loader)


Epoch,Training Loss,Validation Loss


In [None]:
import os

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model
model_fp32 = AutoModelForSeq2SeqLM.from_pretrained("./results/trained_model")
tokenizer = AutoTokenizer.from_pretrained("./results/trained_model")

# Apply dynamic quantization to every nn.Linear layer (int8 weights)
model_quantized = torch.quantization.quantize_dynamic(
    model_fp32, {torch.nn.Linear}, dtype=torch.qint8
)

# torch.save does not create missing parent directories, so make sure the
# target directory exists before writing into it.
quantized_dir = "./results/quantized_model"
os.makedirs(quantized_dir, exist_ok=True)

# Save quantized model state_dict
torch.save(
    model_quantized.state_dict(),
    os.path.join(quantized_dir, "quantized_model_state_dict.pt"),
)

# Save model config and tokenizer for later reload
model_quantized.config.save_pretrained(quantized_dir)
tokenizer.save_pretrained(quantized_dir)

print("✅ Quantized model and tokenizer saved.")


Compare the size of the models

In [None]:
import os

def get_file_size(path):
    """Print and return the size of the file at ``path`` in megabytes (MiB).

    Args:
        path: location of an existing file.

    Returns:
        float: file size in bytes divided by 1024**2.
    """
    bytes_per_mb = 1024 ** 2
    size = os.stat(path).st_size / bytes_per_mb
    print(f"{path} size: {size:.2f} MB")
    return size

# Original model weights (single safetensors file in the trained-model directory)
original_model_path = "./results/trained_model/model.safetensors"
# Must be the file the quantization cell writes with torch.save; the previous
# name "quantized_model.pt" is never created, so the lookup failed with
# FileNotFoundError.
quantized_model_path = "./results/quantized_model/quantized_model_state_dict.pt"

get_file_size(original_model_path)
get_file_size(quantized_model_path)


Compare inference outputs and latency

In [None]:
import torch
import time
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Both models are compared on the CPU.
device = torch.device("cpu")

# Load model and tokenizer
model_fp32 = AutoModelForSeq2SeqLM.from_pretrained("./results/trained_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("./results/trained_model")
# Switch to inference mode before quantizing and timing.
model_fp32.eval()

# Quantize dynamically in-memory (nn.Linear layers -> int8); nothing is read
# from or written to ./results/quantized_model here.
model_int8 = torch.quantization.quantize_dynamic(
    model_fp32, {torch.nn.Linear}, dtype=torch.qint8
)
model_int8.eval()

# Inference input: one sentence, tokenized once and shared by both models.
text = "it is good to have friends"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Inference function
def run_inference(model, label):
    """Generate output for the module-level ``inputs`` and time the call.

    Args:
        model: object exposing ``generate(**inputs, max_length=...)``.
        label: name printed in front of the timing line.

    Returns:
        tuple: ``(decoded_text, elapsed_seconds)`` for a single generate call.
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-measurement.
    start = time.perf_counter()
    with torch.no_grad():
        output = model.generate(**inputs, max_length=28)
    elapsed = time.perf_counter() - start
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"{label} Time: {elapsed:.3f}s | Output: {decoded}")
    return decoded, elapsed

print("\n Comparing Inference...")
out_fp32, t_fp32 = run_inference(model_fp32, "FP32")
out_int8, t_int8 = run_inference(model_int8, "INT8 Quantized")

# Summary: do both models produce the same text, and how long did each take?
if out_fp32 == out_int8:
    print("\n Output Match")
else:
    print("\n Output Slightly Differ")
print(f"\n FP32 Time: {t_fp32:.3f}s | INT8 Time: {t_int8:.3f}s")


## Exercise

- Quantize the model to fp16 and write the new size of the model

In [None]:
import os
from pathlib import Path
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def get_dir_size_mb(dir_path: str) -> float:
    """Return the combined size of all files under ``dir_path`` in megabytes.

    The directory is walked recursively; only regular files contribute.

    Args:
        dir_path: directory to measure.

    Returns:
        float: total bytes divided by 1024 * 1024.
    """
    root = Path(dir_path)
    byte_count = sum(
        entry.stat().st_size for entry in root.rglob("*") if entry.is_file()
    )
    return byte_count / (1024 * 1024)

# Load fine-tuned model (FP32)
model_fp32 = AutoModelForSeq2SeqLM.from_pretrained("./results/trained_model")
tokenizer = AutoTokenizer.from_pretrained("./results/trained_model")

# Convert to FP16
# NOTE(review): torch's nn.Module.half() is documented to cast in place and
# return the same module, so model_fp32 should no longer hold FP32 weights
# after this line — confirm, and reload before reusing it as an FP32 baseline.
model_fp16 = model_fp32.half()

# Save FP16 model
fp16_dir = "./results/fp16_model"
os.makedirs(fp16_dir, exist_ok=True)
model_fp16.save_pretrained(fp16_dir)
tokenizer.save_pretrained(fp16_dir)

# Measure sizes
# get_dir_size_mb sums every file in each directory, so tokenizer and config
# files are included in both totals, not only the weight files.
fp32_size_mb = get_dir_size_mb("./results/trained_model")
fp16_size_mb = get_dir_size_mb(fp16_dir)

print(f"FP32 model size: {fp32_size_mb:.2f} MB")
print(f"FP16 model size: {fp16_size_mb:.2f} MB")
# Relative saving of the FP16 directory versus the FP32 directory, in percent.
print(f"Reduction: {((1 - fp16_size_mb/fp32_size_mb) * 100):.1f}%")
