# Setup working directory

In [None]:
import os

if os.getcwd().split(os.sep)[-1] == 'notebooks':
    os.chdir('../')

# Install Dependencies

In [1]:
!pip install -q torch datasets nltk transformers

# Import Libraries

In [3]:
import torch
import datasets
from nltk.translate.bleu_score import corpus_bleu
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load Model

In [4]:
model_name = "danhtran2mind/viet-news-sum-mt5-small-finetune"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [6]:
# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo)

# Data Processing

In [5]:
# Load the dataset
dataset_name = "OpenHust/vietnamese-summarization"
dataset = datasets.load_dataset(dataset_name)

# Define a function to preprocess the dataset
def preprocess_function(examples):
    inputs = examples["Document"]
    targets = examples["Summary"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding in the loss
    model_inputs["labels"] = labels
    return model_inputs

# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Split the dataset into training and validation sets
tokenized_datasets = tokenized_datasets['train'].train_test_split(test_size=0.1)

# Rename the splits to 'train' and 'val'
tokenized_datasets['val'] = tokenized_datasets.pop('test')


README.md:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Kmeans_1024_new.csv:   0%|          | 0.00/32.3M [00:00<?, ?B/s]

Kmeans_512_new.csv:   0%|          | 0.00/25.0M [00:00<?, ?B/s]

Kmeans_512_token_new.csv:   0%|          | 0.00/29.1M [00:00<?, ?B/s]

bio_medicine.csv:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

herding_512_bio_medicine.csv:   0%|          | 0.00/25.1M [00:00<?, ?B/s]

herding_bio_medicine.csv:   0%|          | 0.00/32.3M [00:00<?, ?B/s]

herding_prompt_512_bio_medicine.csv:   0%|          | 0.00/29.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/74564 [00:00<?, ? examples/s]

# Bleu Score

## Define Inference functions

In [7]:
def preprocess_input(text):
    inputs = tokenizer(text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    # Move inputs to the same device as the model
    return {k: v.to(device) for k, v in inputs.items()}

# Define a function to generate the summary
def generate_summary(text):
    inputs = preprocess_input(text)
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            # min_length=50,
            # length_penalty=12.0, num_beams=4,
            early_stopping=True
        )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

## Generate Predictions

In [8]:
# Generate summaries for the validation set
val_predictions = []
val_references = []

for idx, example in enumerate(tokenized_datasets['val']):
    text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    summary = generate_summary(text)
    val_predictions.append(summary)
    # Filter out -100 (padding) tokens before decoding
    filtered_labels = [token_id for token_id in example['labels'] if token_id != -100]
    val_references.append(tokenizer.decode(filtered_labels, skip_special_tokens=True)) # Append the reference as a string




## Bleu Score calculation

In [9]:
def blue_score(hypotheses, references):
    # Ensure the hypotheses and references are lists of strings
    hypotheses = [h.split() for h in hypotheses]
    references = [[r.split()] for r in references] # Corrected this line

    # Calculate the BLEU score
    bleu_score = corpus_bleu(references, hypotheses)

    # return bleu.score
    return bleu_score

# Calculate the BLEU score
bleu = blue_score(val_predictions, val_references)
print(f"BLEU Score on Validation Set: {bleu}")

BLEU Score on Validation Set: 0.9964783232500736
