# Install Dependencies

In [1]:
!pip install -q datasets

# Import Libraries

In [3]:
import torch
import datasets
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load Base model

In [None]:
# Base checkpoint that will be fine-tuned.
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the pretrained weights a single time. The previous version instantiated
# the whole model once just to read its config and then a second time to get
# the model, doubling load time and peak memory for no benefit.
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Kept as a module-level name for any later cell that refers to `config`.
config = model.config

# Processing Dataset

## Split Dataset

In [None]:
# Load the summarization dataset by name via `datasets.load_dataset`.
# The preprocessing code in this cell reads its "Document" and "Summary"
# columns and its "train" split.
dataset_name = "OpenHust/vietnamese-summarization"
dataset = datasets.load_dataset(dataset_name)

# Define a function to preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Reads the "Document" and "Summary" columns of ``examples``, tokenizes
    documents to at most 512 tokens and summaries to at most 128 tokens
    (both truncated and padded to those lengths), and returns the document
    encoding with an added "labels" entry holding the summary token ids.
    """
    encoded = tokenizer(
        examples["Document"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples["Summary"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ).input_ids

    # Padding positions are set to -100 so they are ignored in the loss.
    is_padding = target_ids == tokenizer.pad_token_id
    target_ids[is_padding] = -100

    encoded["labels"] = target_ids
    return encoded

# Tokenize only the "train" split: it is the only split used below, so there
# is no reason to tokenize (and then discard) any other split. The original
# text columns are dropped so only the model inputs and labels remain.
tokenized_train = dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Hold out 10% of the examples for validation. A fixed seed makes the split
# reproducible across runs, so validation losses are comparable.
tokenized_datasets = tokenized_train.train_test_split(test_size=0.1, seed=42)

# `train_test_split` names the held-out part "test"; expose it as "val".
tokenized_datasets['val'] = tokenized_datasets.pop('test')


## Save Raw Dataset

In [None]:
# Local directory where the raw (untokenized) dataset is stored
local_dataset_dir = "dataset/raw_dataset"

# Save the raw dataset to the local directory
dataset.save_to_disk(local_dataset_dir)


## Save Tokenized Dataset

In [None]:
import os

# Despite the variable and folder name, this directory holds the tokenized
# train/val dataset splits, not a tokenizer.
local_dataset_tokenizer_dir = "dataset/tokenizers"
os.makedirs(local_dataset_tokenizer_dir, exist_ok=True)

# Save the tokenized train/val splits to that directory
tokenized_datasets.save_to_disk(local_dataset_tokenizer_dir)

## Load Raw Dataset

In [None]:
# Reload the raw dataset from the local directory. Use the module-level
# `datasets.load_from_disk` rather than calling it through the `dataset`
# variable, so this cell also works in a fresh session where `dataset` has
# not been created yet.
dataset = datasets.load_from_disk("dataset/raw_dataset")

print("Dataset loaded from local directory")

## Load Tokenized Dataset

In [5]:
# Reload the tokenized train/val splits from "dataset/tokenizers"
# (the folder stores tokenized data, not a tokenizer).
tokenized_datasets = datasets.load_from_disk("dataset/tokenizers")

# Training Process

## Define Hyperparameters

In [6]:
# Directory where checkpoints are written during training
model_path = "viet-news-sum-mt5-small-finetune"
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    evaluation_strategy="epoch",  # evaluate on the validation split once per epoch
    learning_rate=3e-4,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,  # total number of training epochs
    weight_decay=0.01,
    save_total_limit=1,  # keep only one checkpoint on disk
    save_steps=500,
    logging_steps=100,
    predict_with_generate=True,
    # Mixed precision is left disabled.
    # NOTE(review): presumably because fp16 is unstable with this model - confirm.
    # fp16=True if torch.cuda.is_available() else False,
    report_to="none" # Disable wandb reporting
)



## Train

In [7]:
import os
# Disable Weights & Biases through its environment variable as well.
os.environ["WANDB_DISABLED"] = "true"

# Define the trainer: fine-tune `model` on the tokenized "train" split and
# evaluate on the "val" split, using the settings in `training_args`.
# NOTE(review): newer transformers releases appear to warn about the
# `tokenizer=` argument - confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0785,0.027335
2,0.074,0.023692
3,0.0662,0.019857
4,0.0608,0.015207
5,0.0539,0.012891
6,0.0526,0.010073
7,0.0464,0.009463
8,0.0461,0.007866
9,0.0446,0.00686
10,0.0523,0.006372


TrainOutput(global_step=55930, training_loss=0.056790818105374, metrics={'train_runtime': 30781.5169, 'train_samples_per_second': 21.801, 'train_steps_per_second': 1.817, 'total_flos': 3.548280550588416e+17, 'train_loss': 0.056790818105374, 'epoch': 10.0})

# Save Finetune Model

In [9]:
# Save the fine-tuned model and its tokenizer side by side in one directory
# so that both can later be restored from that single path.
model_path = "viet-news-sum-mt5-small-finetune"

# Save the tokenizer
tokenizer.save_pretrained(model_path)

# Save the model
model.save_pretrained(model_path)

print(f"Model and tokenizer saved to {model_path}")

Model and tokenizer saved to ./viet-sum-mt5-small-finetune


# Inference Process

In [None]:
# Load the fine-tuned model and tokenizer from the directory they were saved
# to; these replace the module-level `tokenizer` and `model` used for inference.
model_path = "viet-news-sum-mt5-small-finetune"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

def preprocess_input(text):
    """Tokenize ``text`` for inference.

    The text is truncated and padded to 512 tokens and returned as PyTorch
    tensors, using the same document settings as the training preprocessing.
    """
    return tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

# Define a function to generate the summary
def generate_summary(text, max_length=128, num_beams=1):
    """Generate a summary of ``text`` with the loaded model.

    Args:
        text: The document to summarize.
        max_length: Maximum length, in tokens, of the generated summary.
        num_beams: Number of beams. The default of 1 keeps the model's
            default decoding, exactly as before; values above 1 enable
            beam search with early stopping.

    Returns:
        The decoded summary string with special tokens removed.
    """
    inputs = preprocess_input(text)

    # Put the inputs on the same device as the model; otherwise generation
    # fails whenever the model lives on a GPU (e.g. right after training).
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    generation_kwargs = {"max_length": max_length}
    if num_beams > 1:
        # `early_stopping` only applies to beam-based decoding, so it is passed
        # only when beam search is requested. Passing it with a single beam had
        # no effect and only triggered a warning.
        generation_kwargs["num_beams"] = num_beams
        generation_kwargs["early_stopping"] = True

    with torch.no_grad():
        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            **generation_kwargs,
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Sample Vietnamese news article used to try out the summarizer. The text is
# stored as proper UTF-8 Vietnamese (it had been corrupted into mojibake by a
# wrong-encoding round trip).
input_text = """
Vào ngày 8-1, khoa gây mê hồi sức Bệnh viện Đa khoa Đức Giang tiếp nhận bệnh nhân L.T.N.T. (23 tuổi, Chương Mỹ, Hà Nội) trong tình trạng hôn mê sau tai nạn giao thông.

Thai phụ mang thai 26 tuần bị viêm phổi, chấn thương sọ não nghiêm trọng với xuất huyết dưới nhện và tụ máu dưới màng cứng trán phải.

Theo bác sĩ Lê Nguyễn An - trưởng khoa gây mê hồi sức Bệnh viện Đa khoa Đức Giang, vấn đề thách thức trong quá trình điều trị với bệnh nhân này là việc cần phải đảm bảo sức khỏe cho cả mẹ và con là rất khó khăn.

"Các bác sĩ cố gắng duy trì tuổi thai ngoài 30 tuần để đảm bảo việc khi sinh ra trẻ có thể phát triển bình thường. Việc đảm bảo an toàn tính mạng cho mẹ cũng phải cân đối phù hợp, hạn chế tối thiểu việc ảnh hưởng tới thai nhi", bác sĩ An nói.

Trong suốt quá trình điều trị, các bác sĩ liên tục phối hợp với chuyên khoa sản và dinh dưỡng để đánh giá và điều chỉnh liên tục cho người bệnh để đảm bảo sự phát triển của em bé trong bụng mẹ.

Đặc biệt việc chăm sóc người bệnh ở trạng thái hôn mê, thở qua mở khí quản rất khó khăn, nhiều nguy cơ rủi ro về tình trạng nhiễm khuẩn, thiếu hụt dinh dưỡng, loét trợt điểm tì đè, nguy cơ suy thai".

Sau 70 ngày điều trị, tình trạng của sản phụ dần ổn định. Các chỉ số sinh tồn cải thiện, bệnh nhân tự thở qua mở khí quản, thai phát triển bình thường.

Tối 15-3, sản phụ có dấu hiệu chuyển dạ, thai 36 tuần (theo dự kiến sinh), ngôi ngược, ối vỡ sớm. Đội ngũ bác sĩ quyết định mổ lấy thai.

Ca phẫu thuật thành công, một bé trai nặng 2kg chào đời khóc to, niêm mạc hồng hào trong niềm hạnh phúc vô bờ của đội ngũ y bác sĩ và gia đình.

Ba ngày sau mổ, sản phụ tỉnh táo, tự ăn uống, được rút mở khí quản. Dự kiến cả mẹ và bé xuất viện trong ngày 21-3.
"""

# Flatten the article onto one line. Newlines and runs of whitespace are
# replaced by single spaces; simply deleting the newlines would glue the last
# word of each paragraph to the first word of the next.
input_text = " ".join(input_text.split())

summary = generate_summary(input_text)
print(f"Summary: {summary}")



Summary: Đến khoa gây mê hồi sức Bệnh viện Đức Giang , Hà Nội vừa tiếp nhận một nữ thai phụ mang thai 26 tuần bị viêm phổi , chấn thương sọ não nghiêm trọng với xuất huyết dưới nhện .
