In [12]:
import os

import numpy as np
import openpyxl
import pandas as pd
import torch
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BartForConditionalGeneration,
    BartTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)

### BARTpho summarization fine-tuning

In [22]:
# data1 = pd.read_excel('data_summerize (1).xlsx')
# data2 = pd.read_excel('data_summerize.xlsx') 

# combined_data = pd.concat([data1, data2], ignore_index=True)
# combined_data.dropna(inplace=True)
# combined_data['summarization'] = combined_data['summarization'].str.lower()

# features = combined_data['content']
# labels = combined_data['summarization']

# train_content, test_content, train_summary, test_summary = train_test_split(features, labels, test_size=0.2, random_state=42)
# val_content, test_content, val_summary, test_summary = train_test_split(test_content, test_summary, test_size=0.5, random_state=42)

# train_df = pd.DataFrame({'document': train_content, 'summary': train_summary})
# val_df = pd.DataFrame({'document': val_content, 'summary': val_summary})
# test_df = pd.DataFrame({'document': test_content, 'summary': test_summary})

# train_df.to_csv('data/train.csv', index=False)
# val_df.to_csv('data/val.csv', index=False)
# test_df.to_csv('data/test.csv', index=False)


In [None]:
# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint.
MODEL_CHECKPOINT = "vinai/bartpho-syllable"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Map each split name to its CSV file and load every split in one call.
data_files = dict(
    train='data/train.csv',
    validation='data/val.csv',
    test='data/test.csv',
)
dataset = load_dataset(path="csv", data_files=data_files)

In [None]:
def tokenize(batch, max_input_length=1024, max_target_length=1024):
    """Tokenize a batch of documents together with their reference summaries.

    Sequences are truncated but intentionally left unpadded. The
    ``DataCollatorForSeq2Seq`` built for training pads every mini-batch to its
    own longest sequence and fills label padding with -100, so padded label
    positions do not contribute to the loss. Padding to ``max_length`` here
    would instead store real pad-token ids in ``labels`` and make every
    example the maximum length.

    Args:
        batch: Mapping with a ``'document'`` and a ``'summary'`` entry, each a
            list of strings (the layout ``Dataset.map(..., batched=True)``
            passes in).
        max_input_length: Truncation limit, in tokens, for the documents.
        max_target_length: Truncation limit, in tokens, for the summaries.

    Returns:
        The tokenizer output for the documents with an added ``'labels'``
        entry holding the token ids of the summaries.
    """
    inputs = tokenizer(batch['document'], max_length=max_input_length, truncation=True)
    targets = tokenizer(batch['summary'], max_length=max_target_length, truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs
# Tokenize every split in batches.
tokenized_data = dataset.map(function=tokenize, batched=True)

# Collator that assembles seq2seq mini-batches as PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="pt",
)


In [None]:
def evaluate(preds, targets):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for decoded summaries.

    Args:
        preds: List of predicted summary strings.
        targets: List of reference summary strings, aligned with ``preds``.

    Returns:
        Dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'``, each the
        ``mid.fmeasure`` of the corresponding aggregated score.
    """
    # `load_metric` was never imported at file level, so the original call
    # raised NameError. Imported here so a missing/incompatible `datasets`
    # only affects evaluation rather than the whole import cell.
    # NOTE(review): newer `datasets` releases dropped `load_metric` in favour
    # of the separate `evaluate` package — confirm the installed version.
    from datasets import load_metric

    rouge = load_metric("rouge")
    scores = rouge.compute(predictions=preds, references=targets)
    return {
        "rouge1": scores["rouge1"].mid.fmeasure,
        "rouge2": scores["rouge2"].mid.fmeasure,
        "rougeL": scores["rougeL"].mid.fmeasure
    }

def metrics_func(eval_arg):
    """Decode generated and reference token ids and score them with ROUGE.

    Intended as the ``compute_metrics`` callback of the trainer.

    Args:
        eval_arg: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        The dict produced by ``evaluate`` for the decoded strings.
    """
    preds, labels = eval_arg
    # Predictions can arrive wrapped in a tuple; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 marks ignored positions and is not a valid token id, so swap it for
    # the pad token before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return evaluate(decoded_preds, decoded_labels)

def preprocess_function(examples):
    """Tokenize documents and summaries to fixed lengths for seq2seq training.

    Documents are truncated/padded to 512 tokens and summaries to 128 tokens.
    Pad positions in the labels are replaced with -100 so they are ignored by
    the loss; leaving the raw pad ids in place would train the model on
    padding.

    Args:
        examples: Mapping with a ``'document'`` and a ``'summary'`` entry,
            each a list of strings.

    Returns:
        The tokenizer output for the documents with an added ``'labels'``
        entry containing the masked summary token ids.
    """
    model_inputs = tokenizer(
        examples["document"], max_length=512, truncation=True, padding="max_length"
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["summary"], max_length=128, truncation=True, padding="max_length"
    )
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


In [None]:
# Fine-tuning configuration: checkpoints and evaluation happen once per epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="./financial_summarization",
    run_name="financial_summarization_run",
    num_train_epochs=3,
    learning_rate=3e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    # NOTE(review): recent transformers releases rename this to `eval_strategy`
    # — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    # Evaluate on generated token ids so the ROUGE callback can decode them.
    predict_with_generate=True,
    generation_max_length=1024,
    # Mixed precision needs a CUDA device; fall back to fp32 otherwise
    # instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    save_total_limit=3,
    # `save_steps` was dropped: it has no effect with save_strategy="epoch".
    logging_steps=100
)

# Seq2SeqTrainer (not the plain Trainer) is what honours
# `predict_with_generate`; the plain Trainer would pass logits to
# `compute_metrics` instead of generated token ids.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics_func
)

In [None]:
# Run fine-tuning.
trainer.train()

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("bartpho")