In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd

In [4]:
# Bước 1: Tên mô hình
model_name = "VietAI/vit5-base-vietnews-summarization"

# Bước 2: Tải tokenizer và mô hình
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Bước 3: Load dữ liệu CSV
dataset = load_dataset("csv", data_files="./data/data_2000.csv")

# Bước 4: Tách train/test
dataset = dataset["train"].train_test_split(test_size=0.2)

# Bước 5: Hàm tiền xử lý
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["Contents"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    labels = tokenizer(examples["Summary"], max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Bước 6: Áp dụng tiền xử lý
tokenized_dataset = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./vit5-summary-output",
    per_device_train_batch_size=1,        # Giảm batch size
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    do_train=True,
    do_eval=True,
    no_cuda=True                          # DÙNG CPU
)


# Bước 8: Huấn luyện
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)

trainer.train()

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.7034
1000,0.4424
1500,0.4177
2000,0.2832
2500,0.2628
3000,0.2614
3500,0.2029
4000,0.1461
4500,0.1512


TrainOutput(global_step=4800, training_loss=0.30859177907307944, metrics={'train_runtime': 39409.6822, 'train_samples_per_second': 0.122, 'train_steps_per_second': 0.122, 'total_flos': 5845995749376000.0, 'train_loss': 0.30859177907307944, 'epoch': 3.0})

In [8]:
# Bước 9: Lưu mô hình
model.save_pretrained("./model-summary")
tokenizer.save_pretrained("./model-summary")

('./model-summary\\tokenizer_config.json',
 './model-summary\\special_tokens_map.json',
 './model-summary\\spiece.model',
 './model-summary\\added_tokens.json')

In [None]:
from huggingface_hub import login

# login(token="")
# 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

In [12]:
from huggingface_hub import whoami

print(whoami()['name'])  # Trả về username của bạn


duonggbill


In [17]:
from huggingface_hub import HfApi

api = HfApi()

api.create_repo(repo_id="duonggbill/dbill-model-summary", private=False)


RepoUrl('https://huggingface.co/duonggbill/dbill-model-summary', endpoint='https://huggingface.co', repo_type='model', repo_id='duonggbill/dbill-model-summary')

In [18]:
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./model-summary",           # Thư mục chứa model và tokenizer
    path_in_repo="",                         # Gốc repo
    repo_id="duonggbill/dbill-model-summary",   # Repo bạn đã tạo hoặc muốn tạo
    repo_type="model"                        # Kiểu repo
)


model.safetensors:   0%|          | 0.00/904M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/duonggbill/dbill-model-summary/commit/dac8cbb66d33dfc56d659eaad16a2dea05df8803', commit_message='Upload folder using huggingface_hub', commit_description='', oid='dac8cbb66d33dfc56d659eaad16a2dea05df8803', pr_url=None, repo_url=RepoUrl('https://huggingface.co/duonggbill/dbill-model-summary', endpoint='https://huggingface.co', repo_type='model', repo_id='duonggbill/dbill-model-summary'), pr_revision=None, pr_num=None)

In [15]:
git --version



NameError: name 'git' is not defined