In [1]:
import json
from torch.utils.data import Dataset, Subset
from typing import Union
import os

max_dataset_size = 200000  # 你定义的最大数据量（可选：用于调试或限制内存）

class LCSTS(Dataset):
    def __init__(self, data_file, start_idx=0, end_idx=None):
        """
        初始化数据集
        :param data_file: JSON 文件路径，包含 list of dict，每个 dict 有 'title' 和 'content'
        :param start_idx: 起始索引（用于划分数据集）
        :param end_idx: 结束索引
        """
        self.data = self.load_data(data_file, start_idx, end_idx)
    
    def load_data(self, data_file, start_idx, end_idx):
        """
        加载数据并按索引切片
        """
        # 读取 JSON 文件
        if not os.path.exists(data_file):
            raise FileNotFoundError(f"{data_file} not found.")
        
        with open(data_file, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)  # 假设是 list of dict

        # 确保是列表格式
        if isinstance(raw_data, dict):
            # 有些数据可能包裹在 {'data': [...]} 中
            raw_data = raw_data.get('data', []) if 'data' in raw_data else []
        
        # 截取指定范围
        if end_idx is None:
            end_idx = len(raw_data)
        
        # 应用全局最大限制（可选）
        start_idx = min(start_idx, max_dataset_size)
        end_idx = min(end_idx, max_dataset_size, len(raw_data))
        
        # 切片
        selected_data = raw_data[start_idx:end_idx]
        
        # 转为列表，确保索引访问
        return selected_data
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


In [3]:
# 假设你的 JSON 文件路径
data_file = 'data/lcsts_data.json'

# 定义划分范围
train_dataset = LCSTS(data_file, start_idx=0,      end_idx=2000)
valid_dataset = LCSTS(data_file, start_idx=2000,  end_idx=3000)
test_dataset  = LCSTS(data_file, start_idx=3000,  end_idx=4000)  # 注意：你写的是 300000，可能是笔误

# 打印信息
print(f'Training set size: {len(train_dataset)}')   # 20000
print(f'Validation set size: {len(valid_dataset)}') # 10000
print(f'Testing set size: {len(test_dataset)}')     # 10000

# 示例：查看第一条训练数据
print("Sample from train set:")
print(next(iter(train_dataset)))

Training set size: 2000
Validation set size: 1000
Testing set size: 1000
Sample from train set:
{'title': '修改后的立法法全文公布', 'content': '新华社受权于18日全文播发修改后的《中华人民共和国立法法》，修改后的立法法分为“总则”“法律”“行政法规”“地方性法规、自治条例和单行条例、规章”“适用与备案审查”“附则”等6章，共计105条。'}


In [8]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer


max_input_length = 512
max_target_length = 64
model_checkpoint = "csebuetnlp/mT5_multilingual_XLSum"

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained("./models/")
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained("./models/", use_fast=False)

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample['content'])
        batch_targets.append(sample['title'])
    batch_data = tokenizer(
        batch_inputs, 
        padding=True, 
        max_length=max_input_length,
        truncation=True, 
        return_tensors="pt"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch_targets, 
            padding=True, 
            max_length=max_target_length,
            truncation=True, 
            return_tensors="pt"
        )["input_ids"]
        batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)
        end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
        for idx, end_idx in enumerate(end_token_index):
            labels[idx][end_idx+1:] = -100
        batch_data['labels'] = labels
    return batch_data


Using cuda device


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=4, shuffle=False, collate_fn=collote_fn)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collote_fn)

batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)


KeysView({'input_ids': tensor([[   381,    891,   2116,    838,    261,  76581,   1223,  19185,   5510,
         238814,  23418,  36444,  17933,  14841,   2619, 201521,   2037,  76581,
           1223,  19185,   5510, 238814,  23418, 148577,  19347,    493,  78727,
         124974, 184310,  51838,    306,  17933,  14841,   2619, 127646, 161296,
          42964,   2445,  33119,   1083,   5028,  24659, 127974,   1644,  17955,
          54093,   3582,   5435,    261,   5144,   2991,  17933,  14841,   2619,
           5705,   3480,  30765,   1543,  97806, 146315,  23818,    493, 128334,
            306,      1,      0,      0,      0,      0,      0,      0],
        [   259,  22014, 215154,    261,  76581,   4938,   3802,  32738,   5705,
          24673,   2372,  26979,   5510, 238814,   3017,  17823, 184310,    261,
           2037,    591,  76581,   8893,   8369, 102873,  24659,    365, 134986,
          49089,  15104,  24673,  42357,   1107,   4462, 177996,    261,   8349,
           2



In [10]:

from tqdm.auto import tqdm

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(dataloader)
    
    model.train()
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = batch_data.to(device)
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    return total_loss


In [11]:

import numpy as np
from rouge import Rouge

rouge = Rouge()

def test_loop(dataloader, model):
    preds, labels = [], []
    
    model.eval()
    for batch_data in tqdm(dataloader):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
                num_beams=4,
                no_repeat_ngram_size=2,
            ).cpu().numpy()
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [' '.join(pred.strip()) for pred in decoded_preds]
        labels += [' '.join(label.strip()) for label in decoded_labels]
    scores = rouge.get_scores(hyps=preds, refs=labels, avg=True)
    result = {key: value['f'] * 100 for key, value in scores.items()}
    result['avg'] = np.mean(list(result.values()))
    print(f"Rouge1: {result['rouge-1']:>0.2f} Rouge2: {result['rouge-2']:>0.2f} RougeL: {result['rouge-l']:>0.2f}\n")
    return result


In [12]:
# 评估下基准性能
test_loop(test_dataloader, model)

100%|██████████| 250/250 [03:37<00:00,  1.15it/s]

Rouge1: 26.07 Rouge2: 13.80 RougeL: 23.28






{'rouge-1': 26.06907931080389,
 'rouge-2': 13.800633335270968,
 'rouge-l': 23.278877761486054,
 'avg': np.float64(21.04953013585364)}

In [None]:

from torch.optim import AdamW
from transformers import get_scheduler

learning_rate = 2e-5
epoch_num = 2

optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

total_loss = 0.
best_avg_rouge = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_rouge = test_loop(valid_dataloader, model)
    print(valid_rouge)
    rouge_avg = valid_rouge['avg']
    if rouge_avg > best_avg_rouge:
        best_avg_rouge = rouge_avg
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_rouge_{rouge_avg:0.4f}_model_weights.bin')
print("Done!")



Epoch 1/2
-------------------------------


loss: 0.000000:   0%|          | 0/500 [00:00<?, ?it/s]