In [1]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
from transformers import (
    BartConfig,
    BartForConditionalGeneration,
    AdamW,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup
)
#from accelerate import Accelerator
import os
import math
import random
import numpy as np
from tqdm import tqdm
import time
import datetime
from data_process import text_preprocess, build_dataset
from config import config
import pickle
config = config()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [2]:
# accelerator = Accelerator(fp16 = True, cpu = False)
# device = accelerator.device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

def get_optim_shedu(named_parameters, total_steps, Hyparameters_config, use_scheduler=True):
    ignored_params = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_parameters = [
        {
            "params": [p for n,p in named_parameters if not any(i in n for i in ignored_params)],
        },
        {
            "params": [p for n,p in named_parameters if any(i in n for i in ignored_params)],
            "weight_decay": 0.0
        }
    ]
    
    optimizer = AdamW(optimizer_parameters, lr=Hyparameters_config['lr'], weight_decay=Hyparameters_config['weight_decay'])
    # optimizer = optim.Adam(optimizer_parameters, lr=Hyparameters_config['lr'], weight_decay=Hyparameters_config['weight_decay'])
    if use_scheduler:
        scheduler = get_cosine_schedule_with_warmup(
            optimizer = optimizer,
            num_warmup_steps=0.2*total_steps,
            num_training_steps=total_steps
        )
#         scheduler = optim.lr_scheduler.ReduceLROnPlateau(
#             optimizer,
#             mode='max',
#             factor=Hyparameters_config['lr_gamma'],
#             patience=Hyparameters_config['patience']
#         )
        return optimizer,scheduler
    else:
        return optimizer
    
def trainer(model, train_dataset, valid_dataset, num_epochs, Hyparameters_config):
    train_iter = DataLoader(train_dataset, Hyparameters_config['batch_size'], shuffle=True)
    valid_iter = DataLoader(valid_dataset, Hyparameters_config['batch_size'], shuffle=False)
    total_steps = len(train_iter)*num_epochs
    
    optimizer,scheduler = get_optim_shedu(model.named_parameters(), total_steps, Hyparameters_config, use_scheduler=True)
    
    model = model.to(device)
    
    model.train()
    for epoch in range(num_epochs):
        epoch_loss_train, epoch_loss_eval, epoch_acc_train, epoch_acc_eval = 0, 0, 0, 0
        start_time = time.time()
        
        print('lr: ',optimizer.state_dict()['param_groups'][0]['lr'])
        for inputs in tqdm(train_iter, desc="training for epoch {}: ".format(epoch+1)):
            optimizer.zero_grad()
            
            inputs_1,inputs_2 = {},{}
            for name in inputs.keys():
                inputs_1[name],inputs_2[name] = torch.chunk(inputs[name].long(), chunks=2, dim=1)
                inputs_1[name] = inputs_1[name].contiguous().to(device)
                inputs_2[name] = inputs_2[name].contiguous().to(device)
                
            loss = torch.tensor([0.0], device=device)
            outputs = torch.tensor([])
            for inputs_ in [inputs_1,inputs_2]:
                outputs_dict = model(**inputs_) # outputs: [batch, seq_len, vocab_size]
                loss,outputs = loss+outputs_dict['loss'], torch.cat((outputs, outputs_dict['logits'].cpu()), dim=1)
            batch_size,seq_len = outputs.size(0), outputs.size(1)
            
            batch_acc_train = torch.tensor([0.0])
            for predicted, target in zip(outputs, inputs['labels'].cpu()):
                batch_acc_train += (target == predicted.argmax(dim=1)).sum().item()/seq_len
                
            #Before = list(model.parameters())[1].clone() # 获取更新前模型的第0层权重
            loss.backward()
            
            # 梯度截断
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)

            # 参数更新
            optimizer.step()
            scheduler.step()
            
            epoch_loss_train += loss.item()
            epoch_acc_train += batch_acc_train.item()/batch_size
            #After = list(model.parameters())[1].clone()
            #print('encoder的第0层更新幅度：',torch.sum(After-Before))
            #print(epoch_loss_train,epoch_acc_train)
        
            del inputs, inputs_1, inputs_2, outputs_dict, loss, batch_acc_train
        
        # 参数保存
        if (epoch+1) % Hyparameters_config['save_state_epoch']==0:
            path = './save_models/'
            if not os.path.exists(path):
                os.makedirs(path)
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                },
                path + f'epoch_{epoch+1}.pkl'
            )
        
        # eval
        with torch.no_grad():
            for inputs in tqdm(valid_iter, desc="evaluating for epoch {}: ".format(epoch+1)):
                inputs_1,inputs_2 = {},{}
                for name in inputs.keys():
                    inputs_1[name],inputs_2[name] = torch.chunk(inputs[name].long(), chunks=2, dim=1)
                    inputs_1[name] = inputs_1[name].contiguous().to(device)
                    inputs_2[name] = inputs_2[name].contiguous().to(device)
                
                loss = torch.tensor([0.0])
                outputs = torch.tensor([])
                for inputs_ in [inputs_1,inputs_2]:
                    outputs_dict = model(**inputs_) # outputs: [batch, seq_len, vocab_size]
                    loss,outputs = loss+outputs_dict['loss'].item(), torch.cat((outputs, outputs_dict['logits'].cpu()), dim=1)
                batch_size,seq_len = outputs.size(0), outputs.size(1)
                
                batch_acc_eval = torch.tensor([0.0])
                for predicted,target in zip(outputs, inputs['labels'].cpu()):
                    batch_acc_eval += (target == predicted.argmax(dim=1)).sum().item()/seq_len
                
                epoch_loss_eval += loss.item()
                epoch_acc_eval += batch_acc_eval.item()/batch_size
        
        #scheduler.step(round(epoch_acc_eval/len(valid_iter),2))
        del inputs, inputs_1, inputs_2, outputs_dict, loss, batch_acc_eval
        torch.cuda.empty_cache()
        
        # 参数打印
        duration = str(datetime.timedelta(seconds=time.time() - start_time))[:7]
        print("Time: {} | Epoch: {}/{} | train_loss: {:.3} | train_acc: {:.3} | eval_loss: {:.3} | eval_acc: {:.3}".format(
            duration, epoch+1, num_epochs, epoch_loss_train/len(train_iter), epoch_acc_train/len(train_iter),
            epoch_loss_eval/len(valid_iter), epoch_acc_eval/len(valid_iter)))

device: cuda


In [3]:
try:
    with open(config.train_dataset, 'rb') as f:
        train_dataset = pickle.load(f)
    with open(config.valid_dataset, 'rb') as f:
        valid_dataset = pickle.load(f)  
except:
    # 文本清洗
    train_data = text_preprocess(config.train_path)
    valid_data = text_preprocess(config.valid_path)
    # 构建dataset
    train_dataset = build_dataset(train_data)
    valid_dataset = build_dataset(valid_data)
    with open(config.train_dataset, 'wb') as f:
        pickle.dump(train_dataset, f)
    with open(config.valid_dataset, 'wb') as f:
        pickle.dump(valid_dataset, f)

model = BartForConditionalGeneration.from_pretrained(config.model_name)

In [4]:
num_epochs = 8
Hyparameters_config = {
    'lr': 5e-6,
    'weight_decay': 2e-2,
    'batch_size': 8,
    'lr_gamma': 0.2,
    'patience': 2,
    'save_state_epoch': 1
}
#torch.cuda.empty_cache()
saved_model = torch.load("save_models/epoch_4.pkl", map_location=device)
model.load_state_dict(saved_model['model_state_dict'])
trainer(model, train_dataset, valid_dataset, num_epochs, Hyparameters_config)



lr:  0.0


training for epoch 1: 100%|██████████| 857/857 [10:36<00:00,  1.35it/s]
evaluating for epoch 1: 100%|██████████| 63/63 [00:39<00:00,  1.61it/s]


Time: 0:11:15 | Epoch: 1/8 | train_loss: 1.57 | train_acc: 0.801 | eval_loss: 3.01 | eval_acc: 0.66
lr:  3.125e-06


training for epoch 2: 100%|██████████| 857/857 [10:37<00:00,  1.35it/s]
evaluating for epoch 2: 100%|██████████| 63/63 [00:34<00:00,  1.80it/s]


Time: 0:11:14 | Epoch: 2/8 | train_loss: 1.55 | train_acc: 0.803 | eval_loss: 3.01 | eval_acc: 0.659
lr:  4.9519632010080765e-06


training for epoch 3: 100%|██████████| 857/857 [10:37<00:00,  1.34it/s]
evaluating for epoch 3: 100%|██████████| 63/63 [00:37<00:00,  1.66it/s]


Time: 0:11:15 | Epoch: 3/8 | train_loss: 1.53 | train_acc: 0.805 | eval_loss: 3.0 | eval_acc: 0.661
lr:  4.432526133406843e-06


training for epoch 4: 100%|██████████| 857/857 [10:28<00:00,  1.36it/s]
evaluating for epoch 4: 100%|██████████| 63/63 [00:36<00:00,  1.71it/s]


Time: 0:11:07 | Epoch: 4/8 | train_loss: 1.52 | train_acc: 0.806 | eval_loss: 3.01 | eval_acc: 0.66
lr:  3.4567085809127247e-06


training for epoch 5: 100%|██████████| 857/857 [10:25<00:00,  1.37it/s]
evaluating for epoch 5: 100%|██████████| 63/63 [00:34<00:00,  1.82it/s]


Time: 0:11:00 | Epoch: 5/8 | train_loss: 1.51 | train_acc: 0.807 | eval_loss: 3.01 | eval_acc: 0.66
lr:  2.2549571491760985e-06


training for epoch 6: 100%|██████████| 857/857 [10:37<00:00,  1.34it/s]
evaluating for epoch 6: 100%|██████████| 63/63 [00:37<00:00,  1.68it/s]


Time: 0:11:16 | Epoch: 6/8 | train_loss: 1.5 | train_acc: 0.808 | eval_loss: 3.02 | eval_acc: 0.66
lr:  1.1110744174509952e-06


training for epoch 7: 100%|██████████| 857/857 [10:37<00:00,  1.34it/s]
evaluating for epoch 7: 100%|██████████| 63/63 [00:35<00:00,  1.78it/s]


Time: 0:11:13 | Epoch: 7/8 | train_loss: 1.49 | train_acc: 0.809 | eval_loss: 3.02 | eval_acc: 0.66
lr:  2.9519683912911267e-07


training for epoch 8: 100%|██████████| 857/857 [10:36<00:00,  1.35it/s]
evaluating for epoch 8: 100%|██████████| 63/63 [00:36<00:00,  1.73it/s]


Time: 0:11:14 | Epoch: 8/8 | train_loss: 1.49 | train_acc: 0.809 | eval_loss: 3.01 | eval_acc: 0.661


In [None]:
text = "钛媒体9月23日消息，今日澳门面向内地居民旅游签注全面开放，携程数据显示澳门各类旅游产品搜索量从22日起开始暴增，最高涨幅500%。预计国庆期间，澳门或将迎来旅游小高峰。"
article_input_ids = tokenizer.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids']
summary_ids = model.generate(article_input_ids, num_beams=4, length_penalty=2.0, max_length=142, no_repeat_ngram_size=3)
summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
print(summary_txt)