In [1]:
# 基于标注结果，训练任务模型

In [None]:
import os, json
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset to train on; used as the folder name under exp_dataset/.
DATASET = 'tnews'

# Annotator whose automatic labels are used. One of:
#   chatglm2-6b, qwen-7b-chat, siamese_uninlu, paddle_nlp,
#   ensemble_MajorityVote, ensemble_LabelModel
MODEL_NAME = 'paddle_nlp'

# Which labels feed training and validation:
#   1 - train and val both use the automatically annotated results
#   2 - train uses automatic annotations, val uses clean data
#   3 - train and val both use clean data
TRAIN_TYPE = 1

In [None]:
# Load the train / val / test splits.
#
# TRAIN_TYPE decides whether train and val come from the annotator's output
# files (result_<MODEL_NAME>_<DATASET>_<split>.json) or from the clean files
# (<split>.json). The test split always comes from the clean test.json.
data_dir = f'exp_dataset/{DATASET}'
auto_label_prefix = f'{data_dir}/result_{MODEL_NAME}_{DATASET}'

# TRAIN_TYPE -> (train file, val file)
split_files = {
    1: (f'{auto_label_prefix}_train.json', f'{auto_label_prefix}_val.json'),
    2: (f'{auto_label_prefix}_train.json', f'{data_dir}/val.json'),
    3: (f'{data_dir}/train.json', f'{data_dir}/val.json'),
}

# Fail here rather than later with a NameError (or with stale datasets left
# over from a previous run of this cell) when TRAIN_TYPE is not recognised.
if TRAIN_TYPE not in split_files:
    raise ValueError(
        f'Unsupported TRAIN_TYPE={TRAIN_TYPE!r}; expected one of {sorted(split_files)}')

train_file, val_file = split_files[TRAIN_TYPE]
train = MsDataset.load(train_file)
val = MsDataset.load(val_file)
test = MsDataset.load(f'{data_dir}/test.json')
print('数据集加载完成')


In [None]:
# Train the model.

# Hyperparameters shared by every run.
max_epoch, batch_size = 20, 32
optimizer_name = 'Adam'

# Learning rates to sweep; the loop below trains one model per entry.
lr_list = [1e-5, 2e-5, 3e-5, 4e-5, 5e-5]

def make_cfg_modify_fn(work_dir, tensorboard_dir, lr, train_batch_size,
                       max_epochs, optimizer_type,
                       num_labels=15, eval_batch_size=32):
    """Build the ``cfg_modify_fn`` callback for a single training run.

    Every value the configuration depends on is passed in explicitly, so the
    returned callback does not read loop variables that may have changed by
    the time it is invoked.

    Args:
        work_dir: Directory written to ``cfg['train']['work_dir']``.
        tensorboard_dir: ``out_dir`` of the TensorboardHook.
        lr: Learning rate given to the optimizer.
        train_batch_size: ``batch_size_per_gpu`` of the training dataloader.
        max_epochs: Number of training epochs.
        optimizer_type: Optimizer ``type`` string, e.g. ``'Adam'``.
        num_labels: Number of output classes set on ``cfg.model``. The default
            of 15 is the value hard-coded in the original notebook; it is
            presumably the tnews class count -- confirm before reusing this
            with another dataset.
        eval_batch_size: ``batch_size_per_gpu`` of the evaluation dataloader.

    Returns:
        A function that takes a config object, overwrites its task, pipeline,
        preprocessor, train and evaluation sections, and returns it.
    """

    def preprocessor_cfg(mode):
        # Train and val preprocessing differ only in ``mode``.
        return {
            # Preprocessor name.
            'type': 'sen-cls-tokenizer',
            # Key holding the first (only) input sentence.
            'first_sequence': 'sentence',
            # Key holding the label.
            'label': 'label',
            'mode': mode,
        }

    def cfg_modify_fn(cfg):
        cfg.task = 'text-classification'
        cfg.pipeline = {'type': 'text-classification'}
        cfg.preprocessor = {
            'train': preprocessor_cfg('train'),
            'val': preprocessor_cfg('eval'),
        }

        cfg.model['num_labels'] = num_labels

        cfg['train'] = {
            "work_dir": work_dir,
            "max_epochs": max_epochs,
            "dataloader": {
                # Must follow the configured batch size: it is also part of
                # the checkpoint directory name.
                "batch_size_per_gpu": train_batch_size,
                "workers_per_gpu": 0
            },
            "optimizer": {
                "type": optimizer_type,
                "lr": lr
            },
            "lr_scheduler": {
                "type": "StepLR",
                # step_size is larger than the number of epochs; with
                # per-epoch stepping the decay would never trigger, keeping
                # the learning rate constant -- confirm against the trainer.
                "step_size": max_epochs + 1
            },
            "hooks": [{
                "type": "EvaluationHook",
                "interval": 1,
                "by_epoch": True,
            }, {
                "out_dir": tensorboard_dir,
                "type": "TensorboardHook",
                "interval": 10
            }, {
                "type": "BestCkptSaverHook",
                "interval": 1,
                "by_epoch": True,
                "metric_key": "accuracy",
                "rule": "max",
                "max_checkpoint_num": 1
            }, {
                "type": "TextLoggerHook",
                "interval": 10
            }]
        }
        cfg['evaluation'] = {
            "dataloader": {
                "batch_size_per_gpu": eval_batch_size,
                "workers_per_gpu": 0,
                "shuffle": False
            },
            "metrics": [{
                "type": "seq-cls-metric",
                "label_name": "labels",
                "logit_name": "logits",
            }]
        }
        return cfg

    return cfg_modify_fn


# Same value for every run, so set it once before the sweep.
# NOTE(review): presumably pins the trainer to local rank 0 -- confirm.
os.environ['LOCAL_RANK'] = '0'

for lr_index, lr in enumerate(lr_list):
    # Output locations for this run.
    if TRAIN_TYPE == 3:
        # Clean-label runs do not depend on the annotator, so MODEL_NAME is
        # left out of the path.
        model_path = f'/mnt/workspace/saved_model/{DATASET}/train_type{TRAIN_TYPE}_{lr}_{batch_size}/'
    else:
        model_path = f'/mnt/workspace/saved_model/{DATASET}/{MODEL_NAME}/train_type{TRAIN_TYPE}_{lr}_{batch_size}/'

    tensorboard_path = model_path + 'tensorboard_output/'

    cfg_modify_fn_tnews = make_cfg_modify_fn(
        work_dir=model_path,
        tensorboard_dir=tensorboard_path,
        lr=lr,
        train_batch_size=batch_size,
        max_epochs=max_epoch,
        optimizer_type=optimizer_name)

    kwargs = dict(
        model='damo/nlp_structbert_backbone_base_std',
        train_dataset=train,
        eval_dataset=val,
        cfg_modify_fn=cfg_modify_fn_tnews)

    trainer = build_trainer(name='trainer', default_args=kwargs)
    trainer.train()

    # Evaluate the checkpoint under <model_path>/output_best/ on the test set.
    kwargs = dict(
        model=model_path + 'output_best/',
        eval_dataset=test)
    trainer = build_trainer(default_args=kwargs)
    eval_res = trainer.evaluate()
    print('test_result', eval_res)