# 导入依赖

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import torch
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

import logging

log = logging.getLogger()
log.setLevel(logging.INFO)

# 初始化配置文件

In [2]:
class Config():
    train_data = 'data/train_dataset.npy' # 训练集
    predict_data = 'data/eval_dataset.npy' # 测试集
    result_data_save = 'result/submission.csv' # 预测结果
    device = 'cpu' # 训练驱动

    model_path = 'hfl/chinese-roberta-wwm-ext' # 预训练模型
    model_save_path = 'result/model' # 保存模型
    
    tokenizer = None # 预训练模型的tokenizer
    
    # 数据标签
    label_dict = {'晨会早报': 0, '宏观研报': 1, '策略研报': 2, '行业研报': 3, '公司研报': 4, '基金研报': 5, '债券研报': 6, '金融工程': 7, '其他研报': 8, '个股研报': 9}
    num_labels = len(label_dict) # 标签数量
    
    max_seq_len = 128 # 最大句子长度
    test_size = 0.15 # 校验集大小
    random_seed = 42 # 随机种子
    batch_size = 64 # 训练数据批大小
    val_batch_size = 8 # 校验/预测批大小
    epochs = 10 # 训练次数
    learning_rate = 1e-5 # 学习率
    l2_weight_decay = 0.05
    
    print_log = 20 # 日志打印步骤

config = Config()
config.device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 自定义dataset

In [3]:
# 自定义dataset
class MyDataset(Dataset):
    def __init__(self, config: Config, data: list, label: list = None):
        self.data = data
        self.tokenizer = config.tokenizer 
        self.max_seq_len = config.max_seq_len
        self.len = len(data)
        self.label = label

    def __getitem__(self, idx):
        text = self.data[idx]
        # tokenizer
        inputs = self.tokenizer.encode_plus(text, return_token_type_ids=True, return_attention_mask=True,
                                            max_length=self.max_seq_len, padding='max_length', truncation=True)

        # 打包预处理结果
        result = {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                  'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
                  'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long)}
        if self.label is not None:
            result['labels'] = torch.tensor([self.label[idx]], dtype=torch.long)
        # 返回
        return result

    def __len__(self):
        return self.len

    

# 加载数据

In [3]:
train_data = pd.DataFrame(list(np.load(config.train_data, allow_pickle=True)))
train_data.head(5)

Unnamed: 0,label,header,title,paragraph,footer
0,0,仅供内部参考，请勿外传,"证券研究报告 | 浙商早知道,报告日期：2022 年 11 月 09 日,重要点评",❑ 【浙商互联网 谢晨】快手（01024.HK）Q3 业绩前瞻：竞争趋缓盈利改善，Q4 或...,http://www.stocke.com.cn 1/5 请务必阅读正文之后的免责条款部分
1,0,仅供内部参考，请勿外传,"中信证券研, 2023 年海外宏观与大类资产配置展望—东边日出西边雨 ,▍ 美股科技板块 2...","部 重点推荐 ,▍,崔嵘 海外宏观经济首席分析师 ,S1010517040001,宏观经济｜...",证券研究报告 具体分析内容（包括相关风险提示等）详见后文， 请务必阅读正文之后第 19 页...
2,0,仅供内部参考，请勿外传,"2022 年 11 月 10 日星期四,晨会纪要,【今日焦点】,以注册制为抓手提高直融比重...",党的二十大报告提出，“健全资本市场功能，提高直接融资比重。”新形势下，提高直接融资比重是资本...,请务必阅读最后一页股票评级说明和免责声明
3,0,仅供内部参考，请勿外传证券研究报告,"东吴证券晨会纪要 ,东吴证券晨会纪要 2022-11-11 [Table_Tag] ,宏观...",晨会编辑 曾朵红执业证书：S0600516080001021-60199793zengdh...,请务必阅读正文之后的免责声明部分
4,0,仅供内部参考，请勿外传晨会纪要（2022/11/16）,"渤海证券研究所晨会, 宏观及策略分析, 行业专题评述, 金融工程研究",崔健 022-28451618 SACNO: S1150511010016 cuijia...,请务必阅读正文之后的声明 渤海证券股份有限公司具备证券投资咨询业务资格 ...


# 加载预训练模型

In [5]:
tokenizer = AutoTokenizer.from_pretrained(config.model_path)
model = AutoModelForSequenceClassification.from_pretrained(config.model_path, num_labels=config.num_labels)

config.tokenizer = tokenizer

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model che

# 创建dataloader

In [6]:
# 拼接生成最终的文本
train_data['text'] = train_data['header'] + '[SEP]' + train_data['title'] + '[SEP]' + train_data['paragraph'] + '[SEP]' + train_data['footer']
# 切分数据
X_train, X_val, y_train, y_val = train_test_split(train_data['text'].tolist(), train_data['label'].tolist(),
                                                          test_size=config.test_size,
                                                          random_state=config.random_seed)
# 构建数据
train_dataloader = DataLoader(MyDataset(config, X_train, y_train), batch_size=config.batch_size, shuffle=True)
val_dataloader = DataLoader(MyDataset(config, X_val, y_val), batch_size=config.val_batch_size, shuffle=True)

# 训练模型

In [7]:
# 校验方法
def val(model, val_dataloader: DataLoader):
    model.eval()
    total_acc, total_f1, total_loss, test_num_batch = 0., 0., 0., 0
    for iter_id, batch in enumerate(val_dataloader):
        # 转GPU
        batch_cuda = {item: value.to(config.device) for item, value in batch.items()}
        # 模型计算
        output = model(**batch_cuda)
        # 获取结果
        loss = output[0]
        logits = torch.argmax(output[1], dim=1)

        y_pred = [[i] for i in logits.cpu().detach().numpy()]
        y_true = batch_cuda['labels'].cpu().detach().numpy()
        # 计算指标
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')  
        total_loss += loss.item()
        total_acc += acc
        total_f1 += f1
        test_num_batch += 1

    return total_loss/test_num_batch, total_acc/test_num_batch, total_f1/test_num_batch

# 训练方法
def train(model, config: Config, train_dataloader: DataLoader, val_dataloader: DataLoader):
    # 模型写入GPU
    model.to(config.device)

    # 获取BERT模型的所有可训练参数
    params = list(model.named_parameters())
    # 对除了bias和LayerNorm层的所有参数应用L2正则化
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
         'weight_decay': config.l2_weight_decay},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    # 创建优化器并使用正则化更新模型参数
    opt = torch.optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # 梯度衰减
    scheduler = optim.lr_scheduler.CosineAnnealingLR(opt, len(train_dataloader) * config.epochs)
    
    # 遍历训练
    best_f1 = 0
    for epoch in range(config.epochs):
        total_acc, total_f1, total_loss, train_num_batch = 0., 0., 0., 0
        model.train()
        zero_step = 0
        for iter_id, batch in enumerate(train_dataloader):
            # 数据写入GPU
            batch_cuda = {item: value.to(config.device) for item, value in batch.items()}
            # 模型计算
            output = model(**batch_cuda)
            # 获取结果
            loss = output[0]
            logits = torch.argmax(output[1], dim=1)

            y_pred = [[i] for i in logits.cpu().detach().numpy()]
            y_true = batch_cuda['labels'].cpu().detach().numpy()

            # 计算指标
            acc = accuracy_score(y_true, y_pred)
            f1 = f1_score(y_true, y_pred, average='weighted')  
            total_loss += loss.item()
            total_acc += acc
            total_f1 += f1

            # 反向传播，更新参数
            opt.zero_grad() 
            loss.backward()
            opt.step()
            scheduler.step()

            # 打印
            if iter_id % config.print_log == 0:
                logging.info('epoch:{}, iter_id:{}, loss:{}, acc:{}, f1:{}'.format(epoch, iter_id, loss.item(), acc, f1))
                
            train_num_batch += 1
        # 校验操作
        val_loss, val_acc, val_f1 = val(model, val_dataloader)
        if val_f1 > best_f1:
            best_f1 = val_f1
            # 保存best模型
            config.tokenizer.save_pretrained(config.model_save_path + "/best")
            model.save_pretrained(config.model_save_path + "/best")
        logging.info('-' * 15+str(epoch)+'-' * 15)
        logging.info('avg_train_loss:{}, avg_train_acc:{}, avg_train_f1:{}'.format(total_loss/train_num_batch, total_acc/train_num_batch, total_f1/train_num_batch))
        logging.info('val_loss:{}, val_acc:{}, val_acc:{}, best_f1:{}'.format(val_loss, val_acc, val_f1, best_f1))
       
        logging.info('-' * 30)
        
    # 保存最终模型
    config.tokenizer.save_pretrained(config.model_save_path)
    model.save_pretrained(config.model_save_path)

# 开始训练
train(model, config, train_dataloader, val_dataloader)
print('train done.')

INFO:root:epoch:0, iter_id:0, loss:2.2925097942352295, acc:0.125, f1:0.12116228070175439
INFO:root:epoch:0, iter_id:20, loss:1.6396788358688354, acc:0.3125, f1:0.2135964912280702
INFO:root:epoch:0, iter_id:40, loss:1.0177865028381348, acc:0.734375, f1:0.65247042326546
INFO:root:---------------0---------------
INFO:root:avg_train_loss:1.5650783805620103, avg_train_acc:0.4922388136288999, avg_train_f1:0.4080102334879386
INFO:root:val_loss:1.0458162820945351, val_acc:0.6788135593220338, val_acc:0.6313647981020862, best_f1:0.6313647981020862
INFO:root:------------------------------
INFO:root:epoch:1, iter_id:0, loss:1.0913031101226807, acc:0.734375, f1:0.6735634157509158
INFO:root:epoch:1, iter_id:20, loss:0.9109288454055786, acc:0.765625, f1:0.7347834967320261
INFO:root:epoch:1, iter_id:40, loss:0.5402162075042725, acc:0.859375, f1:0.8479111615904069
INFO:root:---------------1---------------
INFO:root:avg_train_loss:0.7638342607588995, avg_train_acc:0.8119612068965518, avg_train_f1:0.7883

train done.


# 预测模型

In [4]:
# 预测方法
def predict(config:Config):
    # 加载模型
    config.tokenizer = AutoTokenizer.from_pretrained(config.model_save_path)
    model = AutoModelForSequenceClassification.from_pretrained(config.model_save_path)
    model.to(config.device)
    model.eval()
    # 加载数据
    test_data = pd.DataFrame(list(np.load(config.predict_data, allow_pickle=True)))
    test_data['text'] = test_data['header'] + '[SEP]' + test_data['title'] + '[SEP]' + test_data['paragraph'] + '[SEP]' + test_data['footer']
    # 加载dataloader
    predict_dataloader = DataLoader(MyDataset(config, test_data['text'].tolist()), batch_size=config.val_batch_size, shuffle=False)
    
    predict_result = []
    predict_softmax = []
    softmax = None
    # 遍历预测
    for iter_id, batch in enumerate(predict_dataloader):
        batch_cuda = {item: value.to(config.device) for item, value in batch.items()}
        # 模型计算
        output = model(**batch_cuda)
        # 获取结果
        logits = torch.argmax(output[0], dim=1)
        y_pred = [[i] for i in logits.cpu().detach().numpy()]
        # 获取softmax
        y_softmax = [i for i in F.softmax(output.logits, dim=1).cpu().detach().numpy()]
        # 统计结果
        predict_result += y_pred
        predict_softmax += y_softmax
    # 输出结果
    test_data['label'] = [i[0] for i in predict_result]
    # 保存文件
    test_data[['uid', 'label']].to_csv(config.result_data_save, index=False, encoding='utf-8')

predict(config)
print('predict done.')

predict done.
