## 1.服务器环境配置

### 安装需要的软件包

In [1]:
% pip install transformers datasets tensorboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 52.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.0 MB/s 
Collecting responses<0.19
 

### 查看服务器 GPU 等硬件信息


In [1]:
import torch

torch.cuda.get_device_name()

AssertionError: Torch not compiled with CUDA enabled

## 2.缓存云盘上的模型和数据集到服务器


In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import shutil
import zipfile

# 云盘的路径
Cloud_Data_Path = '/content/drive/MyDrive/DataSets/'
Data_Set_Name = 'rpt_report_price.zip'  # 数据集
Cloud_Model_Path = '/content/drive/MyDrive/Models/pytorchmodels/FinBERT_L-12_H-768_A-12/'  # 模型

# 服务器的路径
Local_Data_Path = '/content/LocalDataSets/'
Local_Model_Path = '/content/LocalModels/'


# 解压缩
def unzip(file_path, tar_path):
    # 压缩文件判断
    if os.path.splitext(file_path)[-1] == '.zip':

        zFile = zipfile.ZipFile(file_path, "r")
        for files in zFile.namelist():
            zFile.extract(files, tar_path)

        zFile.close()


# 下载到服务器
def down_data():
    # 列出云盘所有数据集
    DataSetList = os.listdir(Cloud_Data_Path)
    print('DataSetList:', DataSetList)

    # 下载数据集
    if Data_Set_Name in DataSetList:

        # 创建本地数据集文件夹
        if not os.path.exists(Local_Data_Path):
            os.makedirs(Local_Data_Path)

        # 复制到本地
        shutil.copy(Cloud_Data_Path + Data_Set_Name,
                    Local_Data_Path + Data_Set_Name)

        # 如果是压缩文件
        unzip(Local_Data_Path + Data_Set_Name, Local_Data_Path)

    # 下载模型
    try:
        shutil.copytree(
            Cloud_Model_Path,
            Local_Model_Path + str(Cloud_Model_Path).split('/')[-2])
    except FileExistsError as e:
        pass


down_data()

DataSetList: ['flowers17_tsycnh.zip', 'CCF_2019.zip', 'rpt_report_price.zip']


## 3.在预训练模型上进行微调


### 可视化训练过程
用tensorboard记录日志文件

In [5]:
#  可视化
# logs_base_dir = '/content/gdrive/MyDrive/Saved_Models/FinBERT_L-12_H-768_A-12/runs/Jul22_13-49-58_1f15589edb2a/'
# %load_ext tensorboard
# %tensorboard --logdir={logs_base_dir}

### 微调

In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import numpy as np


# 新建一个class
class TrainBert:
    def __init__(self):
        #-------------------- 参数区 --------------------#
        self.CLASS_TAR = 1  #分类
        self.SENTENCE_MAX = 25
        self.CONTENT_COLUMN = 'title'  #BERT用于训练的文字的列名
        self.TAG_COLUMN = 'TAG_-1_1'  #标签所在的列名
        self.ALL_STEP = 100
        self.BATCH_SIZE = 512
        self.LEARN_RATE = 1e-2

        self.LEARN_RATE_LIST = [1e-2, 1e-3, 1e-4]
        self.BATCH_SIZE_LIST = [64, 128, 256, 512]

        #-------------------- 模型和数据集路径 --------------------#
        model_path = '/content/LocalModels/FinBERT_L-12_H-768_A-12/'
        local_root_path = '/content/LocalDataSets/'
        dateset_name = 'rpt_report_price'
        dataset_path = local_root_path + dateset_name + '/'
        saved_path = '/content/drive/MyDrive/Saved_Models/FinBERT_L-12_H-768_A-12/' + dateset_name + '/' + TAG_COLUMN + '/'

    #     用于训练的方法
    def start_train(self):
        pass

    #网格法调整超参数
    def save_settings(self):
        pass


#-------------------- 参数区 --------------------#
CLASS_TAR = 2  #分类
SENTENCE_LEN = 25  #句子长度
CONTENT_COLUMN = 'title'  #BERT用于训练的文字的列名
TAG_COLUMN = 'TAG_-1_1'  #标签所在的列名
ALL_STEP = 100
BATCH_SIZE = 512
LEARN_RATE = 1e-2
LEARN_RATE_LIST = [1e-2, 1e-3, 1e-4]
BATCH_SIZE_LIST = [64, 128, 256, 512]

#-------------------- 模型和数据集路径 --------------------#
model_path = '/content/LocalModels/FinBERT_L-12_H-768_A-12/'
local_root_path = '/content/LocalDataSets/'
dateset_name = 'rpt_report_price'
dataset_path = local_root_path + dateset_name + '/'
saved_path = '/content/drive/MyDrive/Saved_Models/FinBERT_L-12_H-768_A-12/' + dateset_name + '/' + TAG_COLUMN + '/'

if not os.path.exists(saved_path):
    os.makedirs(saved_path)


# 读取数据

def read_data(base_url):
    return load_dataset('csv',
                        data_files={
                            'train': base_url + 'train.csv',
                            'test': base_url + 'test.csv',
                            'dev': base_url + 'dev.csv'
                        })


# 编码训练集
def tokenize_data(tokenizer: BertTokenizer, word_length: int = SENTENCE_LEN):
    # 加载数据集
    raw_datasets = read_data(dataset_path)

    # 向量化函数
    def tokenize_function(dataset):
        return tokenizer(dataset[CONTENT_COLUMN],
                         truncation=True,
                         padding='max_length',
                         max_length=word_length)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

    # 重命名列
    tokenized_datasets = tokenized_datasets.rename_column(TAG_COLUMN, 'labels')

    return tokenized_datasets


#-------------------- 模型训练 --------------------#
def train_model(bert_path, class_num: int = CLASS_TAR):
    # 获取预训练的编码器和模型
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    model = BertForSequenceClassification.from_pretrained(bert_path,
                                                          num_labels=class_num)

    # 获得向量化后的数据
    tokenized_datasets = tokenize_data(tokenizer)

    # 定义评价指标
    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='micro')
        acc = accuracy_score(labels, predictions)
        result = {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

        return result

    #-------------------- 定义训练参数 --------------------#

    args = TrainingArguments(
        output_dir=saved_path,  # 保存路径，存放检查点和其他输出文件
        evaluation_strategy='steps',  # 每50steps结束后进行评价
        eval_steps=ALL_STEP,
        logging_strategy="steps",
        logging_steps=ALL_STEP,
        save_strategy="steps",
        save_steps=ALL_STEP,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to="tensorboard",
        # warmup_steps=500,  # 热身步数
        # weight_decay=0.01,  # 权重衰减
        learning_rate=LEARN_RATE,  # 初始学习率
        per_device_train_batch_size=BATCH_SIZE,  # 训练批次大小
        per_device_eval_batch_size=BATCH_SIZE,  # 测试批次大小
        num_train_epochs=4,  # 训练轮数
    )

    # 定义训练器
    trainer = Trainer(model,
                      args,
                      train_dataset=tokenized_datasets['train'],
                      eval_dataset=tokenized_datasets["dev"],
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    # 开始训练

    trainer.train()

    # 训练完成以后的测试集评价
    trainer.evaluate(eval_dataset=tokenized_datasets['test'])


# 模型训练
train_model(model_path, class_num=CLASS_TAR)


# 循环训练参数
def save_set():
    pass


IndentationError: expected an indented block (86809510.py, line 126)

## 4.下游分类任务


In [7]:
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification

# 根据保存的模型进行预测
Cloud_Saved_Path = '/content/gdrive/MyDrive/Saved_Models/FinBERT_L-12_H-768_A-12/checkpoint-350/'
Local_Saved_Path = '/content/LocalSaved/'
Local_Saved_Name = 'FinBERT_L-12_H-768_A-12/'


# 加载云盘中训练好的模型到本地

def save_cloud_model():
    if not os.path.exists(Local_Saved_Path):
        os.makedirs(Local_Saved_Path)

    import shutil
    # 下载模型

    try:
        shutil.copytree(Cloud_Saved_Path, Local_Saved_Path + Local_Saved_Name)
    except FileExistsError as e:
        pass


save_cloud_model()


# 执行预测
def predict_from_list(texts_list: list) -> np.ndarray:
    # 模型路径
    model_path = Local_Saved_Path + Local_Saved_Name

    # 获取预训练的模型
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path,
                                                          num_labels=3)  # 3分类

    # 用分词器进行预处理
    encoded = tokenizer(texts_list,
                        truncation=True,
                        padding='max_length',
                        max_length=32,
                        return_tensors='pt')

    # 传入要预测的任务到分类模型
    out = model(**encoded)

    # softmax输出每一类的概率
    probs = out.logits.softmax(dim=-1)

    return probs.detach().numpy()


texts = [
    '枪击案嫌犯称最初目标并非安倍',
    '重庆一特斯拉失控 致多人伤亡',
    '湖南一医院坐椅子收费10元',
    '海航客机突发故障断电 机舱如蒸桑拿',
    '外卖小哥考上上海交大研究生',
]

predict_from_list(texts)


FileNotFoundError: ignored