In [None]:
!pip install pandas

In [None]:
!pip install bert4torch

In [None]:
!pip install scikit-learn

In [None]:
!pip install seaborn

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install safetensors


In [1]:

# 导入pandas库，用于数据处理和分析
import pandas as pd
# 从sklearn.model_selection导入train_test_split函数，用于划分训练集和测试集
from sklearn.model_selection import train_test_split
# 从collections导入Counter类，用于统计对象出现的次数
from collections import Counter
# 从matplotlib.font_manager导入FontProperties类，用于字体管理
from matplotlib.font_manager import FontProperties

In [2]:
# 从'/root/data/train_set.csv'路径读取训练集数据，使用制表符('\t')作为字段分隔符
train_df = pd.read_csv('/root/data/train_set.csv', sep='\t')
# 从'/root/data/test_a.csv'路径读取测试集数据，使用制表符('\t')作为字段分隔符
test_df = pd.read_csv('/root/data/test_a.csv', sep='\t')
# 给测试集添加一个名为'label'的列，其所有值都设为0
test_df['label'] = 0

In [3]:
# 显示训练集数据的前5行
train_df.head()

Unnamed: 0,label,text
0,2,2967 6758 339 2021 1854 3731 4109 3792 4149 15...
1,11,4464 486 6352 5619 2465 4802 1452 3137 5778 54...
2,3,7346 4068 5074 3747 5681 6093 1777 2226 7354 6...
3,2,7159 948 4866 2109 5520 2490 211 3956 5520 549...
4,3,3646 3055 3055 2490 4659 6065 3370 5814 2465 5...


In [4]:
# 获取训练集数据的行数
len(train_df)

200000

In [5]:
# 获取训练集中'text'列的第一个元素的长度和类型
len(train_df['text'][0]), type(train_df['text'][0])

(5120, str)

## 学术加速

In [6]:
# 导入subprocess模块，用于执行外部命令
import subprocess
# 导入os模块，用于操作环境变量
import os

# 执行外部命令，source一个脚本文件并筛选包含'proxy'的环境变量，捕获命令的输出
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# 获取命令的标准输出
output = result.stdout
# 遍历输出的每一行
for line in output.splitlines():
    # 如果行中包含'='符号
    if '=' in line:
        # 分割变量名和值
        var, value = line.split('=', 1)
        # 将变量名和值设置到当前环境变量中
        os.environ[var] = value

## 重新训练分词器 Tokenizer

In [None]:
# 导入AutoTokenizer类
from transformers import AutoTokenizer

# 加载预训练的分词器
old_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# 定义获取训练语料库的函数，使用生成器分批次处理数据
def get_training_corpus():
    # 以1000为步长遍历训练数据
    for i in range(0, len(train_df), 1000):
        # 生成当前批次的数据
        yield train_df["text"][i : i + 1000]

# 使用旧的分词器的词表作为基础，在新的语料库上训练新的分词器
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 10000)

# 保存新的分词器到指定目录
new_tokenizer.save_pretrained('./new_tokenizer/')

In [None]:
# 获取训练集的第一条数据，并截取前十个单词
first_text = ' '.join(train_df.iloc[0]['text'].split()[:10])

# 打印原始文本
print("Original text:", first_text)
# 使用旧分词器进行分词，并打印结果
print("Old tokenizer:")
print(old_tokenizer.tokenize(first_text))
# 使用新分词器进行分词，并打印结果
print("New tokenizer:")
print(new_tokenizer.tokenize(first_text))

## 预处理数据集，方便MLM训练


In [7]:
# 导入AutoTokenizer类
from transformers import AutoTokenizer
# 导入Dataset类
from datasets import Dataset

# 加载重新训练好的分词器
tokenizer = AutoTokenizer.from_pretrained('./new_tokenizer/')

# 定义分词函数，直接截断超过最大长度的文本
def tokenize_function(examples):
    # 对输入的文本进行分词，设置填充到最大长度，超过最大长度则截断，最大长度设为512
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

In [None]:
# 从pandas DataFrame创建数据集
train_dataset = Dataset.from_pandas(train_df)

# 使用定义的分词函数对数据集进行分词处理，并移除原始文本和标签列
tokenized_datasets = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "label"]
)

# 现在tokenized_datasets已经准备好用于MLM预训练

In [None]:
# 导入Dataset类和load_from_disk函数
from datasets import Dataset, load_from_disk

# 将处理后的数据集保存到磁盘
tokenized_datasets.save_to_disk("tokenized_datasets")

In [8]:
# 导入load_from_disk函数
from datasets import load_from_disk

# 从磁盘加载处理后的数据集
tokenized_datasets = load_from_disk("tokenized_datasets")

In [9]:
#tokenized_datasets[1]

## MLM训练

In [8]:
# 导入os模块，用于操作系统功能，如环境变量、文件路径等
import os
# 导入torch模块，PyTorch框架的主模块，用于深度学习和张量计算
import torch
# 从transformers库导入AutoTokenizer和AutoModelForMaskedLM类，用于加载预训练的分词器和遮蔽语言模型
from transformers import AutoTokenizer, AutoModelForMaskedLM
# 从transformers库导入DataCollatorForLanguageModeling类，用于数据整理，特别是为语言模型准备数据
from transformers import DataCollatorForLanguageModeling
# 从transformers库导入Trainer和TrainingArguments类，用于模型训练和设置训练参数
from transformers import Trainer, TrainingArguments
# 从datasets库导入Dataset类，用于处理和准备数据集
from datasets import Dataset

In [11]:
# 检查CUDA可用性
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Current device:", torch.cuda.current_device())
    print("Device count:", torch.cuda.device_count())
    print("Device name:", torch.cuda.get_device_name(0))

# 清理CUDA缓存
torch.cuda.empty_cache()

CUDA available: True
Current device: 0
Device count: 1
Device name: NVIDIA GeForce RTX 3090


In [15]:
# 分割数据集为训练集和验证集
train_size = int(0.9 * len(tokenized_datasets))  # 计算训练集大小为数据集的90%
train_dataset = tokenized_datasets.select(range(train_size))  # 选择前90%作为训练集
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))  # 选择剩余10%作为验证集

# 加载预训练的Masked Language Model
model_checkpoint = "distilbert-base-uncased"  # 指定预训练模型
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)  # 从预训练模型加载Masked Language Model

# 定义数据整理器
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,  # 指定分词器
    mlm=True,  # 启用遮蔽语言模型任务
    mlm_probability=0.15  # 设置遮蔽概率为15%
)

# 设置训练参数
batch_size = 64  # 设置批次大小，可以根据需要调整

# 设置训练参数
training_args = TrainingArguments(
    output_dir=f"./{model_checkpoint}-finetuned-MLM",  # 设置输出目录
    overwrite_output_dir=True,  # 允许覆盖输出目录
    num_train_epochs=10,  # 设置训练轮数
    eval_strategy="epoch",  # 设置评估策略为每轮结束时
    save_strategy="epoch",  # 设置保存策略为每轮结束时
    per_device_train_batch_size=32,  # 设置每个设备的训练批次大小
    per_device_eval_batch_size=32,  # 设置每个设备的评估批次大小
    gradient_accumulation_steps=2,  # 设置梯度累积步数，相当于批次大小为64
    learning_rate=2e-5,  # 设置学习率
    weight_decay=0.01,  # 设置权重衰减
    save_total_limit=2,  # 设置最多保存模型数量
    logging_dir="/root/tf-logs",  # 设置日志目录
    logging_strategy="steps",  # 设置日志记录策略为每步
    logging_steps=500,  # 设置每500步记录一次日志
    push_to_hub=False,  # 不推送到huggingface hub
    fp16=True,  # 启用混合精度训练
    no_cuda=False  # 不禁用CUDA
)

# 定义Trainer
trainer = Trainer(
    model=model,  # 指定模型
    args=training_args,  # 指定训练参数
    train_dataset=train_dataset,  # 指定训练数据集
    eval_dataset=eval_dataset,  # 指定验证数据集
    data_collator=data_collator,  # 指定数据整理器
    tokenizer=tokenizer,  # 指定分词器
)

# 开始训练
print("Starting MLM training...")
trainer.train()  # 执行训练

# 保存模型和分词器
trainer.save_model("./mlm_model")  # 保存模型到指定目录
tokenizer.save_pretrained("./mlm_model")  # 保存分词器到指定目录

print("MLM training completed and model saved.")  # 打印训练完成信息

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting MLM training...


Epoch,Training Loss,Validation Loss
0,3.4115,3.005223
2,2.4177,2.271334
4,2.1727,2.068398
6,2.0632,1.965433
8,2.0098,1.917827
9,2.0085,1.915435


MLM training completed and model saved.


ModuleNotFoundError: pynvml does not seem to be installed or it can't be imported.

In [9]:
# 禁用tokenizers并行化处理以避免警告和可能的死锁问题
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
# 导入torch库
import torch
# 打印当前CUDA设备的索引号
print(torch.cuda.current_device())
# 打印可用的CUDA设备数量
print(torch.cuda.device_count())
# 打印第一个CUDA设备的名称
print(torch.cuda.get_device_name(0))

## 微调训练

In [29]:
# 导入torch库，用于深度学习
import torch
# 从torch库导入nn模块，用于构建神经网络
from torch import nn
# 从transformers库导入AutoModel, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback，用于加载预训练模型和训练
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer, TrainerCallback
# 导入Dataset类，用于处理数据集
from datasets import Dataset
# 导入pandas库，用于数据处理和分析
import pandas as pd
# 从sklearn.model_selection导入train_test_split函数，用于分割数据集
from sklearn.model_selection import train_test_split
# 从sklearn.metrics导入accuracy_score, f1_score，用于评估模型性能
from sklearn.metrics import accuracy_score, f1_score
# 导入matplotlib.pyplot库，用于绘图
import matplotlib.pyplot as plt
# 导入numpy库，用于数值计算
import numpy as np

# 以下导入重复，可以省略
# from datasets import Dataset
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, f1_score
# import pandas as pd
# import numpy as np

# 导入os库，用于操作系统功能，如文件路径操作
import os
# 从typing导入Dict, List, Tuple，用于类型注解，提高代码可读性
from typing import Dict, List, Tuple

In [33]:
# 导入torch库，用于深度学习中的张量计算和自动微分
import torch
# 从transformers库导入AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
# AutoTokenizer用于自动加载预训练的分词器
# AutoModelForSequenceClassification用于加载预训练的序列分类模型
# TrainingArguments用于配置训练参数
# Trainer用于训练模型
# DataCollatorWithPadding用于批处理时自动填充序列到相同长度
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding
)
# 导入Dataset类，用于处理数据集
from datasets import Dataset
# 从sklearn.model_selection导入train_test_split函数，用于分割数据集
from sklearn.model_selection import train_test_split
# 导入pandas库，用于数据处理和分析
import pandas as pd
# 导入numpy库，用于数值计算
import numpy as np
# 导入evaluate库，用于模型评估
import evaluate
# 从typing导入Dict，用于类型注解，提高代码可读性
from typing import Dict

In [3]:
# 设置设备为GPU如果可用，否则使用CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 根据指定的模型路径加载预训练的分词器
model_name = "./mlm_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 从指定路径加载训练数据集，并分割为训练集和验证集
train_df = pd.read_csv('/root/data/train_set.csv', sep='\t')
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# 将训练集和验证集的DataFrame转换为Dataset对象
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [25]:
# 定义一个函数用于对数据集中的文本进行分词
def tokenize_function(examples):
    # 对文本进行分词，设置最大长度为512，超出部分截断，不足部分填充
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# 使用定义的分词函数对训练数据集进行分词，移除原始文本列
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# 使用定义的分词函数对验证数据集进行分词，移除原始文本列
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/180000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [34]:
# 定义注意力机制类，继承自nn.Module
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # 定义注意力网络结构
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),  # 线性层
            nn.Tanh(),  # Tanh激活函数
            nn.Linear(hidden_size, 1),  # 线性层，输出维度为1
            nn.Softmax(dim=1)  # Softmax激活函数，按维度1进行
        )

    def forward(self, encoder_outputs):
        # 计算注意力权重
        attention_weights = self.attention(encoder_outputs)
        # 计算上下文向量，通过权重加权求和
        context_vector = torch.sum(attention_weights * encoder_outputs, dim=1)
        return context_vector

# 定义自定义的分类模型类，继承自nn.Module
class CustomClassificationModel(nn.Module):
    def __init__(self, pretrained_model_name, num_labels):
        super().__init__()
        self.num_labels = num_labels  # 类别数
        # 加载预训练模型
        self.base_model = AutoModel.from_pretrained(pretrained_model_name)
        # 实例化注意力机制
        self.attention = Attention(self.base_model.config.hidden_size)
        self.dropout = nn.Dropout(0.1)  # Dropout层
        # 分类器线性层
        self.classifier = nn.Linear(self.base_model.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attention_mask):
        # 通过基模型获取输出
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        # 通过注意力机制获取上下文向量
        context_vector = self.attention(sequence_output)
        # 应用dropout
        pooled_output = self.dropout(context_vector)
        # 通过分类器获取最终的logits
        logits = self.classifier(pooled_output)
        return logits

# 定义计算评估指标的函数
def compute_metrics(eval_pred: EvalPrediction) -> Dict[str, float]:
    logits, labels = eval_pred
    # 预测结果
    predictions = np.argmax(logits, axis=-1)
    # 计算准确率
    accuracy = accuracy_score(labels, predictions)
    # 计算加权F1分数
    f1 = f1_score(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1
    }

# 定义保存最佳模型的回调类
class SaveBestModelCallback(TrainerCallback):
    def __init__(self, save_path: str):
        self.save_path = save_path  # 保存路径
        self.best_f1 = 0.0  # 最佳F1分数

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        metrics = kwargs.get('metrics', {})
        f1 = metrics.get('eval_f1', 0)
        
        # 如果当前F1分数超过之前的最佳分数，则保存模型
        if f1 > self.best_f1:
            self.best_f1 = f1
            model = kwargs.get('model')
            if model:
                model.save_pretrained(os.path.join(self.save_path, f"best_model_f1_{f1:.4f}"))
                print(f"New best model saved with F1 score: {f1:.4f}")
        
        return control

# 设置训练参数
training_args = TrainingArguments(
    output_dir="/root/newsclassification/checkpoints",  # 输出目录
    eval_strategy="epoch",  # 评估策略
    learning_rate=2e-5,  # 学习率
    per_device_train_batch_size=32,  # 训练批次大小
    per_device_eval_batch_size=32,  # 评估批次大小
    num_train_epochs=10,  # 训练轮数
    weight_decay=0.01,  # 权重衰减
    save_strategy="epoch",  # 保存策略
    save_total_limit=2,  # 最多保存模型数
    load_best_model_at_end=True,  # 训练结束时加载最佳模型
    metric_for_best_model="f1",  # 选择最佳模型的指标
)

# 建立标签与ID的映射
id2label = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 
            6: '教育', 7: '财经', 8: '家居', 9: '游戏', 10: '房产', 
            11: '时尚', 12: '彩票', 13: '星座'}
label2id = {v: k for k, v in id2label.items()}

# 加载预训练模型，设置分类数和标签映射
model = AutoModelForSequenceClassification.from_pretrained(
    "./mlm_model",
    num_labels=14,
    id2label=id2label,
    label2id=label2id
)
# 初始化Trainer，设置模型、训练参数、数据集、分词器、评估指标和回调函数
save_best_model_callback = SaveBestModelCallback(save_path="/root/newsclassification/best_model")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[save_best_model_callback]
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ./mlm_model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [35]:
# 训练模型
print("开始训练...")
trainer.train()

# 评估模型
print("评估模型...")
eval_results = trainer.evaluate()
print(f"评估结果: {eval_results}")

开始训练...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.185,0.180841,0.94655,0.946454
2,0.1401,0.14413,0.95735,0.957347
3,0.1031,0.139039,0.9613,0.961253
4,0.073,0.164554,0.9599,0.959851
5,0.0579,0.178419,0.9624,0.96241
6,0.0378,0.212027,0.9618,0.96181
7,0.0231,0.240215,0.96095,0.960927
8,0.0131,0.255313,0.9617,0.961683
9,0.0076,0.266968,0.96165,0.961631
10,0.0053,0.27626,0.9618,0.961766


New best model saved with F1 score: 0.9465
New best model saved with F1 score: 0.9573
New best model saved with F1 score: 0.9613
New best model saved with F1 score: 0.9624
评估模型...


评估结果: {'eval_loss': 0.17841918766498566, 'eval_accuracy': 0.9624, 'eval_f1': 0.9624101825614323, 'eval_runtime': 82.0643, 'eval_samples_per_second': 243.711, 'eval_steps_per_second': 7.616, 'epoch': 10.0}


## 推理部分

In [36]:
# 设置设备，如果CUDA可用则使用GPU，否则使用CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 从预训练模型路径加载tokenizer
tokenizer = AutoTokenizer.from_pretrained("./mlm_model")

# 从指定路径加载测试数据集
test_df = pd.read_csv('/root/data/test_a.csv', sep='\t')
# 将pandas DataFrame转换为Hugging Face的Dataset
test_dataset = Dataset.from_pandas(test_df)

# 定义tokenize函数，用于处理数据集中的文本
def tokenize_function(examples):
    # 对文本进行tokenize，设置最大长度，超出部分进行截断
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# 对测试数据集应用tokenize函数，移除原始文本列
tokenized_test = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [46]:
# 设置数据集的格式为PyTorch张量，指定需要转换的列
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask"])

In [47]:
# # 定义预测函数
# def predict(model, dataset):
#     model.eval()
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
#     predictions = []
    
#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Predicting"):
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'label'}
#             outputs = model(**inputs)
#             predictions.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
    
#     return predictions


# # 加载并预测：最佳F1模型
# best_model = AutoModelForSequenceClassification.from_pretrained("/root/newsclassification/best_model/best_model_f1_0.9624")
# best_model.to(device)
# best_predictions = predict(best_model, tokenized_test)

# # 保存最佳F1模型的预测结果
# best_submission = pd.DataFrame({'label': best_predictions})
# best_submission.to_csv('/root/newsclassification/best_f1_submission.csv', index=False)
# print("Best F1 model predictions saved to /root/newsclassification/best_f1_submission.csv")

# 定义预测函数
def predict(model, dataset):
    model.eval()  # 将模型设置为评估模式
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)  # 创建数据加载器
    predictions = []  # 初始化预测结果列表
    
    with torch.no_grad():  # 不计算梯度，减少内存消耗
        for batch in tqdm(dataloader, desc="Predicting"):  # 遍历数据批次
            batch = {k: v.to(device) for k, v in batch.items()}  # 将数据移动到指定设备
            outputs = model(**batch)  # 获取模型输出
            predictions.extend(outputs.logits.argmax(dim=-1).cpu().numpy())  # 提取预测结果并转换为numpy数组
    
    return predictions  # 返回预测结果

# 加载最佳F1模型
best_model = AutoModelForSequenceClassification.from_pretrained("/root/newsclassification/best_model/best_model_f1_0.9624")
best_model.to(device)  # 将模型移动到指定设备
best_predictions = predict(best_model, tokenized_test)  # 使用最佳模型进行预测

# 保存最佳F1模型的预测结果到CSV文件
pd.DataFrame({'label': best_predictions}).to_csv('/root/newsclassification/best_f1_submission.csv', index=False)
print("Best F1 model predictions saved to /root/newsclassification/best_f1_submission.csv")  # 打印保存成功的消息

Predicting: 100%|██████████| 1563/1563 [03:01<00:00,  8.59it/s]


Best F1 model predictions saved to /root/newsclassification/best_f1_submission.csv


In [53]:
best_predictions[1]

2

In [44]:
# 导入PyTorch库
import torch
# 从PyTorch导入DataLoader，用于批量加载数据
from torch.utils.data import DataLoader
# 从transformers库导入AutoModelForSequenceClassification，用于序列分类任务
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# 导入datasets库的Dataset，用于处理数据集
from datasets import Dataset
# 导入pandas库，用于数据处理和CSV文件读写
import pandas as pd
# 导入tqdm库，用于显示进度条
from tqdm import tqdm

In [51]:
# 加载最终epoch的模型用于预测
final_model = AutoModelForSequenceClassification.from_pretrained("/root/newsclassification/checkpoints/checkpoint-56250")
final_model.to(device)  # 将模型移动到指定的设备上，比如GPU
final_predictions = predict(final_model, tokenized_test)  # 使用预测函数进行预测

# 将最终epoch模型的预测结果保存到CSV文件中
pd.DataFrame({'label': final_predictions}).to_csv('/root/newsclassification/final_epoch_submission.csv', index=False)
print("Final epoch model predictions saved to /root/newsclassification/final_epoch_submission.csv")  # 打印保存成功的消息

print("Prediction completed for both models.")  # 打印完成预测的消息

Predicting: 100%|██████████| 1563/1563 [03:02<00:00,  8.57it/s]


Final epoch model predictions saved to /root/newsclassification/final_epoch_submission.csv
Prediction completed for both models.


In [50]:
# 导入os模块，用于操作文件系统
import os

# 定义checkpoints文件夹的路径
checkpoint_dir = "/root/newsclassification/checkpoints"
# 定义best_model文件夹的路径
best_model_dir = "/root/newsclassification/best_model"

# 使用os.listdir列出checkpoints文件夹中的文件，并打印
print(f"checkpoints 文件夹中的文件: {os.listdir(checkpoint_dir)}")

# 使用os.listdir列出best_model文件夹中的文件，并打印
print(f"best_model 文件夹中的文件: {os.listdir(best_model_dir)}")

checkpoints 文件夹中的文件: ['runs', 'checkpoint-28125', 'checkpoint-56250']
best_model 文件夹中的文件: ['best_model_f1_0.9465', 'best_model_f1_0.9573', 'best_model_f1_0.9613', '.ipynb_checkpoints', 'best_model_f1_0.9624']


In [54]:
# 定义模型路径列表
model_paths = [
    "/root/newsclassification/checkpoints/checkpoint-56250",
    "/root/newsclassification/checkpoints/checkpoint-28125",
    "/root/newsclassification/best_model/best_model_f1_0.9613",
    "/root/newsclassification/best_model/best_model_f1_0.9624"
]

# 初始化用于存储所有模型预测结果的列表
all_predictions = []

# 遍历模型路径列表，加载每个模型并进行预测
for path in model_paths:
    # 从指定路径加载序列分类模型
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.to(device)  # 将模型移动到指定设备
    predictions = predict(model, tokenized_test)  # 调用预测函数获取预测结果
    all_predictions.append(predictions)  # 将预测结果添加到列表中

Predicting: 100%|██████████| 1563/1563 [03:02<00:00,  8.59it/s]
Predicting: 100%|██████████| 1563/1563 [03:03<00:00,  8.52it/s]
Predicting: 100%|██████████| 1563/1563 [03:03<00:00,  8.51it/s]
Predicting: 100%|██████████| 1563/1563 [03:03<00:00,  8.51it/s]


In [60]:
# 导入numpy和Counter类
import numpy as np
from collections import Counter

# 转置预测结果，使其形状为 (num_samples, num_models)
all_predictions = np.array(all_predictions).T

# 打印转置后的 all_predictions 的形状
print("After transpose:")
print(all_predictions.shape)

# 投票融合
final_predictions = []

for preds in all_predictions:
    # 对每个样本的预测结果进行投票，选择出现次数最多的类别
    most_common = Counter(preds).most_common(1)[0][0]
    final_predictions.append(most_common)

# 保存融合后的预测结果到CSV文件
pd.DataFrame({'label': final_predictions}).to_csv('/root/newsclassification/voting_submission.csv', index=False)
print("Voting model predictions saved to /root/newsclassification/voting_submission.csv")

After transpose:
(50000, 4)
Voting model predictions saved to /root/newsclassification/voting_submission.csv


In [56]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import Counter

In [59]:
np.array(all_predictions).shape

(4, 50000)