In [1]:
# pandas 数据集读取，dataframe形式的
import pandas as pd
# 文件读取
import codecs

def _with_text_column(frame):
    """Return a copy of ``frame`` with cleaned Title/Abstract and a ``text`` column.

    Missing titles or abstracts become empty strings (a missing Title would
    otherwise crash on ``.strip()``), surrounding whitespace is removed, and
    ``text`` is the lower-cased "<title> <abstract>" concatenation.
    """
    title = frame['Title'].fillna('').astype(str).str.strip()
    abstract = frame['Abstract'].fillna('').astype(str).str.strip()
    return frame.assign(
        Title=title,
        Abstract=abstract,
        text=(title + ' ' + abstract).str.lower(),
    )


train_df = pd.read_csv('./基于论文摘要的文本分类与查询性问答公开数据/train.csv', sep=',')
test_df = pd.read_csv('./基于论文摘要的文本分类与查询性问答公开数据/test.csv', sep=',')

# Drop rows without a label. `.copy()` makes the result an independent frame so
# the column assignment below does not write into a slice of the raw frame.
train_df = train_df[~train_df['Topic(Label)'].isnull()].copy()

# Encode the label strings as integer ids; `lbl` keeps the id -> label mapping
# that is needed to decode predictions later.
train_df['Topic(Label)'], lbl = pd.factorize(train_df['Topic(Label)'])

# Same cleaning for both splits.
train_df = _with_text_column(train_df)
test_df = _with_text_column(test_df)

In [2]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

In [3]:
# pip install transformers
# transformers bert相关的模型使用和加载
from transformers import AutoTokenizer
# 分词器，词典

# Load the vocabulary/tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every text of each split in one call; sequences are truncated to at
# most 512 tokens and padded.
train_encoding = tokenizer(
    train_df['text'].tolist(),
    max_length=512,
    padding=True,
    truncation=True,
)
test_encoding = tokenizer(
    test_df['text'].tolist(),
    max_length=512,
    padding=True,
    truncation=True,
)



In [4]:
# Dataset wrapper around the tokenizer output
class XunFeiDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is an indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample


# Training samples carry their factorized label ids; the test split has no
# labels, so a zero placeholder is supplied for every test row.
train_dataset = XunFeiDataset(
    encodings=train_encoding,
    labels=train_df['Topic(Label)'].tolist(),
)
test_dataset = XunFeiDataset(
    encodings=test_encoding,
    labels=[0 for _ in range(len(test_df))],
)

# Batch the single-sample datasets. Only the training loader shuffles; the test
# loader keeps the original row order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)

In [5]:
# Accuracy metric
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    ``preds`` holds one row of class scores per sample; ``labels`` is an array
    of class ids that is flattened before the comparison.
    """
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

In [6]:
from transformers import AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# Size the classification head from the label vocabulary returned by
# pd.factorize (`lbl`) instead of a hard-coded 12, so the number of output
# classes always matches the label ids that are fed to the loss and decoded
# after prediction.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(lbl))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
# NOTE(review): the AdamW shipped with transformers is reported as deprecated in
# newer releases in favour of torch.optim.AdamW - confirm against the installed
# version before upgrading.
optim = AdamW(model.parameters(), lr=1e-5)
# Optimizer steps in ONE pass over train_loader.
# NOTE(review): no scheduler is created from this value in the visible cells,
# and the driver loop runs more than one epoch - revisit if a scheduler is added.
total_steps = len(train_loader) * 1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Training routine
def train(epoch=None):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Uses the notebook-level ``model``, ``optim``, ``train_loader`` and ``device``.

    Args:
        epoch: Epoch number shown in the progress messages. When omitted, the
            notebook-level ``epoch`` variable is used if it exists (this is how
            the driver loop calls the function); otherwise 0 is used, so the
            function no longer fails with NameError when called on its own.

    Returns:
        The mean training loss over the processed batches (0.0 if the loader
        yields no batches).
    """
    if epoch is None:
        # Backward compatible with the original implicit-global behaviour.
        epoch = globals().get('epoch', 0)

    model.train()
    total_train_loss = 0.0
    total_iter = len(train_loader)
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # first output is the loss when labels are passed
        loss_value = loss.item()
        total_train_loss += loss_value

        # Backward pass; clip the global gradient norm to 1.0 before stepping.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update
        optim.step()
        # scheduler.step()  # disabled: no scheduler is created in this notebook

        # Progress line every 100 batches.
        if iter_num % 100 == 0:
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" %
                  (epoch, iter_num, loss_value, iter_num/total_iter*100))

    # Guard the average against an empty loader.
    avg_train_loss = total_train_loss / total_iter if total_iter else 0.0
    print("Epoch: %d, Average training loss: %.4f" %
          (epoch, avg_train_loss))
    return avg_train_loss


def validation(loader=None):
    """Evaluate ``model`` on a labelled loader and print accuracy and loss.

    Uses the notebook-level ``model``, ``device`` and ``flat_accuracy``.

    Args:
        loader: Iterable of batches with ``input_ids``, ``attention_mask`` and
            ``labels``. Defaults to the notebook-level ``test_dataloader`` for
            backward compatibility. That loader is built with placeholder zero
            labels in this notebook, so pass a real held-out loader to get
            meaningful numbers.

    Returns:
        ``(average_accuracy, average_loss)``, both averaged per batch
        (0.0 each if the loader yields no batches).
    """
    if loader is None:
        loader = test_dataloader

    model.eval()
    total_eval_accuracy = 0.0
    total_eval_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in loader:
            # Forward pass only; no gradients are tracked.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels)

            # With labels supplied the model returns (loss, logits, ...).
            loss, logits = outputs[0], outputs[1]

            total_eval_loss += loss.item()
            total_eval_accuracy += flat_accuracy(
                logits.detach().cpu().numpy(), labels.to('cpu').numpy())
            num_batches += 1

    # Guard the averages against an empty loader.
    denom = max(num_batches, 1)
    avg_val_accuracy = total_eval_accuracy / denom
    avg_val_loss = total_eval_loss / denom
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f" %
          (avg_val_loss))
    print("-------------------------------")
    return avg_val_accuracy, avg_val_loss

# Driver loop: two passes over the training data.
# The loop variable must be called `epoch` and live at notebook level, because
# train() reads it as a global for its progress messages.
for epoch in range(2):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    # validation() is left disabled: it would score against test_dataloader,
    # whose labels are placeholder zeros in this notebook.
    # validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 2.2585, 4.85%
epoth: 0, iter_num: 200, loss: 1.7927, 9.69%
epoth: 0, iter_num: 300, loss: 0.9374, 14.54%
epoth: 0, iter_num: 400, loss: 0.4790, 19.39%
epoth: 0, iter_num: 500, loss: 0.5791, 24.24%
epoth: 0, iter_num: 600, loss: 0.3968, 29.08%
epoth: 0, iter_num: 700, loss: 0.2969, 33.93%
epoth: 0, iter_num: 800, loss: 0.3103, 38.78%
epoth: 0, iter_num: 900, loss: 0.0524, 43.63%
epoth: 0, iter_num: 1000, loss: 0.0366, 48.47%
epoth: 0, iter_num: 1100, loss: 0.2020, 53.32%
epoth: 0, iter_num: 1200, loss: 0.0448, 58.17%
epoth: 0, iter_num: 1300, loss: 0.6537, 63.02%
epoth: 0, iter_num: 1400, loss: 0.0215, 67.86%
epoth: 0, iter_num: 1500, loss: 0.8063, 72.71%
epoth: 0, iter_num: 1600, loss: 0.0250, 77.56%
epoth: 0, iter_num: 1700, loss: 0.1452, 82.40%
epoth: 0, iter_num: 1800, loss: 0.2006, 87.25%
epoth: 0, iter_num: 1900, loss: 0.0123, 92.10%
epoth: 0, iter_num: 2000, loss: 0.8850, 96.95%
Epoch: 0, Average training loss: 

KeyboardInterrupt: 

In [8]:
def prediction():
    """Return the arg-max class index for every sample in ``test_dataloader``.

    Uses the notebook-level ``model`` and ``device``. The result is a flat
    list with one entry per sample, in the order the loader yields them.
    """
    model.eval()
    predicted = []
    with torch.no_grad():
        for batch in test_dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            logits = model(ids, mask).logits
            # Highest-scoring class per row, moved to host memory.
            predicted.extend(logits.argmax(1).detach().cpu().numpy())
    return predicted

In [9]:
# Run inference over test_dataloader; the result holds one predicted class
# index per test sample, in loader order (the test loader does not shuffle).
test_predict = prediction()

In [10]:
# Decode the predicted class ids back to the original label strings using the
# id -> label mapping (`lbl`) returned by pd.factorize.
test_df['Topic(Label)'] = [lbl[class_id] for class_id in test_predict]

# Write only the predicted label column. `index=False` is the documented way to
# omit the row index (the previous `index=None` only worked because it is falsy).
test_df[['Topic(Label)']].to_csv('bert_submit3.csv', index=False)