In [81]:
# Load the train and test splits into pandas DataFrames.
# Both files are tab-separated and are read without a header row, so the columns
# are addressed by integer position (0, 1, 2) rather than by name.
import pandas as pd
import codecs

train_df = pd.read_csv(
    './中文对话文本匹配挑战赛数据集/train.csv',
    sep='\t',
    header=None,
)
test_df = pd.read_csv(
    './中文对话文本匹配挑战赛数据集/test.csv',
    sep='\t',
    header=None,
)

# Disabled filter that would keep only rows whose sentences are strings.
# NOTE(review): it refers to 'question1'/'question2', names that do not exist
# when the file is read with header=None; it would need columns 0 and 1 instead.
# train_df = train_df[train_df['question2'].apply(lambda x: isinstance(x, str))]
# train_df = train_df[train_df['question1'].apply(lambda x: isinstance(x, str))]

In [82]:
# Preview the first five training rows (head() defaults to n=5).
train_df.head()

Unnamed: 0,0,1,2
0,藏獒为什么这么贵,藏獒见人不咬为什么,0
1,人生应该怎么才算精彩？,人生要怎么过才算精彩啊,1
2,为什么打牌老是输,为什么我枪神纪进不去了,0
3,现在网上卖什么最赚钱,网上卖什么最赚钱,1
4,如何提高气质,怎样提高自身气质？,1


In [83]:
# Preview the first five test rows (head() defaults to n=5).
test_df.head()

Unnamed: 0,0,1
0,成语中的历史人物,成语有关的历史人物
1,黄财神怎样供奉,怎样供奉财神
2,进门是餐厅好吗,进门见餐厅好吗
3,怎么提高理解力？,怎样提高理解力
4,仓鼠用什么磨牙都可以吗,用什么可以代替仓鼠的磨牙棒


In [84]:
# Count how many training rows carry each value of column 2 (the label column).
train_df[2].value_counts()

1    29039
0    20961
Name: 2, dtype: int64

In [85]:
# (number of rows, number of columns) of the training frame.
train_df.shape

(50000, 3)

In [86]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

In [87]:
# Split the labelled data into a training part (90%) and a validation part (10%).
# stratify samples by label so both parts share the same label distribution.
# random_state pins the shuffle: without it every run produces a different
# validation set, so validation metrics are not comparable between runs.
q1_train, q1_val, q2_train, q2_val, train_label, val_label = train_test_split(
    train_df[0],
    train_df[1],
    train_df[2],
    test_size=0.1,
    stratify=train_df[2],
    random_state=42)

In [88]:
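# input_ids: vocabulary id of each token
# token_type_ids: marks whether a token belongs to the first or the second sentence
# attention_mask: marks which positions are real tokens and which are padding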

In [89]:
# pip install transformers
# transformers: loading and using BERT-family models
from transformers import BertTokenizer
# Tokenizer together with its vocabulary
import transformers
transformers.logging.set_verbosity_error()

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')


def _encode_pairs(first_sentences, second_sentences):
    """Tokenize aligned sentence pairs with the shared tokenizer.

    Both arguments are materialised as lists and encoded pairwise, with
    truncation enabled, padding enabled and a maximum length of 32 tokens.
    """
    return tokenizer(list(first_sentences), list(second_sentences),
                     truncation=True, padding=True, max_length=32)


# Each split is encoded by its own tokenizer call, exactly one call per split.
train_encoding = _encode_pairs(q1_train, q2_train)

val_encoding = _encode_pairs(q1_val, q2_val)

test_encoding = _encode_pairs(test_df[0], test_df[1])

In [90]:
# Dataset wrapper around the tokenizer output
class QuoraDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample


# Wrap each encoded split in a dataset.
# The test split is given all-zero placeholder labels, one per test row, so it can
# reuse the same dataset class as the labelled splits.
train_dataset = QuoraDataset(encodings=train_encoding, labels=list(train_label))
val_dataset = QuoraDataset(encodings=val_encoding, labels=list(val_label))
test_dataset = QuoraDataset(encodings=test_encoding, labels=[0] * len(test_df))

In [91]:
# Accuracy computation
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    ``preds`` holds per-class scores with the class axis at position 1;
    ``labels`` is an array of class indices. Both are flattened before comparing.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

In [92]:
from transformers import BertForNextSentencePrediction, get_linear_schedule_with_warmup
# The AdamW shipped with transformers is deprecated in favour of the PyTorch
# implementation, so the optimizer is taken from torch.optim instead.
from torch.optim import AdamW

model = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Wrap the per-sample datasets in batched loaders.
# Only the training data is shuffled: evaluation does not benefit from shuffling,
# and a fixed order keeps per-batch validation numbers identical between runs.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Optimizer.
# eps and weight_decay are passed explicitly so the hyperparameters match the
# defaults of the previously used transformers AdamW (torch's defaults differ).
optim = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# NOTE(review): total_steps is not consumed by any cell visible here, and it counts
# a single epoch while the training loop runs three. If a learning-rate scheduler
# is added later, this should become len(train_loader) * <number of epochs>.
total_steps = len(train_loader) * 1



In [None]:
# Training routine
def train(epoch=None):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Relies on the notebook-level ``model``, ``optim``, ``device`` and
    ``train_loader``.

    Args:
        epoch: Epoch index shown in the progress lines. When omitted, the
            notebook-level variable ``epoch`` (set by the surrounding
            ``for epoch in ...`` loop) is used; if that does not exist either,
            0 is used. This keeps ``train()`` callable outside the loop
            instead of failing with a NameError.
    """
    if epoch is None:
        epoch = globals().get('epoch', 0)

    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    # NOTE(review): only input_ids and attention_mask are forwarded to the model.
    # The token_type_ids field is not passed here, nor in validation() or
    # prediciton(); if it is added, it must be added in all three together so
    # training and inference see the same inputs.
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()

        # Backward pass; the gradient norm is clipped to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update
        optim.step()

        # Progress line every 100 batches
        if iter_num % 100 == 0:
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss/max(total_iter, 1)))


def validation():
    """Evaluate ``model`` on ``val_dataloader`` and print accuracy and mean loss.

    Relies on the notebook-level ``model``, ``device``, ``val_dataloader`` and
    ``flat_accuracy``. Both metrics are weighted by batch size, so a smaller
    final batch does not count as much as a full one (averaging per-batch
    values would over-weight its samples).
    """
    model.eval()
    weighted_accuracy = 0.0
    weighted_loss = 0.0
    total_samples = 0
    # NOTE(review): token_type_ids is not forwarded to the model, matching
    # train() and prediciton(); change all three together if it is added.
    with torch.no_grad():
        for batch in val_dataloader:
            # Forward pass only; no gradients are needed for evaluation
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs[0]
            logits = outputs[1].detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            batch_size = len(label_ids)
            weighted_loss += loss.item() * batch_size
            weighted_accuracy += flat_accuracy(logits, label_ids) * batch_size
            total_samples += batch_size

    if total_samples == 0:
        # Nothing to average; the original code would divide by zero here
        print("Validation set is empty; nothing to evaluate.")
        return

    avg_val_accuracy = weighted_accuracy / total_samples
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average validation loss: %.4f" % (weighted_loss/total_samples))


# Three epochs: each one trains on the full training loader, then evaluates.
# The loop variable must stay named `epoch`, because train() reads it for its
# progress messages.
for epoch in range(3):
    banner = "------------Epoch: %d ----------------" % epoch
    print(banner)
    train()
    validation()

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.0928, 3.55%
epoth: 0, iter_num: 200, loss: 0.4208, 7.11%
epoth: 0, iter_num: 300, loss: 0.1421, 10.66%
epoth: 0, iter_num: 400, loss: 0.3197, 14.22%
epoth: 0, iter_num: 500, loss: 0.2212, 17.77%
epoth: 0, iter_num: 600, loss: 0.3806, 21.33%
epoth: 0, iter_num: 700, loss: 0.3617, 24.88%
epoth: 0, iter_num: 800, loss: 0.1321, 28.44%
epoth: 0, iter_num: 900, loss: 0.6039, 31.99%
epoth: 0, iter_num: 1000, loss: 0.1054, 35.55%
epoth: 0, iter_num: 1100, loss: 0.3961, 39.10%
epoth: 0, iter_num: 1200, loss: 0.3404, 42.66%
epoth: 0, iter_num: 1300, loss: 0.4050, 46.21%
epoth: 0, iter_num: 1400, loss: 0.5042, 49.77%
epoth: 0, iter_num: 1500, loss: 0.1580, 53.32%
epoth: 0, iter_num: 1600, loss: 0.4447, 56.88%
epoth: 0, iter_num: 1700, loss: 0.1941, 60.43%
epoth: 0, iter_num: 1800, loss: 0.2821, 63.99%
epoth: 0, iter_num: 1900, loss: 0.3955, 67.54%
epoth: 0, iter_num: 2000, loss: 0.2193, 71.10%
epoth: 0, iter_num: 2100, loss: 0

In [78]:
def prediciton():
    """Predict a class index for every sample served by ``test_dataloader``.

    Relies on the notebook-level ``model``, ``device`` and ``test_dataloader``.

    Returns:
        list: one predicted class index (arg-max over the logits) per test
        sample, in the order the loader yields them.
    """
    model.eval()
    prediction_list = []
    # NOTE(review): token_type_ids is not forwarded to the model, matching
    # train() and validation(); change all three together if it is added.
    with torch.no_grad():
        for batch in test_dataloader:
            # Forward pass only
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # The placeholder labels in the batch are deliberately not passed:
            # they would only make the model compute a meaningless loss. Without
            # labels, the logits are the first element of the model output.
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs[0]

            preds = logits.detach().cpu().numpy()
            # tolist() yields plain Python ints rather than numpy scalars
            prediction_list.extend(np.argmax(preds, axis=1).flatten().tolist())
    return prediction_list

In [79]:
# Run inference over the test loader and keep the list of predicted class indices.
test_result = prediciton()

In [80]:
# Write the predictions to submit.csv as a single column, with the header row
# and the index column both suppressed.
submission = pd.DataFrame(test_result)
submission.to_csv('submit.csv', header=None, index=None)