In [1]:
# pandas is used to load the dataset as a DataFrame
import pandas as pd
# file reading helpers
import codecs

# The training file is tab-separated and is read with explicit column names:
# two question columns followed by the label column.
train_df = pd.read_csv(
    'train.csv',
    sep='\t',
    names=['question1', 'question2', 'label'],
)

In [2]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,question1,question2,label
0,有哪些女明星被潜规则啦,哪些女明星被潜规则了,1
1,怎么支付宝绑定银行卡？,银行卡怎么绑定支付宝,1
2,请问这部电视剧叫什么名字,请问谁知道这部电视剧叫什么名字,1
3,泰囧完整版下载,エウテルペ完整版下载,0
4,在沧州市区哪家卖的盐焗鸡好吃？,沧州饭店哪家便宜又好吃又实惠,0


In [3]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

In [4]:
# Split the data into a training subset (90%) and a validation subset (10%).
# `stratify` samples by label so both subsets share the same label distribution.
# `random_state` pins the shuffle so that the split (and therefore every
# validation metric and saved checkpoint) is reproducible across kernel restarts.
# NOTE: despite its name, `test_label` holds the *validation* labels; the name is
# kept because a later cell builds `val_dataset` from it.
q1_train, q1_val, q2_train, q2_val, train_label, test_label = train_test_split(
    train_df['question1'],
    train_df['question2'],
    train_df['label'],
    test_size=0.1,
    stratify=train_df['label'],
    random_state=42,
)

In [5]:
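# input_ids: vocabulary ids of the tokens (characters) in the encoded pair
# token_type_ids: marks whether a token belongs to the first or the second sentence
# attention_mask: marks whether a position is a real token or padding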

In [6]:
# pip install transformers
# transformers provides loading and usage of BERT-family models
from transformers import BertTokenizer

# Tokenizer together with its vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Encode each (question1, question2) pair as a single sequence; sequences are
# truncated to at most 100 tokens and padded to a common length.
train_encoding = tokenizer(
    list(q1_train),
    list(q2_train),
    truncation=True,
    padding=True,
    max_length=100,
)
val_encoding = tokenizer(
    list(q1_val),
    list(q2_val),
    truncation=True,
    padding=True,
    max_length=100,
)



In [7]:
# Dataset wrapper around tokenizer output
class XFeiDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name to a per-sample indexable
            collection (anything exposing ``.items()``).
        labels: indexable collection of labels; each one is cast to ``int``
            when a sample is fetched.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    # Fetch a single sample
    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Wrap the encoded training and validation splits; the label Series are
# converted to plain lists so they can be indexed by position.
train_dataset = XFeiDataset(encodings=train_encoding, labels=list(train_label))
val_dataset = XFeiDataset(encodings=val_encoding, labels=list(test_label))

In [8]:
# Accuracy computation
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: NumPy array of class indices; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

In [9]:
from transformers import BertForNextSentencePrediction, get_linear_schedule_with_warmup
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation, so the same name is bound to torch.optim.AdamW here.
from torch.optim import AdamW

# NOTE(review): the next-sentence-prediction head is used here as a generic
# two-class classifier over the question pair — confirm this is intended.
model = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Batch the per-sample datasets. Only the training loader is shuffled:
# validation metrics do not depend on sample order, and a fixed order keeps
# evaluation deterministic.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Optimizer. eps and weight_decay are set explicitly to the defaults of the
# deprecated transformers AdamW (1e-6 and 0.0), because torch.optim.AdamW
# defaults to eps=1e-8 and weight_decay=1e-2.
optim = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Training routine
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optim``, ``device`` and ``train_loader``.
    The module-level ``epoch`` variable is read only for the log messages, so
    this function must be called from inside the epoch loop.
    Prints the current batch loss every 100 batches and the mean batch loss at
    the end of the pass.
    """
    model.train()
    n_batches = len(train_loader)
    running_loss = 0
    for step, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)
        # NOTE(review): any 'token_type_ids' present in the batch are not passed
        # to the model — confirm this is intended for a sentence-pair input.
        outputs = model(ids, attention_mask=mask, labels=targets)
        batch_loss = outputs[0]
        running_loss += batch_loss.item()

        # Backward pass; the gradient norm is clipped to 1.0
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update
        optim.step()

        if step % 100 == 0:
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, step, batch_loss.item(), step/n_batches*100))

    print("Epoch: %d, Average training loss: %.4f"%(epoch, running_loss/len(train_loader)))
    
def validation():
    """Evaluate ``model`` on ``val_dataloader`` and print accuracy and loss.

    Uses the module-level ``model``, ``device``, ``val_dataloader`` and
    ``flat_accuracy``. Accuracy is accumulated per sample (correct predictions
    divided by total samples) instead of averaging per-batch accuracies, so a
    smaller final batch is not over-weighted. The reported loss remains the
    mean of the per-batch losses.
    """
    model.eval()
    total_correct = 0
    total_samples = 0
    total_eval_loss = 0
    for batch in val_dataloader:
        with torch.no_grad():
            # Forward pass only; no gradients are tracked
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        # Convert the batch accuracy back into a count of correct predictions
        # so that every sample carries the same weight in the final figure.
        batch_size = len(label_ids)
        total_correct += flat_accuracy(logits, label_ids) * batch_size
        total_samples += batch_size

    avg_val_accuracy = total_correct / total_samples
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(val_dataloader)))
    print("-------------------------------")
    

# Train for five epochs; after each one, evaluate on the validation set and
# write a checkpoint. `epoch` must stay a module-level name because train()
# reads it for its log messages.
for epoch in range(5):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()
    checkpoint_path = f'model_{epoch}.pt'
    torch.save(model.state_dict(), checkpoint_path)

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.0017, 8.89%
epoth: 0, iter_num: 200, loss: 0.0006, 17.78%
epoth: 0, iter_num: 300, loss: 0.0004, 26.67%
epoth: 0, iter_num: 400, loss: 0.0013, 35.56%
epoth: 0, iter_num: 500, loss: 0.0002, 44.44%
epoth: 0, iter_num: 600, loss: 1.2181, 53.33%
epoth: 0, iter_num: 700, loss: 0.0142, 62.22%
epoth: 0, iter_num: 800, loss: 1.5131, 71.11%
epoth: 0, iter_num: 900, loss: 0.0013, 80.00%
epoth: 0, iter_num: 1000, loss: 3.3881, 88.89%
epoth: 0, iter_num: 1100, loss: 0.0013, 97.78%
Epoch: 0, Average training loss: 0.3522
Accuracy: 0.8960
Average testing loss: 0.5002
-------------------------------
------------Epoch: 1 ----------------
epoth: 1, iter_num: 100, loss: 0.0004, 8.89%
epoth: 1, iter_num: 200, loss: 0.0003, 17.78%
epoth: 1, iter_num: 300, loss: 0.0019, 26.67%
epoth: 1, iter_num: 400, loss: 0.0006, 35.56%
epoth: 1, iter_num: 500, loss: 0.0002, 44.44%
epoth: 1, iter_num: 600, loss: 0.0013, 53.33%
epoth: 1, iter_num: 700,

KeyboardInterrupt: 

In [10]:
# Restore the weights saved after epoch 0. `map_location` remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU machine
# can also be loaded when only a CPU is available.
model.load_state_dict(torch.load('model_0.pt', map_location=device))

<All keys matched successfully>

In [11]:
# Load the test file with the same tab-separated layout as the training file.
# Missing values in the label column are replaced with 0 so that the column
# can be used as a placeholder label later on.
test_df = pd.read_csv(
    'test.csv',
    sep='\t',
    names=['question1', 'question2', 'label'],
).assign(label=lambda frame: frame['label'].fillna(0))
test_df.head()

Unnamed: 0,question1,question2,label
0,玩梦幻西游能赚钱吗,梦幻西游2不花钱能玩吗,0.0
1,夏天去什么地方旅游好,夏季去什么地方旅游最好（国内的地方）,0.0
2,为什么梦幻西游网站打不开,为什么下载梦幻西游游戏补丁网页打不开,0.0
3,这对双胞胎像不,像这样的可爱卡通图片要男的谢了,0.0
4,免费网络游戏都有哪些?,有哪些免费的网络游戏,0.0


In [12]:
# Encode the test question pairs with the same tokenizer settings that were
# used for the training and validation data.
test_encoding = tokenizer(
    list(test_df['question1']),
    list(test_df['question2']),
    truncation=True,
    padding=True,
    max_length=100,
)

# The loader is not shuffled, so predictions come out in file order.
test_dataset = XFeiDataset(encodings=test_encoding, labels=list(test_df['label']))
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [13]:
def predict():
    """Return the predicted class index for every sample in ``test_dataloader``.

    Uses the module-level ``model``, ``device`` and ``test_dataloader``.
    Batches are consumed in loader order, so the returned list is aligned with
    the loader's sample order.

    Returns:
        list: one arg-max class index per test sample.
    """
    model.eval()
    test_predict = []
    for batch in test_dataloader:
        with torch.no_grad():
            # Forward pass only. No labels are passed: the test labels are
            # placeholders and a loss value is not needed for inference.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

        # With no labels supplied the model returns no loss, so the logits
        # are the first element of the output.
        logits = outputs[0].detach().cpu().numpy()
        test_predict += list(np.argmax(logits, axis=1).flatten())

    return test_predict

In [14]:
# Run inference on the test set and write the predictions as a single
# `label` column, without the index.
# NOTE: this rebinds `test_label`, which previously held the validation labels.
test_label = predict()
submission = pd.DataFrame({'label': test_label})
submission.to_csv('submit.csv', index=None)

In [3]:
# NOTE(review): this cell looks unrelated to the BERT question-pair pipeline
# above and carries an out-of-order execution count — confirm it belongs here.
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
# NOTE(review): preprocess_input / decode_predictions are imported from the
# resnet50 module although the model built below is VGG16 — confirm intended.
from keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np

# Rebinds the module-level name `model`, which earlier cells use for the BERT
# model; any earlier cell re-run after this one will see the VGG16 object.
# weights=None presumably builds the network without pretrained weights — TODO confirm.
model = VGG16(weights=None)