In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# notebook can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 6.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 26.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.6 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for

In [10]:
cd /content/drive/MyDrive/BERT

/content/drive/MyDrive/BERT


In [25]:
import warnings
warnings.filterwarnings('ignore')       # 隐藏警告！
from transformers import logging
logging.set_verbosity_warning()
import csv
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from tqdm import tqdm

In [6]:
# Display the installed PyTorch version string.
torch.__version__

'1.11.0+cu113'

In [7]:
# Display whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [26]:
def read_tsv(input_file, columns):
    """Load a tab-separated text file into a DataFrame with a leading row counter.

    Every line is stripped of surrounding whitespace and split on tabs. Lines
    that produce exactly one field (blank lines, or lines without any tab) are
    skipped. Each kept line is prefixed with a running counter that starts at 1
    and counts kept lines only.

    Args:
        input_file: Path of the UTF-8 encoded TSV file to read.
        columns: Column names for the result; the first name labels the
            counter column, the remaining names label the tab-separated fields.

    Returns:
        pandas.DataFrame with the counter column (int) followed by the fields
        as strings. If no line is kept, an empty DataFrame that still carries
        ``columns`` is returned.

    Raises:
        An error from pandas (ValueError in current versions) if the widest
        kept row does not match ``len(columns)``.
    """
    rows = []
    with open(input_file, "r", encoding="utf-8") as file:
        for line in file:
            fields = line.strip().split("\t")
            if len(fields) != 1:
                # len(rows) + 1 is the 1-based position among kept lines.
                rows.append([len(rows) + 1] + fields)
    # Passing columns to the constructor (instead of assigning df.columns
    # afterwards) keeps the empty-input case from failing on a length mismatch.
    return pd.DataFrame(rows, columns=columns)

# Dataset wrapper around tokenizer output
class bqDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, values)`` pairs
    where ``values[idx]`` is the feature for example ``idx``; ``labels`` is an
    indexable sequence whose entries can be converted with ``int()``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels may arrive as strings (e.g. read from a TSV), hence int().
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Accuracy computation
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose argmax equals the label.

    Args:
        preds: 2-D array-like of per-class scores; the predicted class of a
            row is the index of its largest value along axis 1.
        labels: NumPy array of true class indices (flattened before comparing).

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)


# Training routine
def train():
    """Run one pass over ``train_loader``, updating ``model`` with ``optim``.

    Relies on notebook-level globals: ``model``, ``optim``, ``train_loader``,
    ``device`` and the current ``epoch`` (used only in the progress messages).
    Prints the batch loss every 100 iterations and the mean batch loss at the
    end of the pass. Returns nothing.
    """
    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # With labels supplied, the first output element is the loss.
        loss = outputs[0]
        total_train_loss += loss.item()

        # Backward pass; clip the gradient norm to 1.0 before stepping.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update
        optim.step()

        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (
                epoch, iter_num, loss.item(), iter_num / total_iter * 100))

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / max(total_iter, 1)))


def validation():
    """Evaluate ``model`` on ``val_dataloader`` and print accuracy and mean loss.

    Relies on notebook-level globals: ``model``, ``val_dataloader`` and
    ``device``. Accuracy is the number of correct predictions divided by the
    number of evaluated samples, so a smaller final batch is not over-weighted
    (averaging per-batch accuracies would do that). The loss reported is the
    mean of the per-batch losses. Returns nothing.
    """
    model.eval()
    total_correct = 0
    total_samples = 0
    total_eval_loss = 0
    num_batches = len(val_dataloader)
    for batch in val_dataloader:
        with torch.no_grad():
            # Forward pass only; no gradients are tracked.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # With labels supplied, the outputs start with (loss, logits).
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy().flatten()
        predicted = np.argmax(logits, axis=1).flatten()
        total_correct += int(np.sum(predicted == label_ids))
        total_samples += label_ids.size

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_val_accuracy = total_correct / max(total_samples, 1)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f" % (total_eval_loss / max(num_batches, 1)))
    print("-------------------------------")

def predict():
    """Return the predicted class index for every example in ``test_dataloader``.

    Relies on notebook-level globals: ``model``, ``test_dataloader`` and
    ``device``. Only ``input_ids`` and ``attention_mask`` are read from each
    batch; no labels are passed to the model, so no loss is computed.

    Returns:
        list of argmax class indices, in the order the loader yields examples.
        NOTE: the list lines up with the rows of the test set only if
        ``test_dataloader`` iterates without shuffling.
    """
    model.eval()
    test_predict = []
    for batch in test_dataloader:
        with torch.no_grad():
            # Forward pass only; no gradients are tracked.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

        # Without labels there is no loss, so the logits are the first element
        # (integer indexing of a transformers ModelOutput skips None fields).
        logits = outputs[0].detach().cpu().numpy()
        test_predict += list(np.argmax(logits, axis=1).flatten())

    return test_predict

In [None]:
# Select the corpus to run on by changing the index.
corpus_name = ['lcqmc', 'bq_corpus', 'paws-x-zh'][0]

bq_train = read_tsv("data/" + corpus_name + "/train.tsv", ['index', 'question1', 'question2', 'label'])
# ---------------------------------------------
#  Known issue: when reading paws, the train set comes out short:
#  49129 rows (vs. 49401).
# ---------------------------------------------
q1_train = bq_train['question1']
q2_train = bq_train['question2']
train_label = bq_train['label']
bq_val = read_tsv("data/" + corpus_name + "/dev.tsv", ['index', 'question1', 'question2', 'label'])
q1_val = bq_val['question1']
q2_val = bq_val['question2']
val_label = bq_val['label']
bq_test = read_tsv("data/" + corpus_name + "/test.tsv", ['index', 'question1', 'question2'])
# The test file has no label column; use a placeholder 0 so the test split
# fits the same Dataset class as train/dev.
bq_test['label'] = 0
q1_test = bq_test['question1']
q2_test = bq_test['question2']
test_label = bq_test['label']


# input_ids: token ids
# token_type_ids: marks whether a token belongs to the first or second sentence
# attention_mask: marks real tokens versus padding

# Loading and using BERT models through transformers
from transformers import BertTokenizer

model_name = 'bert-base-chinese'

# Tokenizer / vocabulary
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encoding = tokenizer(list(q1_train), list(q2_train),
                            truncation=True, padding=True, max_length=100)
val_encoding = tokenizer(list(q1_val), list(q2_val),
                            truncation=True, padding=True, max_length=100)
test_encoding = tokenizer(list(q1_test), list(q2_test),
                            truncation=True, padding=True, max_length=100)

train_dataset = bqDataset(train_encoding, list(train_label))
val_dataset = bqDataset(val_encoding, list(val_label))
test_dataset = bqDataset(test_encoding, list(test_label))

from transformers import BertForNextSentencePrediction, get_linear_schedule_with_warmup
# transformers' own AdamW is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW

model = BertForNextSentencePrediction.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Wrap the datasets for batched loading. Only the training data is shuffled:
# predictions are written out by position, so the test loader must keep the
# original row order (and shuffling has no benefit for evaluation).
train_loader = DataLoader(train_dataset, batch_size=16, num_workers=6, pin_memory=True, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, num_workers=6, pin_memory=True, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, num_workers=6, pin_memory=True, shuffle=False)

# Optimizer. eps and weight_decay are set explicitly to the defaults of the
# transformers AdamW used previously (torch's own defaults differ).
optim = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
for epoch in range(1):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()
    torch.save(model.state_dict(), f'model_{epoch}.pt')

# Prediction: from here on test_label holds the predicted classes.
test_label = predict()
os.makedirs('result', exist_ok=True)
# newline='' is what the csv module expects so it controls line endings itself.
with open('result/' + corpus_name + '.tsv', 'w', newline='', encoding='utf-8') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['index', 'prediction'])
    for i, prediction in enumerate(test_label):
        tsv_writer.writerow([str(i), str(prediction)])

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.3819, 0.67%
epoth: 0, iter_num: 200, loss: 0.1078, 1.34%
epoth: 0, iter_num: 300, loss: 0.2957, 2.01%
epoth: 0, iter_num: 400, loss: 0.2816, 2.68%
epoth: 0, iter_num: 500, loss: 0.3192, 3.35%
epoth: 0, iter_num: 600, loss: 0.4785, 4.02%
epoth: 0, iter_num: 700, loss: 0.1929, 4.69%
epoth: 0, iter_num: 800, loss: 0.1609, 5.36%
epoth: 0, iter_num: 900, loss: 0.3505, 6.03%
epoth: 0, iter_num: 1000, loss: 0.0408, 6.70%
epoth: 0, iter_num: 1100, loss: 0.2830, 7.37%
epoth: 0, iter_num: 1200, loss: 0.4989, 8.04%
epoth: 0, iter_num: 1300, loss: 0.0529, 8.71%
epoth: 0, iter_num: 1400, loss: 0.0838, 9.38%
epoth: 0, iter_num: 1500, loss: 0.1795, 10.05%
epoth: 0, iter_num: 1600, loss: 0.1016, 10.72%
epoth: 0, iter_num: 1700, loss: 0.1392, 11.39%
epoth: 0, iter_num: 1800, loss: 0.3273, 12.06%
epoth: 0, iter_num: 1900, loss: 0.2368, 12.73%
epoth: 0, iter_num: 2000, loss: 0.2494, 13.40%
epoth: 0, iter_num: 2100, loss: 0.0600, 14.07