In [1]:
import jsonlines
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from scipy.stats import spearmanr
import random
from colorama import Fore, Style
import copy

In [2]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds NumPy, Python's ``random`` module and PyTorch. When CUDA is
    available, every visible GPU is seeded and cuDNN is switched to
    deterministic, non-autotuned kernels.

    Args:
        seed (int): Seed value shared by all generators.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    if torch.cuda.is_available():
        # Seed all visible GPUs, not only the current one.
        torch.cuda.manual_seed_all(seed)
        # If True, cuDNN only uses deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # The cuDNN autotuner may pick different kernels from run to run,
        # so it has to be off for repeatable results.
        torch.backends.cudnn.benchmark = False


# One seed shared by NumPy, `random` and PyTorch; applied once, before any data or model is created.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
# Tokenizer `max_length`: every sentence is truncated/padded to this many tokens.
MAXLEN = 64
# Batch size used by the DataLoaders in this notebook.
BATCH_SIZE = 32
# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [4]:
def load_snli_data(filename):
    """Read sentence triplets from a JSON Lines file.

    Args:
        filename (str): Path to a UTF-8 JSON Lines file whose records carry
            the keys 'origin', 'entailment' and 'contradiction'.

    Returns:
        list[tuple]: One ``(origin, entailment, contradiction)`` tuple per record.
    """
    triplets = []
    with open(filename, 'r', encoding='utf8') as f:
        for record in jsonlines.Reader(f):
            triplet = (record['origin'], record['entailment'], record['contradiction'])
            triplets.append(triplet)
    return triplets


def load_sts_data(filename):
    """Read sentence pairs and their similarity label from a '||'-separated file.

    Each non-blank line is split on ``"||"``; fields 1, 2 and 3 are kept
    (field 0 is ignored). The line terminator is stripped, so the label no
    longer carries a trailing newline, and blank lines are skipped instead
    of raising ``IndexError``.

    Args:
        filename (str): Path to a UTF-8 text file.

    Returns:
        list[tuple[str, str, str]]: ``(sentence_a, sentence_b, label)`` per line;
        the label is returned as the raw string found in the file.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
    """
    samples = []
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            fields = line.split("||")  # split once instead of once per field
            samples.append((fields[1], fields[2], fields[3]))
    return samples


# Training triplets come from the JSON Lines file; dev/test pairs come from the '||'-separated STS-B files.
train_data = load_snli_data('cnsd-snli-process/train.jsonl')
dev_data = load_sts_data('STS-B/cnsd-sts-dev.txt')
test_data = load_sts_data('STS-B/cnsd-sts-test.txt')

# Print one sample of each format as a quick sanity check.
print(train_data[0])
print(dev_data[0])

('一个女人正走在街对面吃香蕉，而一个男人正紧跟在他的公文包后面。', '那个女人在吃香蕉。', '一个女人走在人行道上吃冰淇淋，还有一个女人拿着钱包在她面前。')
('一个戴着安全帽的男人在跳舞。', '一个戴着安全帽的男人在跳舞。', '5\n')


In [5]:
# Tokenizer shared by both dataset classes; it is loaded from the same checkpoint name as the encoder.
TOKENIZER = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
# Show which tensors the tokenizer produces and which special tokens it defines.
print(TOKENIZER.model_input_names)
print(TOKENIZER.all_special_tokens)

['input_ids', 'token_type_ids', 'attention_mask']
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [6]:
class TrainDataset(Dataset):
    """Training dataset of (origin, entailment, contradiction) sentence triplets."""

    def __init__(self, data):
        # Indexable collection; each item holds the three sentences at positions 0, 1 and 2.
        self.data = data

    def __len__(self):
        return len(self.data)

    def text_2_id(self, text):
        """Tokenize the three sentences of one triplet in a single tokenizer call.

        The sentences are truncated/padded to MAXLEN and returned as 'pt' tensors.
        """
        sentences = [text[0], text[1], text[2]]
        return TOKENIZER(sentences, padding='max_length', truncation=True,
                         max_length=MAXLEN, return_tensors='pt')

    def __getitem__(self, index: int):
        triplet = self.data[index]
        return self.text_2_id(triplet)


# Shuffled training loader; only the first batch is pulled to check the tensor layout.
train_dataloader = DataLoader(TrainDataset(train_data), batch_size=BATCH_SIZE, shuffle=True)
for i in train_dataloader:
    # i['input_ids'].shape=[batch_size, 3, MAXLEN]
    print(i['input_ids'].shape)
    break

torch.Size([32, 3, 64])


In [7]:
class TestDataset(Dataset):
    """Evaluation dataset of (sentence_a, sentence_b, label) rows."""

    def __init__(self, data):
        # Indexable collection; each item holds two sentences and a label at positions 0, 1 and 2.
        self.data = data

    def __len__(self):
        return len(self.data)

    def text_2_id(self, text):
        """Tokenize `text`, truncated/padded to MAXLEN, as 'pt' tensors."""
        return TOKENIZER(text, padding='max_length', truncation=True,
                         max_length=MAXLEN, return_tensors='pt')

    def __getitem__(self, index):
        row = self.data[index]
        # Each sentence is wrapped in a one-element list before tokenization.
        source_encoding = self.text_2_id([row[0]])
        target_encoding = self.text_2_id([row[1]])
        label = int(row[2])
        return source_encoding, target_encoding, label


# Unshuffled dev loader; only the first batch is pulled to check the tensor layout.
dev_dataloader = DataLoader(TestDataset(dev_data), batch_size=BATCH_SIZE)
for i, j, k in dev_dataloader:
    # i / j: tokenized first / second sentence of each pair; k: labels
    print(i['input_ids'].shape)
    print(j['input_ids'].shape)
    print(k)  # k.shape=[batch_size]
    break

torch.Size([32, 1, 64])
torch.Size([32, 1, 64])
tensor([5, 4, 2, 2, 2, 5, 2, 5, 3, 1, 5, 4, 0, 2, 5, 4, 3, 1, 3, 2, 1, 1, 4, 1,
        2, 5, 5, 4, 3, 5, 5, 1])


In [8]:
class SimcseModel(nn.Module):
    """Supervised SimCSE sentence encoder.

    Wraps a pretrained transformer loaded through ``AutoModel`` and reduces
    its token-level output to one vector per sentence.

    Supported ``pooling`` values:
        'cls'            -- last-layer hidden state at position 0.
        'pooler'         -- the encoder's ``pooler_output``.
        'last-avg'       -- mean of the last layer over non-padding tokens.
        'first-last-avg' -- mean over non-padding tokens of the average of
                            ``hidden_states[1]`` and ``hidden_states[-1]``.

    Both average poolings weight tokens by ``attention_mask`` so that
    padding positions do not dilute the sentence vector.
    """

    SUPPORTED_POOLING = ('cls', 'pooler', 'last-avg', 'first-last-avg')

    def __init__(self, pretrained_model, pooling):
        """
        Args:
            pretrained_model (str): Checkpoint name or path for ``AutoModel.from_pretrained``.
            pooling (str): One of ``SUPPORTED_POOLING``.

        Raises:
            ValueError: If ``pooling`` is not a supported strategy.
        """
        super(SimcseModel, self).__init__()
        # Fail fast: an unknown strategy used to make forward() silently return None.
        if pooling not in self.SUPPORTED_POOLING:
            raise ValueError(
                f"Unsupported pooling {pooling!r}; expected one of {self.SUPPORTED_POOLING}")
        # The supervised setup does not need to modify the checkpoint's dropout.
        self.bert = AutoModel.from_pretrained(pretrained_model)
        self.pooling = pooling

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Average `hidden` ([batch, seq_len, hidden]) over tokens where the mask is 1."""
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)  # [batch, seq_len, 1]
        token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
        return (hidden * mask).sum(dim=1) / token_count  # [batch, hidden]

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode a batch of token ids into sentence vectors.

        Args:
            input_ids, attention_mask, token_type_ids: Tensors of shape
                [batch_size, seq_len], as produced by the tokenizer.

        Returns:
            torch.Tensor: Sentence embeddings of shape [batch_size, hidden_size].

        Raises:
            ValueError: If ``self.pooling`` was changed to an unsupported value.
        """
        # Only 'first-last-avg' reads intermediate layers; skip them otherwise.
        needs_all_layers = self.pooling == 'first-last-avg'
        # Keyword arguments keep the call correct regardless of the encoder's positional order.
        out = self.bert(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        output_hidden_states=needs_all_layers)
        # out.last_hidden_state.shape=[batch_size, seq_len, hidden_size]

        if self.pooling == 'cls':
            return out.last_hidden_state[:, 0]

        if self.pooling == 'pooler':
            return out.pooler_output

        if self.pooling == 'last-avg':
            return self._masked_mean(out.last_hidden_state, attention_mask)

        if self.pooling == 'first-last-avg':
            # out.hidden_states is a tuple of per-layer outputs.
            first = out.hidden_states[1]
            last = out.hidden_states[-1]
            return self._masked_mean((first + last) / 2.0, attention_mask)

        raise ValueError(
            f"Unsupported pooling {self.pooling!r}; expected one of {self.SUPPORTED_POOLING}")

In [9]:
def simcse_sup_loss(y_pred, temperature=0.05):
    """Supervised SimCSE contrastive loss over a batch of sentence triplets.

    Rows of ``y_pred`` must be consecutive (origin, entailment, contradiction)
    triplets: rows 0, 1, 2 form the first triplet, rows 3, 4, 5 the second, ...
    Every origin row and every entailment row acts as an anchor. The target
    of an anchor is the other member of its (origin, entailment) pair, and
    all remaining rows in the batch (its own contradiction and every row of
    the other triplets) act as negatives.

    All helper tensors are created on ``y_pred.device``, so the function does
    not depend on any global device setting.

    Args:
        y_pred (torch.Tensor): Sentence embeddings, shape [batch_size * 3, hidden_size].
        temperature (float): Divisor applied to the cosine similarities.

    Returns:
        torch.Tensor: Scalar mean cross-entropy loss.

    Raises:
        ValueError: If the number of rows is not a multiple of 3.
    """
    num_rows = y_pred.shape[0]
    if num_rows % 3 != 0:
        raise ValueError(
            f"y_pred must hold whole triplets; got {num_rows} rows, which is not a multiple of 3")
    device = y_pred.device

    row_ids = torch.arange(num_rows, device=device)
    # Anchor rows, skipping every third (contradiction) row: [0,1, 3,4, 6,7, ...]
    use_row = torch.where((row_ids + 1) % 3 != 0)[0]
    # Target column for each anchor row: [1,0, 4,3, 7,6, ...]
    y_true = (use_row - use_row % 3 * 2) + 1
    # sim.shape=[batch_size * 3, batch_size * 3]
    sim = F.cosine_similarity(y_pred.unsqueeze(1), y_pred.unsqueeze(0), dim=-1)
    # Exclude self-similarity. masked_fill with -inf is safe in any float dtype,
    # whereas multiplying an identity matrix by 1e12 overflows in half precision.
    diagonal = torch.eye(num_rows, dtype=torch.bool, device=device)
    sim = sim.masked_fill(diagonal, float('-inf'))
    # Keep only the anchor rows.
    sim = torch.index_select(sim, 0, use_row)
    sim = sim / temperature
    # cross_entropy already averages over the anchor rows and returns a scalar.
    return F.cross_entropy(sim, y_true)

In [10]:
# Encoder with 'cls' pooling, moved to DEVICE before the optimizer is built over its parameters.
model = SimcseModel(pretrained_model='hfl/chinese-roberta-wwm-ext', pooling='cls')
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Model evaluation
def _encode_batch(model, batch, device):
    """Drop the singleton sentence axis of a tokenized batch, move it to `device` and encode it."""
    # [batch_size, 1, seq_len] -> [batch_size, seq_len]
    input_ids = batch['input_ids'].squeeze(1).to(device)
    attention_mask = batch['attention_mask'].squeeze(1).to(device)
    token_type_ids = batch['token_type_ids'].squeeze(1).to(device)
    # shape=[batch_size, hidden_size]
    return model(input_ids, attention_mask, token_type_ids)


def eval(model, dataloader):
    """Spearman correlation between predicted cosine similarities and labels.

    NOTE: the name shadows the built-in ``eval``; it is kept because other
    cells call it by this name.

    The model is switched to eval mode and left in that mode. Inputs are
    moved to the device of the model's parameters (CPU if it has none).

    Args:
        model: Callable module taking (input_ids, attention_mask, token_type_ids).
        dataloader: Iterable yielding ``(source, target, label)`` where source
            and target are tokenizer outputs of shape [batch_size, 1, seq_len]
            and label has shape [batch_size].

    Returns:
        float: Spearman rank correlation, or NaN when the dataloader is empty.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Collect per-batch results and concatenate once at the end instead of
    # re-allocating a growing tensor/array on every step.
    sim_chunks = []
    label_chunks = []

    with torch.no_grad():
        for source, target, label in dataloader:
            source_pred = _encode_batch(model, source, device)
            target_pred = _encode_batch(model, target, device)
            # sim.shape=[batch_size]
            sim = F.cosine_similarity(source_pred, target_pred, dim=-1)
            sim_chunks.append(sim.cpu())
            label_chunks.append(np.asarray(label).reshape(-1))

    if not sim_chunks:
        return float('nan')

    labels = np.concatenate(label_chunks)
    sims = torch.cat(sim_chunks, dim=0).numpy()
    # Rank-based correlation: only the ordering of the similarities matters.
    return spearmanr(labels, sims).correlation

In [12]:
# Model training with periodic evaluation
def train(model, train_dl, dev_dl, optimizer, best, eval_every=10, patience=20):
    """Train for one pass over `train_dl`, evaluating on `dev_dl` at a fixed interval.

    Args:
        model: Sentence encoder called as model(input_ids, attention_mask, token_type_ids).
        train_dl: Iterable of tokenized triplet batches, each tensor shaped
            [batch_size, 3, seq_len].
        dev_dl: Dataloader passed to ``eval`` for validation.
        optimizer: Optimizer over the model's parameters.
        best (list): Mutable ``[best_corrcoef, best_state_dict]`` pair. It is
            updated in place whenever a validation run beats ``best[0]``.
        eval_every (int): Validate after every `eval_every` training steps.
        patience (int): Stop once this many consecutive validation runs in
            the current call fail to improve on ``best[0]``.

    Returns:
        bool: True if the pass was cut short by early stopping, False if all
        batches were consumed. Callers that ignore the result behave as before.

    Raises:
        ValueError: If `eval_every` or `patience` is smaller than 1.
    """
    if eval_every < 1 or patience < 1:
        raise ValueError("eval_every and patience must both be >= 1")

    model.train()

    stale_evals = 0  # consecutive validation runs without improvement
    for batch_idx, source in enumerate(train_dl, start=1):
        real_batch_num = source['input_ids'].shape[0]
        # Flatten triplets into rows: [batch_size, 3, seq_len] -> [batch_size * 3, seq_len]
        input_ids = source['input_ids'].view(real_batch_num * 3, -1).to(DEVICE)
        attention_mask = source['attention_mask'].view(real_batch_num * 3, -1).to(DEVICE)
        token_type_ids = source['token_type_ids'].view(real_batch_num * 3, -1).to(DEVICE)

        out = model(input_ids, attention_mask, token_type_ids)
        loss = simcse_sup_loss(out)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % eval_every != 0:
            continue

        print('| step {:5d} | loss {:8.5f} |'.format(batch_idx, loss.item()))
        corrcoef = eval(model, dev_dl)
        model.train()  # eval() leaves the model in eval mode
        if best[0] < corrcoef:
            # Snapshot the weights; deepcopy detaches them from further updates.
            best[:] = [corrcoef, copy.deepcopy(model.state_dict())]
            stale_evals = 0
            continue

        stale_evals += 1
        if stale_evals >= patience:  # i.e. patience * eval_every steps without improvement
            print(Fore.RED + f"corrcoef doesn't improve for {stale_evals} batch, early stop!")
            print(Style.RESET_ALL, end='')
            return True

    return False

In [13]:
# best[0]: best dev-set correlation so far; best[1]: the matching model state dict.
# best[1] stays None until some validation run scores above 0.0.
best = [0.0, None]
# NOTE(review): train() is called for every epoch here, whether or not the previous epoch stopped early.
for epoch in range(2):
    print('*' * 20 + str(epoch) + '*' * 20)
    train(model, train_dataloader, dev_dataloader, optimizer, best)

********************0********************
| step    10 | loss  1.80268 |
| step    20 | loss  0.95212 |
| step    30 | loss  1.56213 |
| step    40 | loss  1.04055 |
| step    50 | loss  1.71093 |
| step    60 | loss  1.46353 |
| step    70 | loss  0.81570 |
| step    80 | loss  1.18023 |
| step    90 | loss  0.94789 |
| step   100 | loss  0.69612 |
| step   110 | loss  0.94707 |
| step   120 | loss  1.37303 |
| step   130 | loss  1.32752 |
| step   140 | loss  0.98692 |
| step   150 | loss  0.84953 |
| step   160 | loss  1.46934 |
| step   170 | loss  1.24137 |
| step   180 | loss  1.28791 |
| step   190 | loss  1.34628 |
| step   200 | loss  0.84407 |
| step   210 | loss  1.12489 |
| step   220 | loss  0.68530 |
| step   230 | loss  0.67318 |
| step   240 | loss  1.23049 |
| step   250 | loss  1.17653 |
| step   260 | loss  0.97337 |
| step   270 | loss  1.24604 |
| step   280 | loss  1.13769 |
| step   290 | loss  0.58476 |
| step   300 | loss  1.01274 |
| step   310 | loss  0.60883

In [14]:
# Best dev-set Spearman correlation recorded during training.
best[0]

0.8154346648730026

In [15]:
# Unshuffled loader over the held-out test pairs.
test_dataloader = DataLoader(TestDataset(test_data), batch_size=BATCH_SIZE)

In [19]:
# Rebuild the encoder and load the checkpoint saved at the best dev score.
best_model = SimcseModel(pretrained_model='hfl/chinese-roberta-wwm-ext', pooling='cls')
best_model.load_state_dict(best[1])
best_model.to(DEVICE)

# Best checkpoint
dev_corrcoef_best = eval(best_model, dev_dataloader)
test_corrcoef_best = eval(best_model, test_dataloader)
print(f'dev_corrcoef: {dev_corrcoef_best:.8f}')
print(f'test_corrcoef: {test_corrcoef_best:.8f}')

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dev_corrcoef: 0.81543466
test_corrcoef: 0.78066004


In [20]:
# Evaluate `model` as it stands after the training loop, i.e. without restoring the best checkpoint.
dev_corrcoef_last = eval(model, dev_dataloader)
test_corrcoef_last = eval(model, test_dataloader)

# Final weights (not early-stopped)
print(f'dev_corrcoef: {dev_corrcoef_last:.8f}')
print(f'test_corrcoef: {test_corrcoef_last:.8f}')

dev_corrcoef: 0.80799295
test_corrcoef: 0.77897552


In [21]:
# Baseline: the pretrained encoder with no training in this notebook.
model_init = SimcseModel(pretrained_model='hfl/chinese-roberta-wwm-ext', pooling='cls')
model_init.to(DEVICE)
dev_corrcoef_init = eval(model_init, dev_dataloader)
test_corrcoef_init = eval(model_init, test_dataloader)

# Untrained
print(f'dev_corrcoef: {dev_corrcoef_init:.8f}')
print(f'test_corrcoef: {test_corrcoef_init:.8f}')

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dev_corrcoef: 0.71675599
test_corrcoef: 0.68252043
