In [1]:
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, BertConfig
import torch
import numpy as np
import random
import jsonlines
import torch.nn.functional as F
from scipy.stats import spearmanr
import copy

In [2]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch's CPU generator. When
    CUDA is available it also seeds all GPUs and configures cuDNN so that it
    behaves deterministically.

    Args:
        seed: Integer seed applied to all generators.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    if torch.cuda.is_available():
        # Seeds the current GPU and any additional ones in a single call.
        torch.cuda.manual_seed_all(seed)
        # Restrict cuDNN to deterministic convolution algorithms.
        torch.backends.cudnn.deterministic = True
        # Benchmark mode auto-tunes the algorithm choice and may therefore pick
        # different algorithms from one run to the next; switch it off.
        torch.backends.cudnn.benchmark = False


# Fix the seed once, up front, so the cells below run against seeded generators.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [3]:
# Batch size handed to every DataLoader in this notebook.
BATCH_SIZE = 128
# Tokenizer max_length: every sentence is truncated / padded to this many tokens.
MAXLEN = 64
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [4]:
def load_snli_data(filename):
    """Read a JSON-lines file and collect the ``'origin'`` field of each record.

    Args:
        filename: Path to a UTF-8 encoded JSON-lines file.

    Returns:
        A list with the ``'origin'`` value of every record, in file order.
    """
    origins = []
    with open(filename, 'r', encoding='utf8') as handle:
        for record in jsonlines.Reader(handle):
            origins.append(record['origin'])
    return origins


def load_sts_data(filename):
    """Load sentence pairs and their label from a ``||``-separated text file.

    Every non-blank line is split on ``"||"`` and fields 1, 2 and 3 (0-based)
    are kept; field 0 is ignored. The line terminator is removed before
    splitting, so the label does not carry a trailing newline.

    Args:
        filename: Path to a UTF-8 encoded text file, one sample per line.

    Returns:
        A list of ``(sentence_a, sentence_b, label)`` tuples of strings, in
        file order. The label is returned as text; callers convert it.

    Raises:
        IndexError: If a non-blank line has fewer than four ``||`` fields.
    """
    samples = []
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines instead of failing on the field lookup.
                continue
            fields = line.split("||")  # split once and reuse the result
            samples.append((fields[1], fields[2], fields[3]))
    return samples
    

# Enlarge the corpus, and increase batch_size accordingly while training.
# Rationale: the sentences [a0, a1, a2, ..., an] are supposed to be mutually distinct, but real corpora contain similar samples.
# E.g. a_v0 and a_v0' form a positive pair and the remaining a_v1, a_v1', a_v2, a_v2', ... act as negatives; if similar samples exist among those negatives, they introduce noise.
train_data_snli = load_snli_data('cnsd-snli-process/train.jsonl')
train_data_sts = load_sts_data('STS-B/cnsd-sts-train.txt')
train_data = train_data_snli + [_[0] for _ in train_data_sts]  # merge both datasets (only the first sentence of each STS pair is used)

dev_data = load_sts_data('STS-B/cnsd-sts-dev.txt')
test_data = load_sts_data('STS-B/cnsd-sts-test.txt')   

# Peek at one sample of each kind: a raw training sentence and a dev (sentence, sentence, label) tuple.
print(train_data[0])
print(dev_data[0])

一个女人正走在街对面吃香蕉，而一个男人正紧跟在他的公文包后面。
('一个戴着安全帽的男人在跳舞。', '一个戴着安全帽的男人在跳舞。', '5\n')


In [5]:
# Shared tokenizer used by both dataset classes below.
TOKENIZER = AutoTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
# Show which tensors the tokenizer produces and which special tokens it defines.
print(TOKENIZER.model_input_names)
print(TOKENIZER.all_special_tokens)

['input_ids', 'token_type_ids', 'attention_mask']
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [6]:
class TrainDataset(Dataset):
    """Training dataset: every item is a single sentence tokenized twice."""

    def __init__(self, data):
        # Sequence of raw sentences.
        self.data = data

    def __len__(self):
        return len(self.data)

    def text_2_id(self, text):
        """Tokenize ``text`` as a pair of identical sentences.

        The sentence is duplicated so that it goes through the encoder twice;
        the two resulting embeddings are treated as a positive pair.
        """
        duplicated = [text, text]
        return TOKENIZER(
            duplicated,
            max_length=MAXLEN,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    def __getitem__(self, index):
        sentence = self.data[index]
        return self.text_2_id(sentence)
    
    
# Shuffled loader over the unsupervised training sentences.
train_dataloader = DataLoader(TrainDataset(train_data), batch_size=BATCH_SIZE, shuffle=True)
# Sanity check: look at the shape of the first batch only.
for i in train_dataloader:
    print(i['input_ids'].shape)  # i['input_ids'].shape=[batch_size, 2, seq_len]
    break

torch.Size([128, 2, 64])


In [7]:
class TestDataset(Dataset):
    """Evaluation dataset of ``(sentence_a, sentence_b, label)`` samples."""

    def __init__(self, data):
        # Sequence of indexable samples: two sentences followed by a label.
        self.data = data

    def __len__(self):
        return len(self.data)

    def text_2_id(self, text):
        """Tokenize ``text`` into fixed-length PyTorch tensors."""
        return TOKENIZER(
            text,
            max_length=MAXLEN,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

    def __getitem__(self, index):
        sample = self.data[index]
        # Each sentence is wrapped in a one-element list before tokenizing.
        first_encoded = self.text_2_id([sample[0]])
        second_encoded = self.text_2_id([sample[1]])
        score = int(sample[2])
        return first_encoded, second_encoded, score

    
# Dev loader keeps the file order (no shuffling).
dev_dataloader = DataLoader(TestDataset(dev_data), batch_size=BATCH_SIZE)
# Sanity check: inspect the first batch only.
for i, j, k in dev_dataloader:
    print(i['input_ids'].shape)  # i['input_ids'].shape=[batch_size, 1, seq_len]
    print(j['input_ids'].shape)
    print(k)  # k.shape=[batch_size]
    break

torch.Size([128, 1, 64])
torch.Size([128, 1, 64])
tensor([5, 4, 2, 2, 2, 5, 2, 5, 3, 1, 5, 4, 0, 2, 5, 4, 3, 1, 3, 2, 1, 1, 4, 1,
        2, 5, 5, 4, 3, 5, 5, 1, 4, 1, 3, 1, 0, 4, 4, 0, 4, 5, 0, 1, 0, 3, 0, 3,
        4, 2, 3, 3, 0, 4, 2, 4, 3, 1, 0, 4, 2, 3, 1, 2, 2, 0, 3, 5, 0, 2, 4, 2,
        3, 2, 1, 1, 0, 2, 0, 0, 3, 0, 0, 3, 2, 0, 3, 1, 0, 0, 0, 0, 0, 4, 3, 1,
        2, 0, 1, 2, 0, 1, 0, 3, 5, 5, 5, 3, 2, 3, 1, 5, 5, 2, 5, 1, 1, 3, 2, 1,
        4, 3, 4, 0, 1, 0, 4, 1])


In [8]:
class SimcseModel(nn.Module):
    """Sentence encoder for unsupervised SimCSE.

    Wraps a pretrained encoder and reduces its output to one vector per
    sequence with the configured pooling strategy.
    Hyper-parameter reference: https://kexue.fm/archives/8348

    Args:
        pretrained_model: Name or path passed to ``from_pretrained``.
        pooling: One of ``'cls'``, ``'pooler'``, ``'last-avg'`` or
            ``'first-last-avg'``.
        dropout_ratio: Value written to both the attention and the hidden
            dropout probabilities of the encoder config.

    Raises:
        ValueError: If ``pooling`` is not one of the supported strategies.
    """

    SUPPORTED_POOLING = ('cls', 'pooler', 'last-avg', 'first-last-avg')

    def __init__(self, pretrained_model, pooling, dropout_ratio=0.1):
        super(SimcseModel, self).__init__()
        # Fail fast: otherwise forward() would silently return None.
        if pooling not in self.SUPPORTED_POOLING:
            raise ValueError(
                'Unsupported pooling {!r}; expected one of {}'.format(pooling, self.SUPPORTED_POOLING)
            )

        config = BertConfig.from_pretrained(pretrained_model)

        # Override the dropout settings of the pretrained config.
        config.attention_probs_dropout_prob = dropout_ratio
        config.hidden_dropout_prob = dropout_ratio

        self.bert = AutoModel.from_pretrained(pretrained_model, config=config)
        self.pooling = pooling

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode a batch and return pooled embeddings, shape [batch_size, hidden_size].

        The two averaging strategies take the mean over every sequence
        position; ``attention_mask`` is not used to exclude any position from
        that mean.
        """
        out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True,
        )

        if self.pooling == 'cls':
            # Hidden state at position 0 of the last layer.
            return out.last_hidden_state[:, 0]

        if self.pooling == 'pooler':
            return out.pooler_output

        if self.pooling == 'last-avg':
            # Mean over the sequence axis of the last layer.
            return out.last_hidden_state.mean(dim=1)

        if self.pooling == 'first-last-avg':
            # Average of the per-sequence means of hidden_states[1] and hidden_states[-1].
            first_avg = out.hidden_states[1].mean(dim=1)
            last_avg = out.hidden_states[-1].mean(dim=1)
            return (first_avg + last_avg) / 2

        # Only reachable if self.pooling was changed after construction.
        raise ValueError(
            'Unsupported pooling {!r}; expected one of {}'.format(self.pooling, self.SUPPORTED_POOLING)
        )

In [9]:
def simcse_unsup_loss(y_pred, temperature=0.05):
    """Unsupervised SimCSE loss (cross-entropy over in-batch cosine similarities).

    Rows of ``y_pred`` must be laid out in consecutive pairs: rows ``2k`` and
    ``2k + 1`` are the two embeddings of the same sentence and are each
    other's positive; every other row in the batch acts as a negative.

    Args:
        y_pred: Embeddings of shape ``[2 * batch_size, hidden_size]``.
        temperature: Divisor applied to the similarity matrix before the
            cross-entropy.

    Returns:
        A scalar tensor holding the mean loss over all rows.

    Raises:
        ValueError: If ``y_pred`` has an odd number of rows.
    """
    num_rows = y_pred.shape[0]
    if num_rows % 2 != 0:
        raise ValueError(
            'y_pred must contain an even number of rows (consecutive positive pairs), got {}'.format(num_rows)
        )

    # Build helper tensors on the same device as the embeddings instead of a global one.
    device = y_pred.device
    idx = torch.arange(num_rows, device=device)
    # Index of each row's positive, e.g. [1, 0, 3, 2, 5, 4, ...].
    y_true = (idx - idx % 2 * 2) + 1
    # sim.shape = [2 * batch_size, 2 * batch_size]
    sim = F.cosine_similarity(y_pred.unsqueeze(1), y_pred.unsqueeze(0), dim=-1)
    # Push the diagonal to a very negative value so a row never matches itself.
    sim = sim - torch.eye(num_rows, device=device) * 1e12
    sim = sim / temperature
    # cross_entropy already averages over rows, so no extra mean is needed.
    return F.cross_entropy(sim, y_true)

In [10]:
# Model to be trained: position-0 ('cls') pooling, with dropout raised to 0.3.
model = SimcseModel(pretrained_model='hfl/chinese-roberta-wwm-ext', pooling='cls', dropout_ratio=0.3)
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Model validation
def eval(model, dataloader):
    """Score ``model`` on sentence pairs and return Spearman's rank correlation.

    Both sentences of every pair are encoded, their cosine similarity is
    computed, and the similarities are correlated with the labels.

    Note: the model is switched to eval mode and left in that mode. The name
    shadows the built-in ``eval`` and is kept because other cells call it.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``
            that returns one embedding per row.
        dataloader: Iterable of ``(source, target, label)`` batches, where
            ``source`` / ``target`` map ``'input_ids'``, ``'attention_mask'``
            and ``'token_type_ids'`` to tensors with a singleton dimension 1
            (``[batch_size, 1, seq_len]`` for the loaders in this notebook).

    Returns:
        The Spearman correlation between labels and cosine similarities, or
        ``nan`` when ``dataloader`` yields no batches.
    """
    model.eval()

    # Run on the device the model lives on; parameter-free models fall back to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    def _encode(batch):
        # Drop the singleton dimension: [batch_size, 1, seq_len] -> [batch_size, seq_len].
        return model(
            batch['input_ids'].squeeze(1).to(device),
            batch['attention_mask'].squeeze(1).to(device),
            batch['token_type_ids'].squeeze(1).to(device),
        )

    # Collect per-batch results and concatenate once at the end, instead of
    # re-allocating the running tensor / array on every batch.
    sim_chunks = []
    label_chunks = []

    with torch.no_grad():
        for source, target, label in dataloader:
            source_pred = _encode(source)  # [batch_size, hidden_size]
            target_pred = _encode(target)  # [batch_size, hidden_size]
            # One similarity per pair: [batch_size]
            sim_chunks.append(F.cosine_similarity(source_pred, target_pred, dim=-1))
            if torch.is_tensor(label):
                label_chunks.append(label.cpu().numpy())
            else:
                label_chunks.append(np.asarray(label))

    if not sim_chunks:
        return float('nan')

    sim_array = torch.cat(sim_chunks, dim=0).cpu().numpy()
    label_array = np.concatenate([np.ravel(chunk) for chunk in label_chunks])
    # Spearman correlation only depends on the ranking of the values.
    return spearmanr(label_array, sim_array).correlation

In [12]:
# Model training and evaluation
def train(model, train_dl, dev_dl, optimizer, best, eval_every=10):
    """Train ``model`` for one pass over ``train_dl``, evaluating periodically.

    Every ``eval_every`` batches the current loss is printed, the model is
    scored on ``dev_dl`` and, if the score beats ``best[0]``, ``best`` is
    updated in place with the new score and a deep copy of the state dict.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        train_dl: Iterable of batches mapping ``'input_ids'``,
            ``'attention_mask'`` and ``'token_type_ids'`` to tensors whose
            first dimension is the batch and that hold two rows per sample
            (``[batch_size, 2, seq_len]`` for the loader in this notebook).
        dev_dl: Loader handed to ``eval`` for validation.
        optimizer: Optimizer that updates ``model``'s parameters.
        best: Two-element list ``[best_score, best_state_dict]``; mutated in place.
        eval_every: Number of training batches between two evaluations.
    """
    model.train()

    # Move inputs to the device the model lives on; fall back to CPU if it has no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for batch_idx, source in enumerate(train_dl, start=1):
        real_batch_num = source['input_ids'].shape[0]
        # Flatten the pair dimension: rows 2k and 2k + 1 are the two copies of sample k.
        input_ids = source['input_ids'].view(real_batch_num * 2, -1).to(device)
        attention_mask = source['attention_mask'].view(real_batch_num * 2, -1).to(device)
        token_type_ids = source['token_type_ids'].view(real_batch_num * 2, -1).to(device)

        out = model(input_ids, attention_mask, token_type_ids)
        loss = simcse_unsup_loss(out)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % eval_every == 0:
            print('| step {:5d} | loss {:8.5f} |'.format(batch_idx, loss.item()))
            corrcoef = eval(model, dev_dl)
            # eval() leaves the model in eval mode; switch back before the next step.
            model.train()
            if best[0] < corrcoef:
                # Replace the contents in place so the caller's list sees the update.
                best[:] = [corrcoef, copy.deepcopy(model.state_dict())]

In [13]:
best = [0.0, None]  # best dev-set correlation so far and the matching model state dict
# Two passes over the training data; `best` is updated in place by train().
for epoch in range(2):
    print('*' * 20 + str(epoch) + '*' * 20)
    train(model, train_dataloader, dev_dataloader, optimizer, best)

********************0********************
| step    10 | loss  1.65872 |
| step    20 | loss  0.91445 |
| step    30 | loss  0.65556 |
| step    40 | loss  0.47245 |
| step    50 | loss  0.33608 |
| step    60 | loss  0.34597 |
| step    70 | loss  0.24724 |
| step    80 | loss  0.24938 |
| step    90 | loss  0.19374 |
| step   100 | loss  0.19065 |
| step   110 | loss  0.21806 |
| step   120 | loss  0.20168 |
| step   130 | loss  0.16607 |
| step   140 | loss  0.18110 |
| step   150 | loss  0.14478 |
| step   160 | loss  0.12090 |
| step   170 | loss  0.13810 |
| step   180 | loss  0.10538 |
| step   190 | loss  0.11358 |
| step   200 | loss  0.12241 |
| step   210 | loss  0.10192 |
| step   220 | loss  0.10163 |
| step   230 | loss  0.09620 |
| step   240 | loss  0.08635 |
| step   250 | loss  0.08812 |
| step   260 | loss  0.09918 |
| step   270 | loss  0.08029 |
| step   280 | loss  0.09616 |
| step   290 | loss  0.08034 |
| step   300 | loss  0.08143 |
| step   310 | loss  0.07615

In [14]:
# Highest dev-set Spearman correlation recorded during training.
best[0]

0.7457631647557355

In [15]:
# Test loader, built the same way as the dev loader (no shuffling).
test_dataloader = DataLoader(TestDataset(test_data), batch_size=BATCH_SIZE)

In [16]:
# Rebuild the architecture and load the best weights captured during training.
# NOTE(review): dropout_ratio is left at its default here (training used 0.3);
# this should not affect the scores because eval() calls model.eval() first — confirm.
best_model = SimcseModel(pretrained_model='hfl/chinese-roberta-wwm-ext', pooling='cls')
best_model.load_state_dict(best[1])
best_model.to(DEVICE)

# Best checkpoint
dev_corrcoef_best = eval(best_model, dev_dataloader)
test_corrcoef_best = eval(best_model, test_dataloader)
print(f'dev_corrcoef: {dev_corrcoef_best:.8f}')
print(f'test_corrcoef: {test_corrcoef_best:.8f}')

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dev_corrcoef: 0.74576316
test_corrcoef: 0.69478349


In [17]:
# Score the model as it stands after the final training step.
dev_corrcoef_last = eval(model, dev_dataloader)
test_corrcoef_last = eval(model, test_dataloader)

# Without early stopping (weights from the end of training)
print(f'dev_corrcoef: {dev_corrcoef_last:.8f}')
print(f'test_corrcoef: {test_corrcoef_last:.8f}')

dev_corrcoef: 0.71067608
test_corrcoef: 0.66283171


In [18]:
# Baseline: the pretrained encoder without any SimCSE training.
model_init = SimcseModel(pretrained_model='hfl/chinese-roberta-wwm-ext', pooling='cls')
model_init.to(DEVICE)
dev_corrcoef_init = eval(model_init, dev_dataloader)
test_corrcoef_init = eval(model_init, test_dataloader)

# Untrained
print(f'dev_corrcoef: {dev_corrcoef_init:.8f}')
print(f'test_corrcoef: {test_corrcoef_init:.8f}')

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dev_corrcoef: 0.71675741
test_corrcoef: 0.68251387
