In [1]:
import pandas as pd
import torch
torch.cuda.is_available()
import json
from transformers import RobertaTokenizer,RobertaModel,RobertaConfig,AutoTokenizer,AutoModelForMaskedLM,AdamW,get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
#from datasets import load_dataset, load_metric
import copy
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

In [2]:
# kaggle = pd.read_json('./kaggle_data/arxiv-metadata-oai-snapshot.json',lines=True)
# #把有n个学科标签的文章拆成n行
# data = kaggle.join(kaggle['categories'].str.split(' ', expand=True).stack().reset_index(level=1, drop=True).rename('label'))
# #有些学科标签最前面是'.',有的是'-'。拆开取最前面的学科为一级学科
# def mapping(x):
#     if '.' in x:
#         return x.split('.')[0]
#     else:
#         return x.split('-')[0]
# data['label'] = data['label'].apply(mapping)
# data['content'] = data['title'] + ':' +data['abstract'] 
# data.to_pickle('./kaggle_data/data.pkl')


In [3]:
# data.head()

In [4]:
# data_ = data[['id','content','label']]

## 构建训练集

In [5]:
# train_data = pd.DataFrame()
# data_ = pd.read_pickle('./kaggle_data/data_.pkl')
# data_=data_.reset_index(drop=True)
# index1 = np.random.randint(0,2923911,100000)
# index2 = np.random.randint(0,2923911,100000)

In [6]:
# from collections import defaultdict
# dic = defaultdict(list)
# for id in range(100000):
#     dic['id'].append(id)
#     dic['text1'].append(data_.loc[index1[id],'content'])
#     dic['text2'].append(data_.loc[index2[id],'content'])
#     dic['label1'].append(data_.loc[index1[id],'label'])
#     dic['label2'].append(data_.loc[index2[id],'label'])
#     if dic['label1'][-1] == dic['label2'][-1]:
#         dic['label'].append(1)
#     else:
#         dic['label'].append(0)
# train_data = pd.DataFrame.from_dict(dic)
# train_data.to_pickle('./kaggle_data/train_data.pkl')

In [7]:
# from collections import Counter
# d = Counter(dic['label'])
# train_data.head()

In [8]:
# Load the pre-built sentence-pair training set. Per the commented-out
# construction cell it should hold id / text1 / text2 / label1 / label2 / label.
# CAUTION: read_pickle executes whatever the pickle contains; only load files
# that were produced locally by this notebook.
train_data = pd.read_pickle('./kaggle_data/train_data.pkl')
len(train_data)

100000

## 构建验证集

In [9]:
# val_data = pd.DataFrame()
# data_ = pd.read_pickle('./kaggle_data/data_.pkl')
# data_=data_.reset_index(drop=True)
# index1 = np.random.randint(0,2923911,20000)
# index2 = np.random.randint(0,2923911,20000)

In [10]:
# dic = defaultdict(list)
# for id in range(len(index1)):
#     dic['id'].append(id)
#     dic['text1'].append(data_.loc[index1[id],'content'])
#     dic['text2'].append(data_.loc[index2[id],'content'])
#     dic['label1'].append(data_.loc[index1[id],'label'])
#     dic['label2'].append(data_.loc[index2[id],'label'])
#     if dic['label1'][-1] == dic['label2'][-1]:
#         dic['label'].append(1)
#     else:
#         dic['label'].append(0)
# val_data = pd.DataFrame.from_dict(dic)
# val_data.to_pickle('./kaggle_data/val_data.pkl')

In [11]:
# Load the pre-built sentence-pair validation set (same layout as the
# training pickle, per the commented-out construction cell).
# CAUTION: read_pickle executes whatever the pickle contains; only load files
# that were produced locally by this notebook.
val_data = pd.read_pickle('./kaggle_data/val_data.pkl')
len(val_data)

20000

## 模型

In [12]:
# roberta-large: stand-alone tokenizer, encoder and config objects.
# NOTE(review): in the cells visible here none of `tokenizer`, `model` or
# `config` is referenced again by live code (CustomDataset and SentenceMatch
# each load their own copies), so this cell loads an extra full set of
# weights -- confirm it is still needed.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')
config = RobertaConfig.from_pretrained('roberta-large')
# NOTE(review): the flag is set on a config that is never passed to a model in
# the visible code, so it presumably has no effect on `net` -- confirm.
config.gradient_checkpointing = True

In [13]:
class CustomDataset(Dataset):
    """Sentence-pair dataset that tokenizes one (text1, text2) row per item.

    Args:
        data: DataFrame with ``text1`` and ``text2`` columns, plus a ``label``
            column when ``with_labels`` is true.
        maxlen: Length every encoded pair is padded / truncated to.
        with_labels: When true, ``__getitem__`` also returns the row's label.
        bert_model: Checkpoint name or path used to load the tokenizer.
    """

    def __init__(self, data, maxlen, with_labels=True, bert_model='roberta-large'):
        self.data = data
        # Load the tokenizer that matches the requested checkpoint rather than
        # a hard-coded name, so the argument is actually honoured.
        self.tokenizer = RobertaTokenizer.from_pretrained(bert_model)
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, token_type_ids[, label])`` for row ``index``."""
        # Samplers hand out positions 0..len-1, so look rows up by position.
        # Label-based .loc only works when the frame has a fresh RangeIndex.
        sent1 = str(self.data['text1'].iloc[index])
        sent2 = str(self.data['text2'].iloc[index])

        # With a sentence pair, truncation=True is the 'longest_first' strategy:
        # tokens are removed one at a time from whichever sequence is currently
        # longer until the pair fits in max_length.
        encoded_pair = self.tokenizer(sent1, sent2,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=self.maxlen,
                                      return_tensors='pt', return_token_type_ids=True)

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items into a batch itself.
        token_ids = encoded_pair['input_ids'].squeeze(0)
        attn_masks = encoded_pair['attention_mask'].squeeze(0)
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)

        if self.with_labels:
            label = self.data['label'].iloc[index]
            return token_ids, attn_masks, token_type_ids, label
        return token_ids, attn_masks, token_type_ids

In [14]:
class SentenceMatch(torch.nn.Module):
    """RoBERTa encoder with a single-logit head for sentence-pair matching.

    Args:
        bert_model: Checkpoint name or path of the RoBERTa encoder to load.
    """

    def __init__(self,bert_model = 'roberta-large'):
        super().__init__()
        self.bert = RobertaModel.from_pretrained(bert_model)
        # Size the head from the encoder config (1024 for roberta-large) so
        # other checkpoint sizes work without editing this class.
        self.dnn = torch.nn.Linear(self.bert.config.hidden_size, 1)
        self.dropout = torch.nn.Dropout(p=0.1)

    def forward(self,input_ids,attn_masks,token_type_ids):
        """Return ``(logit, pooled)`` for a batch of encoded pairs.

        ``logit`` is the raw, un-sigmoided output of the linear head and
        ``pooled`` is the encoder's pooled output that the head was applied to.
        """
        outputs = self.bert(input_ids,
                            attention_mask=attn_masks,
                            token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: newer transformers versions return
        # a dict-like ModelOutput, and unpacking that yields its key strings.
        # Position 1 is the pooled output for both tuple and ModelOutput returns.
        cl = outputs[1]
        prob = self.dnn(self.dropout(cl))
        return prob,cl

## 训练

In [15]:
def evaluate_loss(net, device, criterion, dataloader):
    """Return the average per-batch loss of ``net`` over ``dataloader``.

    Switches ``net`` to eval mode (and leaves it there), runs every batch
    without gradient tracking, and averages the per-batch criterion values.
    """
    net.eval()

    batch_losses = []
    with torch.no_grad():
        for seq, attn_masks, token_type_ids, labels in tqdm(dataloader):
            seq = seq.to(device)
            attn_masks = attn_masks.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            # The model also returns its pooled embedding; only the logits
            # are needed to score the batch.
            logits, _ = net(seq, attn_masks, token_type_ids)
            batch_loss = criterion(logits.squeeze(-1), labels.float())
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)

In [16]:
def train(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate):
    """Fine-tune ``net`` with mixed precision and gradient accumulation.

    After every epoch the validation loss is computed with ``evaluate_loss``;
    the weights of the best epoch are written to
    ``models/<bert_model>_lr_<lr>_val_loss_<loss>_ep_<epoch>.pt``.

    Args:
        net: Model returning ``(logits, pooled)`` for a batch.
        criterion: Loss taking ``(logits, float_labels)``.
        opti: Optimizer over ``net``'s parameters.
        lr: Learning rate; only used to name the checkpoint file.
        lr_scheduler: Scheduler stepped once per optimizer step.
        train_loader: DataLoader yielding ``(ids, mask, type_ids, labels)``.
        val_loader: DataLoader with the same layout, used for validation.
        epochs: Number of passes over ``train_loader``.
        iters_to_accumulate: Batches whose gradients are summed per optimizer step.

    Note:
        The checkpoint name reads the notebook-level global ``bert_model``.
    """
    import os  # local import so this cell does not depend on editing the import cell

    # Train on whatever device the model already lives on instead of relying
    # on a notebook-level `device` global being defined.
    device = next(net.parameters()).device

    best_loss = float("inf")  # np.Inf no longer exists in NumPy 2
    best_ep = 1
    best_state = None
    nb_iterations = len(train_loader)
    # Report the running loss about five times per epoch; never let the
    # interval reach 0, which would make the modulo below divide by zero.
    print_every = max(1, nb_iterations // 5)

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        # Drop any gradients left over from a partial accumulation window at
        # the end of the previous epoch.
        opti.zero_grad()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):

            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

            with autocast():
                logits, _ = net(seq, attn_masks, token_type_ids)
                loss = criterion(logits.squeeze(-1), labels.float())

            # Divide only what is back-propagated, so the summed gradient is
            # the mean over the accumulation window.
            scaler.scale(loss / iters_to_accumulate).backward()

            # Gradient accumulation: step once every `iters_to_accumulate` batches.
            if (it + 1) % iters_to_accumulate == 0:
                # scaler.step calls opti.step() unless the gradients contain inf/NaN.
                scaler.step(opti)
                scaler.update()
                lr_scheduler.step()
                opti.zero_grad()

            # Log the undivided loss so it is comparable with the validation loss.
            running_loss += loss.item()

            if (it + 1) % print_every == 0:
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                        .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0

        val_loss = evaluate_loss(net, device, criterion, val_loader)
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            # Snapshot only the weights, on the CPU, instead of deep-copying
            # the whole module next to the live one on the GPU.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in net.state_dict().items()}
            best_loss = val_loss
            best_ep = ep + 1

    if best_state is None:
        # Happens with epochs == 0 or when every validation loss was NaN.
        print("No checkpoint saved: validation loss never improved.")
    else:
        path_to_model='models/{}_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
        # Create the target directory so a long run cannot fail at the save.
        os.makedirs(os.path.dirname(path_to_model), exist_ok=True)
        torch.save(best_state, path_to_model)
        print("The model has been saved in {}".format(path_to_model))

    torch.cuda.empty_cache()

In [17]:
# Checkpoint name; also embedded in the checkpoint filename written by train().
bert_model = "roberta-large" 
# max_length given to the tokenizer: every pair is padded/truncated to this many tokens.
maxlen = 512
# Samples per DataLoader batch.
batch_size = 4
# train() steps the optimizer once per this many batches, so the effective
# batch size is batch_size * iters_to_accumulate.
iters_to_accumulate = 2
# Learning rate given to AdamW.
lr = 2e-5
# Number of passes over the training set.
epochs = 3

In [18]:
print("Reading training data...")
# Pass bert_model by keyword: the third positional parameter of CustomDataset
# is `with_labels`, so a positional checkpoint name would land in that slot.
train_set = CustomDataset(train_data, maxlen, with_labels=True, bert_model=bert_model)
print("Reading validation data...")
val_set = CustomDataset(val_data, maxlen, with_labels=True, bert_model=bert_model)
# Reshuffle the training pairs every epoch; validation order does not matter.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=5)
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=5)

Reading training data...
Reading validation data...


In [19]:
# Use the first GPU when one is present and fall back to the CPU otherwise,
# so the notebook still runs on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = SentenceMatch(bert_model)

In [20]:
# Move the model to the training device before building the optimizer.
net.to(device)
# BCEWithLogitsLoss applies the sigmoid itself, so it consumes the raw logit
# returned by SentenceMatch.
criterion = torch.nn.BCEWithLogitsLoss()
opti = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
# No warm-up: the linear schedule starts decaying from `lr` immediately.
num_warmup_steps = 0 
# NOTE(review): num_training_steps counts batches and is not used below; the
# scheduler is given t_total instead.
num_training_steps = epochs * len(train_loader)  
# Number of optimizer steps: train() steps once per `iters_to_accumulate` batches.
t_total = (len(train_loader) // iters_to_accumulate) * epochs  
lr_scheduler = get_linear_schedule_with_warmup(optimizer=opti, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

# train(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

In [None]:
# Run fine-tuning with the model, loss, optimizer, scheduler and loaders built above.
train(net, criterion, opti, lr, lr_scheduler, train_loader, val_loader, epochs, iters_to_accumulate)

In [71]:
# Pickle the whole in-memory module (not just a state_dict) to disk.
# NOTE(review): this cell's execution count (71) is out of sequence with its
# neighbours, and `net` holds the weights from the end of training, whereas
# train() separately saves the best-validation weights under models/ --
# confirm which file downstream code is expected to load.
torch.save(net,'./model/match/robert_11.11')

## 输出embedding

In [22]:
# encoded_pair = tokenizer('Sparsity-certifying Graph Decompositions','Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',return_token_type_ids=True,padding='max_length', 
#                                       truncation=True,  
#                                       max_length=128,  
#                                       return_tensors='pt')
# token_ids = encoded_pair['input_ids'].squeeze(0).cuda()  
# attn_masks = encoded_pair['attention_mask'].squeeze(0).cuda()  
# token_type_ids = encoded_pair['token_type_ids'].squeeze(0).cuda()
# #第一维不是batch大小就不对，就很奇怪，我实例化网络的时候又没有这么要求
# s = net(token_ids,attn_masks,token_type_ids)  

In [23]:
# print(net)

In [24]:
# Wrap the corpus in a Dataset so embeddings can be computed in batches.
class EmbeddingDataset(Dataset):
    """Single-text dataset that tokenizes the ``content`` column row by row.

    Args:
        data: DataFrame with a ``content`` column.
        maxlen: Length every encoded text is padded / truncated to.
        with_labels: Accepted for signature parity with CustomDataset and
            stored, but never consulted: items never include a label.
        bert_model: Checkpoint name or path used to load the tokenizer.
    """

    def __init__(self, data, maxlen, with_labels=False, bert_model='roberta-large'):
        self.data = data
        # Load the tokenizer that matches the requested checkpoint rather than
        # a hard-coded name, so the argument is actually honoured.
        self.tokenizer = RobertaTokenizer.from_pretrained(bert_model)
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, token_type_ids)`` for row ``index``."""
        # Samplers hand out positions 0..len-1, so look rows up by position.
        # Label-based .loc only works when the frame has a fresh RangeIndex.
        sent1 = str(self.data['content'].iloc[index])

        encoded_pair = self.tokenizer(sent1,
                                      padding='max_length',
                                      truncation=True,
                                      max_length=self.maxlen,
                                      return_tensors='pt', return_token_type_ids=True)

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack items into a batch itself.
        token_ids = encoded_pair['input_ids'].squeeze(0)
        attn_masks = encoded_pair['attention_mask'].squeeze(0)
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)

        # No `with_labels` branch on purpose: callers in this notebook pass the
        # checkpoint name as the third positional argument, which binds to
        # `with_labels` as a truthy string, so a label lookup here would fail
        # on data that has no `label` column.
        return token_ids, attn_masks, token_type_ids

In [25]:
# Load the raw arXiv metadata snapshot (one JSON object per line).
# Pin `id` to str: with dtype inference left on, pandas may parse ids such as
# "0704.0001" as numbers and lose the leading zero.
data = pd.read_json('./kaggle_data/arxiv-metadata-oai-snapshot.json', lines=True, dtype={'id': str})
# Text to embed: title and abstract joined by a colon.
data['content'] = data['title'] + ':' + data['abstract']
test_data = data[['id', 'content']]

In [26]:
embeddings = []
# Pass bert_model by keyword: the third positional parameter of
# EmbeddingDataset is `with_labels`, not the checkpoint name.
test_set = EmbeddingDataset(test_data, maxlen, bert_model=bert_model)
# DataLoader does not shuffle unless asked to, so the batches (and therefore
# the embeddings) follow the row order of test_data.
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=5)

# Inference only: switch off dropout and skip building the autograd graph.
net.eval()
with torch.no_grad():
    # tqdm already reports progress, so no manual counter is printed.
    for seq, attn_masks, token_type_ids in tqdm(test_loader):
        seq, attn_masks, token_type_ids = \
            seq.to(device), attn_masks.to(device), token_type_ids.to(device)
        _, cl = net(seq, attn_masks, token_type_ids)
        # One (batch, hidden) array per batch; the last one may be shorter.
        embeddings.append(cl.cpu().numpy())

  0%|          | 3/446570 [00:00<63:27:49,  1.95it/s]0
  0%|          | 1575/446570 [00:58<4:37:35, 26.72it/s]


KeyboardInterrupt: 

In [27]:
# `embeddings` is a list of per-batch (batch, hidden) arrays and the last
# batch can be smaller, so join them along the batch axis into a single
# (n_documents, hidden) matrix before saving.
embedding_matrix = np.concatenate(embeddings, axis=0) if embeddings else np.empty((0,))
np.save('./kaggle_data/embeddings_with_class.npy', embedding_matrix)