In [1]:
import pandas as pd
import numpy as np
import gc
from tqdm.auto import tqdm
import pickle5
from copy import deepcopy

In [2]:
# basic random seed
import os 
import random
import numpy as np 

def seedBasic(seed=42):
    """Seed the interpreter-level random sources.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed shared by all three sources.
    """
    # The three calls are independent of each other.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    
# tensorflow random seed 
import tensorflow as tf 
def seedTF(seed=42):
    """Seed TensorFlow's global random generator.

    Args:
        seed: Integer seed forwarded to ``tf.random.set_seed``.
    """
    tf.random.set_seed(seed)
    
# torch random seed
import torch
def seedTorch(seed=42):
    """Seed PyTorch and switch cuDNN to its deterministic configuration.

    Args:
        seed: Integer seed used for both the CPU and the CUDA generator.
    """
    # cuDNN flags: disable the auto-tuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # Generator seeds.
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)
      
# basic + tensorflow + torch 
def seedEverything(seed=42):
    """Apply the same seed to Python/NumPy, TensorFlow and PyTorch.

    Args:
        seed: Integer seed passed to ``seedBasic``, ``seedTF`` and
            ``seedTorch``, in that order.
    """
    for seeder in (seedBasic, seedTF, seedTorch):
        seeder(seed)

In [3]:
from nltk import ngrams, word_tokenize, sent_tokenize
from nltk.stem.porter import *
from bs4 import BeautifulSoup
import re
import spacy

# Blank English spaCy pipeline created with `spacy.blank` (no packaged model is loaded here).
nlp = spacy.blank("en")

def clean_text(tmp):
    """Strip HTML markup from ``tmp`` and normalise some whitespace.

    Args:
        tmp: Raw (possibly HTML) text.

    Returns:
        The visible text with tags removed (text nodes joined by a single
        space), blank-line runs collapsed to one newline, tab-delimited
        whitespace runs collapsed to one space, and non-breaking spaces
        replaced by regular spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between environments.
    soup = BeautifulSoup(tmp, 'html.parser')
    text = soup.get_text(separator=" ").strip()
    text = re.sub(r'\n\s*\n', '\n', text)
    text = re.sub(r'\t\s*\t', ' ', text)
    text = text.replace('\xa0', ' ')
    return text

In [4]:
# Paths and model identifiers used by the rest of the notebook.
# (Names assigned at cell level are already module globals, so no `global`
# statement is needed.)
input_dir = './input_data/'
output_dir = './output_models_v6/'

model_name = 'roberta-large'
model_type = 'roberta_v6'

# NOTE(security): unpickling can execute arbitrary code -- only load these
# files if they come from a trusted source.
with open(input_dir + 'data_criteria_v6.pk5', 'rb') as fh:
    criteria = pickle5.load(fh)
with open(input_dir + 'data_description_v6.pk5', 'rb') as fh:
    description = pickle5.load(fh)

import pickle
with open('./evaluation/test_label.pkl', 'rb') as fh:
    test_label = pickle.load(fh)
test_project = list(test_label.keys())

# Drop every project that belongs to the evaluation set. A set gives O(1)
# membership checks instead of scanning the list once per project.
_test_project_set = set(test_project)
criteria = {k: x for k, x in criteria.items() if k not in _test_project_set}
description = {k: x for k, x in description.items() if k not in _test_project_set}

# Sorted, de-duplicated ids of all remaining projects (from either section).
data_pid = np.unique(list(criteria.keys()) + list(description.keys()))

In [5]:
# First two columns of the entity file: column 0 is used as the lookup key
# (matched against the keys of `criteria` / `description`), column 1 as the
# title text.
title_data = pd.read_csv(input_dir + 'project_entity.csv', lineterminator='\n', usecols = [0,1], header = None)

# key -> title, with missing titles (NaN / None) mapped to the empty string.
title = {
    p: ('' if pd.isna(t) else t)
    for p, t in zip(title_data.iloc[:, 0], title_data.iloc[:, 1])
}


def _append_title(samples, titles):
    """Append each project's title as one extra trailing word, in place.

    Args:
        samples: dict mapping project id -> [words, marks]; updated in place.
        titles: dict mapping project id -> title string.

    Entries whose word list is empty are reset to ``[[''], []]``. Projects
    without a title entry get ``''`` appended instead of raising ``KeyError``.
    """
    for pid, sample in samples.items():
        if len(sample[0]) > 0:
            samples[pid] = [sample[0] + [titles.get(pid, '')], sample[1]]
        else:
            samples[pid] = [[''], []]


# NOTE: this step is not idempotent -- re-running the cell appends the title
# a second time. Reload `criteria` / `description` before re-running.
_append_title(criteria, title)
_append_title(description, title)

In [6]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from adamp import SGDP,AdamP

# Tokenizer for `model_name`. `add_prefix_space=True` is set because the
# dataset feeds pre-split word lists (`is_split_into_words=True`).
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space = True)
# Use the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
class RelationData(Dataset):
    """Dataset yielding token-level span start/end labels per project.

    Every item combines the "criteria" and the "description" sample of one
    project id. A sample is ``[words, marks]``: ``words`` is a list of word
    strings and each mark carries its start word index at position 1, its
    end word index at position 2 and a score in its last position.
    """

    def __init__(self, select_pid):
        # Placeholder used when a project has no entry for a section.
        self.null = [[''], []]
        self.select_pid = select_pid

    def __len__(self):
        return len(self.select_pid)

    def mark_text(self, x):
        """Tokenize one sample and derive per-token start/end labels.

        Each marked span is wrapped in mask tokens before tokenization so
        its boundaries can be located among the sub-word ids; the mask
        tokens are removed again before returning.

        Args:
            x: ``[words, marks]`` as described on the class.

        Returns:
            ``(inputs, start_label, end_label)``: ``inputs`` is a dict of
            1-D tokenizer tensors without the boundary tokens; the labels
            are 1-D float tensors of the same length, set to 1.0 on the
            first / last token of every span whose score exceeds 0.9.
        """
        words, spans = x[0], x[1]
        span_starts = np.array([span[1] for span in spans])
        span_ends = np.array([span[2] for span in spans])
        span_scores = np.array([span[-1] for span in spans])

        # Process the spans in order of their start index.
        order = np.argsort(span_starts)
        span_ends = span_ends[order]
        span_scores = span_scores[order]
        span_starts = np.sort(span_starts)

        # Rebuild the word list with a mask token on both sides of each span.
        marked_words = []
        cursor = 0
        for begin, finish in zip(span_starts, span_ends):
            marked_words += words[cursor:begin] + [tokenizer.mask_token]
            marked_words += words[begin:finish] + [tokenizer.mask_token]
            cursor = finish
        marked_words += words[cursor:]

        encoded = tokenizer(marked_words,
                            padding = True,
                            is_split_into_words = True,
                            truncation = True,
                            return_tensors = 'pt')

        seq_len = len(encoded['input_ids'][0])
        start_label = torch.zeros((1, seq_len))
        end_label = torch.zeros((1, seq_len))

        # Column indices of every boundary token in the encoded sequence.
        boundary_pos = np.where(encoded['input_ids'].numpy() == tokenizer.mask_token_id)[1]

        # An odd count means a closing boundary is missing (e.g. cut off by
        # truncation); the unmatched opening boundary is ignored.
        if len(boundary_pos) % 2 == 1:
            boundary_pos = boundary_pos[:-1]
        for k, (open_pos, close_pos) in enumerate(zip(boundary_pos[0::2], boundary_pos[1::2])):
            is_positive = float(span_scores[k] > 0.9)
            end_label[:, (close_pos - 1)] = is_positive
            start_label[:, (open_pos + 1)] = is_positive

        # Remove the boundary tokens from labels and inputs alike.
        keep = encoded['input_ids'] != tokenizer.mask_token_id
        start_label = start_label[keep]
        end_label = end_label[keep]
        encoded = {name: tensor[keep] for name, tensor in encoded.items()}

        return encoded, start_label, end_label

    def __getitem__(self, idx):
        pid = self.select_pid[idx]

        sample_c = criteria.get(pid)
        if sample_c is None:
            sample_c = self.null
        sample_d = description.get(pid)
        if sample_d is None:
            sample_d = self.null

        c_inputs, c_st, c_ed = self.mark_text(sample_c)
        d_inputs, d_st, d_ed = self.mark_text(sample_d)

        return [c_inputs, c_st, c_ed], [d_inputs, d_st, d_ed]
    
def padding(batch, pad_token_id=None):
    """Right-pad a list of variable-length samples into batch tensors.

    Args:
        batch: list of ``[inputs, start_label, end_label]`` triples, where
            ``inputs`` is a dict holding 1-D ``input_ids`` and
            ``attention_mask`` tensors and the labels are 1-D tensors.
            All tensors of one sample are expected to share the length of
            its ``end_label`` (that length drives the padding).
        pad_token_id: id used to pad ``input_ids``. Defaults to the
            notebook-level ``tokenizer.pad_token_id``.

    Returns:
        ``(inputs, start_labels, end_labels)``: ``inputs`` is a dict with
        ``input_ids`` and ``attention_mask`` of shape (batch, max_len); the
        labels have the same shape. Every output keeps the dtype of the
        per-sample tensors it was built from.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    lengths = [len(x[-1]) for x in batch]
    max_len = max(lengths)

    def _pad_and_stack(seqs, fill):
        # `new_full` creates the filler with the same dtype/device as the
        # sequence, so `torch.cat` never has to promote dtypes.
        return torch.stack([
            torch.cat([seq, seq.new_full((max_len - n,), fill)], 0)
            for seq, n in zip(seqs, lengths)
        ], 0)

    batch_input_ids = _pad_and_stack([x[0]['input_ids'] for x in batch], pad_token_id)
    batch_attention_mask = _pad_and_stack([x[0]['attention_mask'] for x in batch], 0)
    batch_label_st = _pad_and_stack([x[1] for x in batch], 0)
    batch_label_ed = _pad_and_stack([x[2] for x in batch], 0)
    return {'input_ids': batch_input_ids, 'attention_mask': batch_attention_mask}, batch_label_st, batch_label_ed

def collate_function(samples):
    """Collate dataset items into padded criteria and description batches.

    Args:
        samples: list of ``(criteria_item, description_item)`` pairs as
            produced by ``RelationData.__getitem__``.

    Returns:
        Two lists ``[inputs, start_labels, end_labels]`` -- the first for
        the criteria section, the second for the description section --
        each padded independently by ``padding``.
    """
    criteria_batch = padding([pair[0] for pair in samples])
    description_batch = padding([pair[1] for pair in samples])
    return list(criteria_batch), list(description_batch)

In [8]:
class relation_model(nn.Module):
    """Token-level span start/end scorer over two text sections.

    Both sections are encoded by the same pretrained language model. Their
    hidden states are concatenated along the sequence axis, projected to
    ``dims`` features, tagged with a learned per-section embedding, passed
    through two transformer encoder layers and mapped to two sigmoid
    outputs per token (index 0: span start, index 1: span end).

    Args:
        dims: Feature width of the layers above the language model.
        drop_rate: Dropout rate of the two transformer encoder layers.
    """

    def __init__(self, dims = 512, drop_rate = 0):
        super().__init__()
        # Backbone, loaded with both of its dropout probabilities set to 0.
        self.LM = AutoModel.from_pretrained(
            model_name,
            attention_probs_dropout_prob = 0,
            hidden_dropout_prob = 0,
            output_hidden_states = True,
        )

        self.dims = dims
        # One embedding row per section (0 = first input, 1 = second input).
        self.section_embedding = nn.Embedding(2, dims)
        self.section_layer = nn.Linear(self.LM.config.hidden_size, dims, bias = False)
        self.summary_layer = nn.Sequential(nn.Mish(), nn.LayerNorm(dims))

        self.summary_encoder = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model = dims, nhead = 8,
                                       dim_feedforward = dims*4,
                                       dropout = drop_rate,
                                       batch_first = True)
            for _ in range(2)
        ])

        self.output_layer = nn.Sequential(nn.Linear(dims, 2), nn.Sigmoid())

    def forward(self, c_in, d_in):
        """Score every token of both sections.

        Args:
            c_in: tokenizer inputs (incl. ``attention_mask``) of section 0.
            d_in: tokenizer inputs (incl. ``attention_mask``) of section 1.

        Returns:
            Tensor of shape (batch, len_c + len_d, 2) with start / end
            probabilities for each token of the concatenated sequence.
        """
        hidden_c = self.LM(**c_in).last_hidden_state
        hidden_d = self.LM(**d_in).last_hidden_state
        len_c, len_d = hidden_c.shape[1], hidden_d.shape[1]
        hidden = torch.cat([hidden_c, hidden_d], 1)

        # Section id of every position in the concatenated sequence.
        section = torch.LongTensor([0]*len_c + [1]*len_d).to(hidden.device)
        projected = self.section_layer(hidden) + self.section_embedding(section)
        hidden = self.summary_layer(projected)

        # True marks padded positions that attention must ignore.
        attention = torch.cat([c_in['attention_mask'], d_in['attention_mask']], 1)
        pad_mask = (1 - attention).bool()
        for layer in self.summary_encoder:
            hidden = layer(hidden, src_key_padding_mask = pad_mask)

        return self.output_layer(hidden)

In [9]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

def train_one_epoch(alpha = 1):
    """Run one optimisation pass over ``tr_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``tr_loader`` and
    ``device``.

    Args:
        alpha: Weight applied to positive tokens in the loss (negative
            tokens have weight 1).

    Returns:
        Mean of the per-batch training losses.
    """
    model.train()
    running = []
    for i, (c_inputs, d_inputs) in enumerate(tqdm(tr_loader)):
        optimizer.zero_grad()

        c_in = {k:v.to(device) for k,v in c_inputs[0].items()}
        c_st, c_ed = c_inputs[1].to(device), c_inputs[2].to(device)
        d_in = {k:v.to(device) for k,v in d_inputs[0].items()}
        d_st, d_ed = d_inputs[1].to(device), d_inputs[2].to(device)
        st_label = torch.cat([c_st, d_st], 1)
        ed_label = torch.cat([c_ed, d_ed], 1)
        mask = torch.cat([c_in['attention_mask'], d_in['attention_mask']], 1)

        probs = model(c_in, d_in)
        # F.binary_cross_entropy clamps its log terms, so a saturated
        # probability (exactly 0 or 1) can no longer produce 0 * -inf = NaN
        # as the hand-written `label * p.log()` form did.
        loss1 = F.binary_cross_entropy(probs[:, :, 0], st_label.to(probs.dtype), reduction = 'none')
        loss2 = F.binary_cross_entropy(probs[:, :, 1], ed_label.to(probs.dtype), reduction = 'none')
        # Per-token weights: alpha for positives, 1 for negatives.
        w1 = alpha*st_label + (1 - st_label)
        w2 = alpha*ed_label + (1 - ed_label)
        # Weighted mean over non-padded tokens only.
        loss1 = (loss1*mask*w1).sum()/(mask*w1).sum()
        loss2 = (loss2*mask*w2).sum()/(mask*w2).sum()
        loss = 0.5*loss1 + 0.5*loss2

        loss.backward()
        optimizer.step()
        running.append(loss.item())

        # Print the running mean loss every 1000 batches.
        if i > 0 and i % 1000 == 0:
            print(np.mean(running))

    loss_tr = np.mean(running)
    return loss_tr

def thres_search(y, p):
    """Pick the decision threshold that maximises F1.

    Args:
        y: Binary ground-truth labels.
        p: Predicted scores, aligned with ``y``.

    Returns:
        ``(best_thres, [auc, f1, precision, recall])``. The threshold is
        the first candidate from ``np.arange(0.1, 0.9, 0.02)`` reaching the
        highest F1; F1, precision and recall are evaluated at that
        threshold and the AUC on the raw scores.
    """
    auc = roc_auc_score(y, p)

    thres_range = np.arange(0.1, 0.9, 0.02)
    f1_per_thres = np.array([f1_score(y, p > cut) for cut in thres_range])
    best_thres = thres_range[f1_per_thres.argmax(-1)]

    f1 = f1_score(y, p > best_thres)
    precision = precision_score(y, p > best_thres)
    recall = recall_score(y, p > best_thres)
    return best_thres, [auc, f1, precision, recall]

def validation(alpha = 1):
    """Evaluate the model on ``val_loader``.

    Relies on the notebook-level ``model``, ``val_loader`` and ``device``.

    Args:
        alpha: Weight applied to positive tokens in the loss (negative
            tokens have weight 1).

    Returns:
        ``(loss_val, [st_metrics, ed_metrics], [st_thres, ed_thres])``:
        the mean per-batch loss, the ``[auc, f1, precision, recall]`` lists
        for start and end predictions, and the thresholds chosen by
        ``thres_search`` for each.
    """
    model.eval()
    probs_val = []
    loss_val = []
    st_label_val = []
    ed_label_val = []
    # No gradients are needed anywhere in evaluation.
    with torch.no_grad():
        for i, (c_inputs, d_inputs) in enumerate(tqdm(val_loader)):

            c_in = {k:v.to(device) for k,v in c_inputs[0].items()}
            c_st, c_ed = c_inputs[1].to(device), c_inputs[2].to(device)
            d_in = {k:v.to(device) for k,v in d_inputs[0].items()}
            d_st, d_ed = d_inputs[1].to(device), d_inputs[2].to(device)
            st_label = torch.cat([c_st, d_st], 1)
            ed_label = torch.cat([c_ed, d_ed], 1)
            mask = torch.cat([c_in['attention_mask'], d_in['attention_mask']], 1)

            probs = model(c_in, d_in)

            # Clamped BCE: avoids 0 * -inf = NaN when a probability saturates.
            loss1 = F.binary_cross_entropy(probs[:, :, 0], st_label.to(probs.dtype), reduction = 'none')
            loss2 = F.binary_cross_entropy(probs[:, :, 1], ed_label.to(probs.dtype), reduction = 'none')
            w1 = alpha*st_label + (1 - st_label)
            w2 = alpha*ed_label + (1 - ed_label)
            loss1 = (loss1*mask*w1).sum()/(mask*w1).sum()
            loss2 = (loss2*mask*w2).sum()/(mask*w2).sum()
            loss = 0.5*loss1 + 0.5*loss2
            # Store a plain float so np.mean below works on numbers.
            loss_val.append(loss.item())

            # Keep only non-padded tokens for the metrics.
            probs_val.append(probs[mask == 1].cpu())
            st_label_val.append(st_label[mask == 1].cpu())
            ed_label_val.append(ed_label[mask == 1].cpu())

    # Hand NumPy arrays (not tensors) to the sklearn-based metric code.
    probs_val = torch.cat(probs_val,0).numpy()
    st_label_val = torch.cat(st_label_val,0).numpy()
    ed_label_val = torch.cat(ed_label_val,0).numpy()
    loss_val = np.mean(loss_val)

    st_thres, st_metrics = thres_search(st_label_val, probs_val[:, 0])
    ed_thres, ed_metrics = thres_search(ed_label_val, probs_val[:, 1])
    return loss_val, [st_metrics, ed_metrics], [st_thres, ed_thres]

In [None]:
# Fold index; it selects the id files to load and names the output files.
f = 2
val_pid = np.load(output_dir + f'val_pid_{f}.npy')
tr_pid = np.load(output_dir + f'tr_pid_{f}.npy')

# Fixed: this previously referenced `tr_id`, which is not defined anywhere
# in this notebook.
print(f'Training size: {len(tr_pid)}, validation size: {len(val_pid)}')

tr_data = RelationData(tr_pid)
val_data = RelationData(val_pid)
tr_loader = DataLoader(tr_data, batch_size = 2, shuffle = True, collate_fn = collate_function)
val_loader = DataLoader(val_data, batch_size = 8, shuffle = False, collate_fn = collate_function)


# Seed before building the model so its new layers initialise reproducibly.
seedEverything(616)

model = relation_model().to(device)
# Two parameter groups: the newly added layers train with a 10x larger
# learning rate than the pretrained language model (names containing 'LM').
optimizer = AdamP([{'params': [w for name, w in model.named_parameters() if ('LM' not in name)], 
                    'lr': 5e-5},
                   {'params': [w for name, w in model.named_parameters() if ('LM' in name)], 
                    'lr': 5e-6}],
                  betas=(0.9, 0.999), weight_decay=1e-1)

tolerance = 0
best_f1 = 0
metrics_log = {}
for i in range(5):
    loss_tr = train_one_epoch()
    loss_val, metrics, thres = validation()
    # Model selection uses the mean of the start and end F1 scores.
    f1 = np.mean([x[1] for x in metrics])
    print(loss_tr, loss_val, np.array(metrics).mean(0))
    metrics_log[i] = [loss_tr, loss_val, np.array(metrics).mean(0).tolist(), thres]

    # The handle must not be called `f`: that would overwrite the fold
    # index and corrupt the checkpoint / log file names built from it.
    with open(output_dir + f'training_log_{f}.pk5', 'wb') as log_file:
        pickle5.dump(metrics_log, log_file)

    if best_f1 < f1:
        best_f1 = deepcopy(f1)
        best_metrics = deepcopy(metrics)
        # NOTE: this keeps a second full copy of the model in memory.
        best_model = deepcopy(model)
        torch.save(best_model.state_dict(), output_dir + f'{model_type}_{f}.pt')
        tolerance = 0
    else:
        tolerance += 1

    # Stop early after three consecutive epochs without an F1 improvement.
    if tolerance > 2:
        break

Training size: 68046, validation size: 9721


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/34023 [00:00<?, ?it/s]

0.028544612341680445
