In [1]:
import torch
import torch.nn as nn
from os.path import exists
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from collections import Counter

In [2]:
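# Data format: (sentence with special tokens inserted, label)
# Data processing:
# Use the pointer file to add marker tokens to each sentence from the .sent file.
# After model inference, score every class with a sigmoid. The ground truth comes from the pointer file.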

In [3]:
GPU_NUM = 1  # index of the CUDA device to use when a GPU is available

# Use the requested GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{GPU_NUM}')
    # set_device only accepts CUDA devices, so it has to stay inside this
    # branch: calling it with the CPU fallback raises an error.
    torch.cuda.set_device(device)
else:
    device = torch.device('cpu')
# To force CPU execution, assign device = torch.device('cpu') here instead.

print("Current Device is: ", device)

Current Device is:  cuda:1


In [4]:
# Root directory of the FinRED files. File names are appended to this string
# directly, so the trailing slash is required.
PATH_FOLDER = "./dataset/relation_extraction/FinRED/"

In [5]:
# Build the relation-name -> class-index table from relations.txt (one name per line).
with open(PATH_FOLDER + 'relations.txt') as relationfile:
    relation_text = relationfile.read()

# Normalise the names: spaces become underscores, and the two names containing
# a slash are rewritten to the spellings used as dictionary keys.
relation_names = (
    relation_text
    .replace(' ', '_')
    .replace('product/material_produced', 'product_or_material_produced')
    .replace('director/manager', 'director_/_manager')
    .split('\n')
)
# NOTE(review): the text is split without stripping, so a trailing newline in the
# file yields an empty-string key (the recorded output shows '': 29). It is kept
# on purpose here because len(RELATION) sets the width of the labels and of the
# classifier head.
RELATION = dict(zip(relation_names, range(len(relation_names))))

# Entity marker tokens; the original note gives their vocabulary indexes as 50265~50268.
SPECIAL_TOKENS = ['<Sub>', '</Sub>', '<Obj>', '</Obj>']

In [6]:
# Display the relation-name -> class-index mapping.
RELATION

{'product_or_material_produced': 0,
 'manufacturer': 1,
 'distributed_by': 2,
 'industry': 3,
 'position_held': 4,
 'original_broadcaster': 5,
 'owned_by': 6,
 'founded_by': 7,
 'distribution_format': 8,
 'headquarters_location': 9,
 'stock_exchange': 10,
 'currency': 11,
 'parent_organization': 12,
 'chief_executive_officer': 13,
 'director_/_manager': 14,
 'owner_of': 15,
 'operator': 16,
 'member_of': 17,
 'employer': 18,
 'chairperson': 19,
 'platform': 20,
 'subsidiary': 21,
 'legal_form': 22,
 'publisher': 23,
 'developer': 24,
 'brand': 25,
 'business_division': 26,
 'location_of_formation': 27,
 'creator': 28,
 '': 29}

In [7]:
class FinRED(Dataset):
    """FinRED split exposed as (marked sentence, multi-hot label) pairs.

    Each line of ``<split>.sent`` is a space-separated sentence and the line with
    the same number in ``<split>.pointer`` holds ``|``-separated entries of five
    space-separated fields: four word indexes followed by a relation name. The
    first index pair is wrapped in ``<Sub>``/``</Sub>`` and the second pair in
    ``<Obj>``/``</Obj>``. Entries of one sentence that share the same four
    indexes are merged into a single sample whose label has a 1 for every
    relation listed for that pair.

    Args:
        train: when ``split`` is not given, a truthy value loads ``train`` and a
            falsy value loads ``test``.
        split: optional explicit file stem (for example ``'dev'``); overrides
            ``train`` when provided.
        max_chars: sentences longer than this many characters are dropped
            together with their pointer line.

    Raises:
        ValueError: if the sentence and pointer files have different line counts.
        KeyError: if a pointer entry names a relation missing from ``RELATION``.
    """

    def __init__(self, train = True, split = None, max_chars = 1000):
        self.train = train
        self.data = []
        setname = split if split is not None else ('train' if self.train else 'test')

        with open(PATH_FOLDER + setname + '.pointer') as pointer_file:
            pointer = pointer_file.read().strip().split('\n')
        with open(PATH_FOLDER + setname + '.sent') as sent_file:
            sent = sent_file.read().strip().split('\n')

        # The two files are aligned by line number; refuse to pair them up when
        # they disagree instead of silently truncating to the shorter one.
        if len(sent) != len(pointer):
            raise ValueError(
                f"{setname}.sent has {len(sent)} lines but {setname}.pointer has {len(pointer)}"
            )

        print(len(sent))
        # Filter sentence and pointer as a pair so they can never get out of step.
        kept = [(s, p) for s, p in zip(sent, pointer) if len(s) <= max_chars]
        print(len(kept))

        for sentence, pointer_line in kept:
            for span_key, categories in self._group_relations(pointer_line).items():
                input_string = self._mark_entities(sentence, span_key, SPECIAL_TOKENS)
                label = [RELATION[name] for name in categories]  # class indexes
                one_hot = torch.zeros(len(RELATION))
                one_hot[label] = 1
                self.data.append((input_string, one_hot))

    @staticmethod
    def _group_relations(pointer_line):
        """Map each 'i j k l' index key of a pointer line to its relation names."""
        grouped = {}
        for entry in pointer_line.split('|'):
            fields = entry.strip().split(' ')
            grouped.setdefault(' '.join(fields[:4]), []).append(fields[4])
        return grouped

    @staticmethod
    def _mark_entities(sentence, span_key, special_tokens):
        """Return the sentence with the four marker tokens inserted around both spans."""
        words = sentence.split(' ')
        first_start, first_end, second_start, second_end = map(int, span_key.split(' '))
        # End indexes are shifted by one so the closing marker lands after the last word.
        positions = [first_start, first_end + 1, second_start, second_end + 1]
        # Insert from the highest position down so earlier insertions do not
        # shift the positions that are still pending.
        for index, token in sorted(zip(positions, special_tokens), reverse=True):
            words.insert(index, token)
        return ' '.join(words)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_string, one_hot = self.data[index]
        return input_string, one_hot

In [8]:
class FinRED_dev(Dataset):
    """FinRED ``dev`` split exposed as (marked sentence, multi-hot label) pairs.

    Each line of ``dev.sent`` is a space-separated sentence and the line with the
    same number in ``dev.pointer`` holds ``|``-separated entries of five
    space-separated fields: four word indexes followed by a relation name. The
    first index pair is wrapped in ``<Sub>``/``</Sub>`` and the second pair in
    ``<Obj>``/``</Obj>``. Entries of one sentence that share the same four
    indexes are merged into a single sample whose label has a 1 for every
    relation listed for that pair.

    Args:
        train: stored on the instance but otherwise ignored; the ``dev`` files
            are loaded whatever its value. Kept so existing calls still work.
        max_chars: sentences longer than this many characters are dropped
            together with their pointer line.

    Raises:
        ValueError: if the sentence and pointer files have different line counts.
        KeyError: if a pointer entry names a relation missing from ``RELATION``.
    """

    def __init__(self, train = True, max_chars = 1000):
        self.train = train
        self.data = []
        setname = 'dev'

        with open(PATH_FOLDER + setname + '.pointer') as pointer_file:
            pointer = pointer_file.read().strip().split('\n')
        with open(PATH_FOLDER + setname + '.sent') as sent_file:
            sent = sent_file.read().strip().split('\n')

        # The two files are aligned by line number; refuse to pair them up when
        # they disagree instead of silently truncating to the shorter one.
        if len(sent) != len(pointer):
            raise ValueError(
                f"{setname}.sent has {len(sent)} lines but {setname}.pointer has {len(pointer)}"
            )

        print(len(sent))
        # Filter sentence and pointer as a pair so they can never get out of step.
        kept = [(s, p) for s, p in zip(sent, pointer) if len(s) <= max_chars]
        print(len(kept))

        for sentence, pointer_line in kept:
            for span_key, categories in self._group_relations(pointer_line).items():
                input_string = self._mark_entities(sentence, span_key, SPECIAL_TOKENS)
                label = [RELATION[name] for name in categories]  # class indexes
                one_hot = torch.zeros(len(RELATION))
                one_hot[label] = 1
                self.data.append((input_string, one_hot))

    @staticmethod
    def _group_relations(pointer_line):
        """Map each 'i j k l' index key of a pointer line to its relation names."""
        grouped = {}
        for entry in pointer_line.split('|'):
            fields = entry.strip().split(' ')
            grouped.setdefault(' '.join(fields[:4]), []).append(fields[4])
        return grouped

    @staticmethod
    def _mark_entities(sentence, span_key, special_tokens):
        """Return the sentence with the four marker tokens inserted around both spans."""
        words = sentence.split(' ')
        first_start, first_end, second_start, second_end = map(int, span_key.split(' '))
        # End indexes are shifted by one so the closing marker lands after the last word.
        positions = [first_start, first_end + 1, second_start, second_end + 1]
        # Insert from the highest position down so earlier insertions do not
        # shift the positions that are still pending.
        for index, token in sorted(zip(positions, special_tokens), reverse=True):
            words.insert(index, token)
        return ' '.join(words)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        input_string, one_hot = self.data[index]
        return input_string, one_hot

In [9]:
class RE(nn.Module):
    """RoBERTa encoder with a linear head for multi-label relation scoring.

    The ``roberta-base`` tokenizer and encoder are extended with the four entity
    marker tokens. For every input sentence the encoder states at the first
    ``<Sub>`` and first ``<Obj>`` marker are concatenated and projected to one
    logit per entry of ``RELATION``.
    """

    def __init__(self):
        super(RE, self).__init__()
        self.modelname = "FinRED ver 1.0"
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.pretrained = RobertaModel.from_pretrained('roberta-base').to(device=device)
        # ---------- register the marker tokens ----------
        special_tokens_dict = {'additional_special_tokens': SPECIAL_TOKENS}
        self.tokenizer.add_special_tokens(special_tokens_dict)
        self.pretrained.resize_token_embeddings(len(self.tokenizer))
        # Look the opening-marker ids up instead of hard-coding 50265 / 50267,
        # so they stay correct if the vocabulary or the token list changes.
        self.sub_token_id = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[0])
        self.obj_token_id = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[2])
        hidden_size = self.pretrained.config.hidden_size
        self.header = nn.Linear(hidden_size*2, len(RELATION)).to(device=device)

    def forward(self, x):
        """Score a batch of marked sentences.

        Args:
            x: sequence of sentences, each containing the ``<Sub>`` and ``<Obj>``
                markers.

        Returns:
            Tensor of logits with one row per sentence and one column per
            entry of ``RELATION``.

        Raises:
            ValueError: if a sentence has no ``<Sub>`` or ``<Obj>`` marker left
                after tokenization (missing, or cut off by truncation).
        """
        batchsize = len(x)
        target_device = self.header.weight.device
        # Pad only to the longest sentence of the batch; the attention mask
        # hides the padding, so padding to the model maximum only adds compute.
        encoded_input = self.tokenizer(x, return_tensors='pt', padding=True, truncation=True).to(device=target_device)
        input_ids = encoded_input['input_ids']

        # Locate the markers in the ids that are actually fed to the encoder, so
        # the positions always index valid rows of the encoder output.
        sub_mask = input_ids == self.sub_token_id
        obj_mask = input_ids == self.obj_token_id
        if not (bool(sub_mask.any(dim=1).all()) and bool(obj_mask.any(dim=1).all())):
            raise ValueError("every input needs a <Sub> and an <Obj> marker within the truncated sequence")
        # argmax returns the first maximal index, i.e. the first marker occurrence.
        sub_pos = sub_mask.int().argmax(dim=1)
        obj_pos = obj_mask.int().argmax(dim=1)

        output = self.pretrained(**encoded_input)[0]  # (batch, sequence, hidden)
        rows = torch.arange(batchsize, device=output.device)
        input_linear = torch.cat((output[rows, sub_pos], output[rows, obj_pos]), dim = -1)
        final_output = self.header(input_linear)
        return final_output
        

In [10]:
# Build the relation-extraction model; RE.__init__ loads the roberta-base
# tokenizer and encoder weights and places the modules on `device`.
model = RE()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def training_loop(optimizer, model, loss_fn, train_loader, test_loader, epochs, mini_per_batch):
    """Train with gradient accumulation, validating and checkpointing every epoch.

    Gradients of ``mini_per_batch`` consecutive mini-batches are accumulated
    before each optimizer step. The per-mini-batch loss is not rescaled, so the
    applied gradient is the sum of the mini-batch gradients. Mini-batches left
    over at the end of an epoch are applied in one final, smaller step instead
    of being thrown away.

    Args:
        optimizer: optimizer over the model parameters.
        model: module called as ``model(sentences)``; must expose ``modelname``.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        train_loader: iterable of ``(sentences, labels)`` training mini-batches.
        test_loader: loader handed to ``evaluate_model`` after every epoch.
        epochs: number of passes over ``train_loader``.
        mini_per_batch: number of mini-batches accumulated per optimizer step.
    """
    for epoch in range(1,epochs+1):
        model.train()
        loss_train_epoch = 0.0
        loss_batch = 0.0
        mb_passed = 0
        optimizer.zero_grad()  # start every epoch with clean gradients
        for sentences, labels in train_loader:
            outputs = model(sentences)
            # Follow the model's own device rather than a module-level global.
            labels = labels.to(device=outputs.device)
            loss = loss_fn(outputs, labels)
            loss.backward()
            loss_train_epoch += loss.item()
            loss_batch += loss.item()
            mb_passed += 1
            if mb_passed%mini_per_batch == 0:
                optimizer.step()
                optimizer.zero_grad()
                loss_batch=loss_batch/mini_per_batch
                print("epoch:",epoch, " minibatch train_loss: ", loss_batch)
                loss_batch = 0.0
                mb_passed = 0
        if mb_passed > 0:
            # Apply the gradients of the trailing, incomplete accumulation group;
            # otherwise the zero_grad() of the next epoch would discard them.
            optimizer.step()
            optimizer.zero_grad()
            print("epoch:",epoch, " minibatch train_loss: ", loss_batch/mb_passed)
        # max(..., 1) keeps an empty loader from dividing by zero.
        loss_train_epoch = loss_train_epoch/max(len(train_loader), 1)
        loss_validation, _, _, F1 = evaluate_model(model, test_loader, loss_fn)
        print("epoch:",epoch, " train_loss: ", loss_train_epoch, " val_loss: ", loss_validation, " val_F1: ", F1)
        torch.save(model.state_dict(), model.modelname+'.pt')  # checkpoint after every epoch

        

        
def accuracy_metrics(tensor, target):
    """Count confusion-matrix cells between binary predictions and targets.

    Args:
        tensor: predictions; an element counts as positive when it equals 1.
        target: ground truth with the same shape; positive when it equals 1.

    Returns:
        Long tensor ``[TP, TN, FP, FN]`` summed over every element.
    """
    predicted_pos = tensor == 1
    actual_pos = target == 1
    predicted_neg = ~predicted_pos
    actual_neg = ~actual_pos

    counts = [
        int((predicted_pos & actual_pos).sum()),  # TP
        int((predicted_neg & actual_neg).sum()),  # TN
        int((predicted_pos & actual_neg).sum()),  # FP
        int((predicted_neg & actual_pos).sum()),  # FN
    ]
    return torch.tensor(counts, dtype = torch.long)
        
def evaluate_model(model, data_loader, loss_fn):
    """Compute mean loss and micro-averaged precision / recall / F1 on a loader.

    A class is predicted positive when the sigmoid of its logit exceeds 0.5.
    Counts are pooled over every class and sample before the ratios are taken.

    Args:
        model: module called as ``model(sentences)`` and returning logits.
        data_loader: iterable of ``(sentences, labels)`` batches with
            multi-hot labels.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar loss.

    Returns:
        Tuple ``(loss, precision, recall, F1)``: ``loss`` is the mean of the
        per-batch losses as a float, the other three are 0-dim float tensors.
        A ratio whose denominator is zero is reported as 0 instead of NaN.
    """
    was_training = model.training
    model.eval()
    accus = torch.zeros(4, dtype = torch.long)  # TP, TN, FP, FN
    loss_test = 0.0
    with torch.no_grad():
        for sentences, labels in data_loader:
            outputs = model(sentences)
            # Follow the model's own device rather than a module-level global.
            labels = labels.to(device=outputs.device)  # (batch, classes) multi-hot
            output_labels = (torch.sigmoid(outputs)>0.5).long()
            accus += accuracy_metrics(output_labels, labels.long())
            loss = loss_fn(outputs, labels)
            loss_test += loss.item()
    # max(..., 1) keeps an empty loader from dividing by zero.
    loss_test = loss_test/max(len(data_loader), 1)

    true_pos, _, false_pos, false_neg = accus
    zero = torch.tensor(0.0)
    # Without these guards the ratios are NaN whenever nothing was predicted
    # positive or nothing is labelled positive.
    precision = true_pos/(true_pos+false_pos) if true_pos+false_pos > 0 else zero  # TP/(TP+FP)
    recall = true_pos/(true_pos+false_neg) if true_pos+false_neg > 0 else zero  # TP/(TP+FN)
    F1 = 2*(precision*recall)/(precision+recall) if precision+recall > 0 else zero
    # Put the model back in the mode it was in when this function was called.
    model.train(was_training)
    return loss_test, precision, recall,  F1


In [None]:
EPOCHES = 100
BATCHSIZE = 1900  # effective batch size; caution: keep it a multiple of BATCHSIZE_MINI
BATCHSIZE_MINI = 25  # samples per forward/backward pass
MINI_PER_BATCH = BATCHSIZE // BATCHSIZE_MINI  # mini-batches accumulated per optimizer step
LR = 4e-6
WEIGHT_DECAY = 4e-5

print(model)
# Resume from the checkpoint written by training_loop when one is present.
filename_parameters = model.modelname+'.pt'
if exists(filename_parameters):
    print(filename_parameters, "exists.")
    # map_location remaps the saved tensors onto the current device, so a
    # checkpoint written on another GPU index still loads here.
    print(model.load_state_dict(torch.load("./" + filename_parameters, map_location=device)))
else:
    print(filename_parameters, "does not exists.")
    
train_loader = DataLoader(FinRED(train=True), batch_size = BATCHSIZE_MINI, shuffle = True)
test_loader = DataLoader(FinRED(train=False), batch_size = 20, shuffle =True)

# Default reduction is 'mean'; with reduction='none' the loss would be a vector
# with one entry per sample of the batch.
loss_fn = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay = WEIGHT_DECAY)
training_loop(optimizer, model, loss_fn, train_loader, test_loader, EPOCHES, MINI_PER_BATCH)

RE(
  (pretrained): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50269, 768)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, element

<All keys matched successfully>
5700
5647
1068
1060
epoch: 1  minibatch train_loss:  0.1428983944811319
epoch: 1  minibatch train_loss:  0.14236399521561047


In [None]:
# Print (loss, precision, recall, F1) of the current model on the test loader.
print(evaluate_model(model, test_loader, loss_fn))

In [None]:
# Evaluate on the dev split. FinRED_dev loads the 'dev' files whatever the value
# of `train`, so the argument has no effect here.
dev_loader = DataLoader(FinRED_dev(train=False), batch_size = 20, shuffle =True)
print(evaluate_model(model, dev_loader, loss_fn))

epochs:13


In [None]:
# Collect the parameter tensors of the classification head for inspection.
ps = list(model.header.parameters())

In [None]:
# First parameter tensor of the encoder's word-embedding layer.
pse = next(iter(model.pretrained.embeddings.word_embeddings.parameters()))

In [None]:
# Display the head parameters collected in `ps`.
ps

In [None]:
# Standard deviation of the second head parameter tensor
# (presumably the bias of the nn.Linear head — verify).
ps[1].std()