In [None]:
import numpy as np
import pandas as pd
import re
import torch
from torch import nn
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import get_cosine_schedule_with_warmup
from torch.cuda import amp
from tqdm.notebook import tqdm

## Preprocessing part
The dataset provided here already has its concepts (the two entities of each sentence) labeled, so what remains to be done is classification.

Processing steps:
1. Read all of the content in.
2. Divide it into training and validation data.

In [None]:
def read_dataset(path):
    """Parse a relation-extraction text file made of four-line records.

    Each record is expected to consist of:
      1. ``<id><TAB>"<sentence>"`` where the sentence marks the two entities
         with ``<e1>..</e1>`` and ``<e2>..</e2>`` tags,
      2. the relation label,
      3. a ``Comment:`` line,
      4. a separator line.

    Groups of lines that are entirely blank (for example an extra empty line
    at the end of the file) are skipped instead of crashing the parser.

    Args:
        path: Location of the text file to read.

    Returns:
        Five parallel lists ``(ids, concepts, comments, contents, labels)``:
        integer sample ids, ``(entity1, entity2)`` tuples, comment strings
        with the ``Comment:`` prefix removed, sentences with entity tags and
        double quotes removed, and the raw label strings.

    Raises:
        ValueError: If the file ends in the middle of a record.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    ids_ls, concept_ls, comment_ls, new_content, label_ls_d = [], [], [], [], []
    for start in range(0, len(lines), 4):
        record = lines[start:start + 4]
        if not any(line.strip() for line in record):
            # Nothing but blank lines (typically trailing newlines): ignore.
            continue
        if len(record) < 3:
            raise ValueError(
                "incomplete record starting at line %d of %s" % (start + 1, path)
            )
        content = record[0]
        label = record[1].split("\n")[0]
        comment = record[2].split("\n")[0].replace("Comment:", "")

        sample_id = int(content.split("\t")[0])
        # The entity spans are read from the still-tagged line.
        first_entity = content.split("<e1>")[1].split("</e1>")[0]
        second_entity = content.split("<e2>")[1].split("</e2>")[0]

        # Strip the entity tags and the surrounding double quotes from the sentence.
        sentence = content.split("\t")[1].split("\n")[0]
        for marker in ("<e1>", "</e1>", "<e2>", "</e2>", '"'):
            sentence = sentence.replace(marker, "")

        ids_ls.append(sample_id)
        concept_ls.append((first_entity, second_entity))
        comment_ls.append(comment)
        new_content.append(sentence)
        label_ls_d.append(label)

    assert len(label_ls_d) == len(concept_ls) == len(comment_ls) == len(ids_ls) == len(new_content)
    return ids_ls, concept_ls, comment_ls, new_content, label_ls_d

In [None]:
# File locations: the labelled training split and the split used for validation.
train_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/TRAIN_FILE.TXT"
valid_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/FULL_TEST.txt"

# read_dataset returns (ids, entity pairs, comments, cleaned sentences, labels).
(train_ids, train_concepts, train_comments,
 train_content, train_labels) = read_dataset(train_path)
(valid_ids, valid_concepts, valid_comments,
 valid_content, valid_labels) = read_dataset(valid_path)

In [None]:
# Show the first parsed training record as the cell output.
(
    train_ids[0],
    train_concepts[0],
    train_content[0],
    train_comments[0],
    train_labels[0],
)

## Check max length

## Using roberta-large as the test model
I treat this task as a QA-style problem with the following layout:
1. question: [content]
2. answer: answer_concepts

The final input is the pair [question; answer] packed into one sequence (in the code below the answer segment is placed before the question).

In [None]:
# NOTE: the annotator comment IS appended to the entity pair below, so a comment
# must be supplied for every sample of every split that is converted.
def convert_input(concept_ls, content_ls, label_ls, comment_ls,label_mapping,model_name = "roberta-large", max_length = 128):
    """Turn sentences and entity pairs into fixed-length model inputs.

    Every sample is laid out as
    ``<cls> answer <sep> <sep> sentence <sep>`` where ``answer`` is the text
    ``"entity1 entity2 comment"``. When the two parts do not fit, tokens are
    dropped from the end of whichever part is currently longer (the sentence
    on ties) until the sequence fits ``max_length``.

    Args:
        concept_ls: ``(entity1, entity2)`` string pairs, one per sample.
        content_ls: Sentences, one per sample.
        label_ls: Label names, one per sample.
        comment_ls: Comment strings, one per sample.
        label_mapping: Mapping from label name to integer class id.
        model_name: Name or path handed to ``AutoTokenizer.from_pretrained``.
        max_length: Length every returned sequence is padded to.

    Returns:
        ``(input_ids, attention_masks, segment_ids, labels)``. The first three
        are lists of ``max_length``-long integer lists; ``labels`` holds the
        mapped class ids. Segment ids are all zeros.

    Raises:
        ValueError: If ``max_length`` cannot hold the four special tokens.
        KeyError: If a label is missing from ``label_mapping``.
    """
    special_token_count = 4  # cls + two separators after the answer + final separator
    if max_length < special_token_count:
        raise ValueError(
            "max_length must be at least %d, got %d" % (special_token_count, max_length)
        )
    token_budget = max_length - special_token_count

    label_ls = [label_mapping[i] for i in label_ls]
    # Honour model_name instead of always loading the roberta-large tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Pad with the tokenizer's own padding id; id 0 is not the padding token
    # for every vocabulary. Fall back to 0 only when none is defined.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0
    sep = [tokenizer.sep_token, tokenizer.sep_token]

    input_ids_ls, seg_ids_ls, att_mask_ls = [], [], []
    for content, answer_concept, comment in tqdm(zip(content_ls,concept_ls, comment_ls)):
        ans_tk = answer_concept[0]+" "+answer_concept[1]+" "+comment
        answer_tokens = tokenizer.tokenize(ans_tk)
        question_tokens = tokenizer.tokenize(content)

        # Trim from the tail of the longer part until both fit in the budget.
        while len(answer_tokens) + len(question_tokens) > token_budget:
            if len(question_tokens) >= len(answer_tokens):
                question_tokens.pop()
            else:
                answer_tokens.pop()

        tokens_a = [tokenizer.cls_token] + answer_tokens + sep
        tokens_b = question_tokens + [tokenizer.sep_token]
        all_tokens = tokens_a + tokens_b

        input_ids = tokenizer.convert_tokens_to_ids(all_tokens)
        seg_ids = [0] * len(all_tokens)
        input_mask = [1] * len(input_ids)

        # Right-pad everything to max_length; padded positions get mask 0.
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + [pad_id] * padding_length
        seg_ids = seg_ids + [0] * padding_length
        input_mask = input_mask + [0] * padding_length

        input_ids_ls.append(input_ids)
        seg_ids_ls.append(seg_ids)
        att_mask_ls.append(input_mask)
    assert len(input_ids_ls) == len(att_mask_ls) == len(seg_ids_ls)
    return input_ids_ls, att_mask_ls, seg_ids_ls, label_ls

In [None]:
# train_input_ids, train_att_mask, train_seg_ids, train_labels = convert_input(train_concepts, train_content,train_labels, label_mapping)
# valid_input_ids, valid_att_mask, valid_seg_ids, valid_labels = convert_input(valid_concepts, valid_content,valid_labels, label_mapping)

In [None]:
class Relation_cls(torch.utils.data.Dataset):
    """Map-style dataset over already-tokenised relation-classification samples.

    Indexing yields a 4-tuple of ``torch.long`` tensors in the order
    ``(input_ids, attention_mask, segment_ids, label)``.
    """

    def __init__(self, input_ids, att_mask, seg_ids, labels):
        # The four containers are stored as given and indexed in parallel.
        self.input_ids, self.att_mask = input_ids, att_mask
        self.seg_ids, self.labels = seg_ids, labels

    def __len__(self):
        # The number of samples is taken from the input-id container.
        return len(self.input_ids)

    def __getitem__(self, ids):
        columns = (self.input_ids, self.att_mask, self.seg_ids, self.labels)
        return tuple(torch.tensor(column[ids], dtype=torch.long) for column in columns)

In [None]:
# train_set = Relation_cls(train_input_ids, train_att_mask,train_seg_ids,train_labels)
# valid_set = Relation_cls(valid_input_ids, valid_att_mask,valid_seg_ids,valid_labels)

In [None]:
class Text_model(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained`` when
            ``path`` is not given.
        path: Optional alternative location to load the encoder from; when
            provided it is used instead of ``model_name``.
        num_class: Number of output classes of the linear head.
    """

    def __init__(self, model_name,path = None, num_class = 19):
        super().__init__()
        # Load from the explicit path when one is supplied, otherwise by name.
        encoder_source = model_name if path is None else path
        self.model = AutoModel.from_pretrained(encoder_source)
        self.output_size = self.model.config.hidden_size
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.output_size, num_class)

    def forward(self, input_ids, seg_ids, att_mask):
        """Return class logits for a batch.

        Note the argument order: segment ids come before the attention mask.
        """
        encoder_output = self.model(
            input_ids,
            attention_mask = att_mask,
            token_type_ids = seg_ids,
        )
        # Element 1 of the encoder output is used as the sentence-level
        # (pooled) representation that feeds the classifier.
        return self.classifier(self.dropout(encoder_output[1]))

In [None]:
# model = Text_model("roberta-large")

In [None]:
# train_iter = torch.utils.data.DataLoader(train_set, batch_size = 2)
# input_ids, att_mask, seg_ids,label = next(iter(train_iter))
# model(input_ids, seg_ids, att_mask)

In [None]:
def evaluate_accuracy(eval_set, model, criterion,device):
    """Compute accuracy and per-batch losses of ``model`` over ``eval_set``.

    Args:
        eval_set: Iterable of batches ``(input_ids, att_mask, seg_ids, labels)``.
        model: Module called as ``model(input_ids, seg_ids, att_mask)`` that
            returns one row of class logits per sample.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and every batch are moved to.

    Returns:
        ``(accuracy, loss_ls)`` where ``loss_ls`` holds one float per batch.
        ``accuracy`` is ``0.0`` when ``eval_set`` yields no samples.
    """
    # Mixed precision only applies on CUDA; this mirrors the training loop,
    # which enables AMP only for CUDA devices.
    use_amp = torch.device(device).type == "cuda"
    model.to(device)
    model.eval()
    n_samples, n_correct = 0, 0
    loss_ls = []
    with torch.no_grad():
        for input_ids, att_mask, seg_ids, labels in tqdm(eval_set):
            input_ids   = input_ids.to(device)
            att_mask    = att_mask.to(device)
            labels      = labels.to(device)
            seg_ids     = seg_ids.to(device)
            with amp.autocast(enabled= use_amp):
                logits = model(input_ids,seg_ids, att_mask)
            # Under autocast the logits may be half precision; compute the
            # loss in float32 so it is not rounded to fp16.
            loss = criterion(logits.float(), labels)
            n_correct += (logits.argmax(1) == labels).sum().item()
            n_samples += labels.size(0)
            loss_ls.append(loss.item())
    # Avoid a ZeroDivisionError when the loader is empty.
    accuracy = n_correct / n_samples if n_samples else 0.0
    return accuracy, loss_ls

In [None]:
def train_with_amp(net, train_set, valid_set,criterion, optimizer, epochs,batch_size, scheduler, gradient_accumulate_step, max_grad_norm , num_gpu):
    """Train ``net`` with automatic mixed precision and evaluate after every epoch.

    Runs on CUDA when available (AMP enabled), otherwise on CPU (AMP disabled).
    A checkpoint is written to ``./checkpoint.params`` every 30 batches and at
    the last batch of each epoch; one line per epoch is appended to
    ``train_log``.

    Relies on ``evaluate_accuracy``, ``prediction`` and ``f1_score`` being
    defined in the notebook by the time this function is called.

    Args:
        net: Model called as ``net(input_ids, seg_ids, att_mask)``.
        train_set: Dataset yielding ``(input_ids, att_mask, seg_ids, labels)``.
        valid_set: Dataset with the same layout, evaluated after each epoch.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer over ``net``'s parameters.
        epochs: Number of passes over ``train_set``.
        batch_size: Batch size for both data loaders.
        scheduler: Learning-rate scheduler, stepped once per optimizer step.
        gradient_accumulate_step: Number of batches whose gradients are
            accumulated before each optimizer step.
        max_grad_norm: Maximum gradient norm used for clipping.
        num_gpu: Not used by this function; kept for interface compatibility.
    """
    net.train()

    device  = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print("\ntrain on %s\n"%str(device))
    enable_amp  = "cuda" in device.type
    # The scaler multiplies the loss so small fp16 gradients do not underflow.
    scaler      = amp.GradScaler(enabled= enable_amp)
    net.to(device)
    train_iter  = torch.utils.data.DataLoader(train_set, batch_size = batch_size,shuffle = True)
    valid_iter = torch.utils.data.DataLoader(valid_set, batch_size = batch_size)

    ls = []  # training losses recorded at every logging step
    # Norm measured at the most recent optimizer step (None before the first one).
    gradient_norm = None
    for epoch in range(epochs):
        net.train()
        for idx, value in enumerate(train_iter):
            input_ids, att_mask ,seg_ids,labels = value
            input_ids   = input_ids.to(device)
            att_mask    = att_mask.to(device)
            labels      = labels.to(device)
            seg_ids     = seg_ids.to(device)
            # Only the forward pass runs under autocast.
            with amp.autocast(enabled= enable_amp):
                output  = net(input_ids, seg_ids, att_mask)
            # Logits may be half precision here; compute the loss in float32.
            loss        = criterion(output.float(), labels)
            if gradient_accumulate_step > 1:
                # Gradient accumulation works around limited GPU memory;
                # divide so the accumulated gradient matches one large batch.
                loss    = loss/gradient_accumulate_step

            # Scale the loss before backward so the gradients do not vanish.
            scaler.scale(loss).mean().backward()

            if (idx + 1) % gradient_accumulate_step == 0:
                # Gradients are still multiplied by the loss scale at this
                # point, so they must be unscaled before clipping; otherwise
                # max_grad_norm is compared against scaled values.
                scaler.unscale_(optimizer)
                gradient_norm = nn.utils.clip_grad_norm_(net.parameters(),max_grad_norm)
                # scaler.step skips optimizer.step() when the gradients
                # contain infs or NaNs, leaving the weights untouched.
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            # Log and checkpoint every 30 batches and on the epoch's last batch.
            if idx % 30 == 0 or idx == len(train_iter) -1:
                print("==============Epochs "+ str(epoch) + " ======================")
                print("loss: " + str(loss) + "; grad_norm: " + str(gradient_norm))
                ls.append(loss.item())
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': net.state_dict(),
                    'param_groups': optimizer.state_dict()["param_groups"],
                    'loss': ls
                },"./checkpoint.params")

        print("start evaluation:...")
        acc, loss_ls = evaluate_accuracy(valid_iter, net, criterion,device)
        prediction_ls, labels_ls = prediction(valid_iter, net, device)
        pred = torch.cat(prediction_ls).cpu().numpy()
        labels_true = torch.cat(labels_ls).cpu().numpy()
        # Compute each F1 score once and reuse it for printing and logging.
        micro_f1 = f1_score(labels_true, pred, average="micro")
        macro_f1 = f1_score(labels_true, pred, average="macro")
        print("Micro:", micro_f1)
        print("Macro:", macro_f1)
        print("acc: ", acc)
        with open("train_log", "a") as f:
            f.write("Epoch %s, eval_accuracy %.4f, valid_loss %s, Macro %.4f, Micro %.4f \n"%(epoch, acc, loss_ls, macro_f1, micro_f1))

In [None]:
from sklearn.metrics import f1_score
def prediction(eval_set, model,device):
    """Collect predicted class ids and true labels for every batch of ``eval_set``.

    Args:
        eval_set: Iterable of batches ``(input_ids, att_mask, seg_ids, labels)``.
        model: Module called as ``model(input_ids, seg_ids, att_mask)`` that
            returns one row of class logits per sample.
        device: Device the model and every batch are moved to.

    Returns:
        ``(prediction_ls, label_ls)``: two lists with one tensor per batch,
        both located on ``device``. Both are empty when ``eval_set`` is empty.
    """
    # Mixed precision only applies on CUDA; this mirrors the training loop,
    # which enables AMP only for CUDA devices.
    use_amp = torch.device(device).type == "cuda"
    model.to(device)
    model.eval()
    prediction_ls, label_ls = [], []
    with torch.no_grad():
        for input_ids, att_mask, seg_ids, labels in tqdm(eval_set):
            input_ids   = input_ids.to(device)
            att_mask    = att_mask.to(device)
            labels      = labels.to(device)
            seg_ids     = seg_ids.to(device)
            with amp.autocast(enabled= use_amp):
                logits = model(input_ids,seg_ids, att_mask)
            # The predicted class is the index of the largest logit per row.
            prediction_ls.append(logits.argmax(1))
            label_ls.append(labels)
    return prediction_ls, label_ls

After 5 epochs: Macro-F1 0.78, Micro-F1 0.83. Try 3 more epochs:

After 8 epochs: Macro-F1 0.816, Micro-F1 0.85.

Try 2 more epochs with a lower learning rate (1e-7) and the Adam optimizer: Macro-F1 0.823, Micro-F1 0.856.

## Visualization

## Loss and accuracy

In [None]:
import re
def change_logs(path):
    """Parse the per-epoch training log into metric arrays.

    Every non-blank line is expected to follow the layout written by the
    training loop::

        Epoch <n>, eval_accuracy <acc>, valid_loss [<l1>, <l2>, ...], Macro <f1>, Micro <f1>

    Blank lines are skipped. Loss entries are parsed with ``float`` so values
    such as ``nan`` or ``inf`` are accepted.

    Args:
        path: Location of the log file.

    Returns:
        Three numpy arrays with one entry per parsed line: accuracy, mean of
        the logged validation losses, and macro F1.

    Raises:
        ValueError: If a non-blank line does not match the expected layout.
    """
    # Capture the accuracy, the bracketed loss list and the macro score.
    line_pattern = re.compile(
        r"eval_accuracy\s+([^,\s]+),\s*valid_loss\s+\[(.*?)\],\s*Macro\s+([^,\s]+)"
    )
    accuracies = []
    valid_loss = []
    valid_macro = []
    with open(path, "r") as f:
        for log_ in f:
            if not log_.strip():
                continue
            match = line_pattern.search(log_)
            if match is None:
                raise ValueError("unrecognised log line: %r" % log_)
            acc_text, loss_text, macro_text = match.groups()
            accuracies.append(float(acc_text))
            # One float per batch was logged; average them for the epoch.
            loss_ls = [float(item) for item in loss_text.split(",") if item.strip()]
            valid_loss.append(np.mean(loss_ls))
            valid_macro.append(float(macro_text))
    return np.array(accuracies), np.array(valid_loss), np.array(valid_macro)
#change_logs("../input/relation-cls-params/train_log")

## Initial training with 5 epochs: the accuracy and validation loss are shown below

In [None]:
import matplotlib.pyplot as plt
import ast
# Plot validation accuracy, mean validation loss and macro F1 per logged epoch,
# annotate the values of the last epoch and save the figure.
path = "../input/relation-cls-params/train_log"
valid_acc, valid_loss, valid_Marco = change_logs(path)
r_int = 0.05  # not used by the plotting code below; kept in case other cells read it
plt.figure()
# One x label per logged epoch, so the plot works for any number of log lines
# (a fixed two-entry list fails with a shape mismatch for other lengths).
epochs = ["epochs %d" % epoch_idx for epoch_idx in range(len(valid_acc))]
plt.plot(epochs, valid_acc , c = "red", label = "accuracy", linestyle = "--")
plt.scatter(epochs, valid_acc  , c = "red")
plt.scatter(epochs, valid_loss , c= "blue")
plt.scatter(epochs, valid_Marco , c= "Green")
plt.plot(epochs ,valid_loss, c="blue", label = "loss")
plt.plot(epochs ,valid_Marco, c="Green", label = "Macro-F1")
plt.xlabel("epochs")
plt.title("Validation loss / accuracy / Macro-F1 per epoch")
plt.legend(loc = "best")
# Annotate the final values only when the log actually contains an epoch.
if len(valid_acc) > 0:
    plt.annotate(r"acc:%.3f"%(valid_acc[-1] ), xy = (epochs[-1], valid_acc[-1]), xytext = (-50,+30), textcoords = "offset points",fontsize = 14,
                arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=.2'))
    plt.annotate(r"loss:%.3f"%(valid_loss[-1]), xy = (epochs[-1], valid_loss[-1]), xytext = (+50,-30), textcoords = "offset points",fontsize = 12,
                arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=.2'))
    plt.annotate(r"Macro-F1:%.3f"%(valid_Marco[-1]), xy = (epochs[-1], valid_Marco[-1]), xytext = (-120,-40), textcoords = "offset points",fontsize = 12,
                arrowprops=dict(arrowstyle='->',connectionstyle='arc3,rad=.2'))
plt.savefig("./validation.png")
# plt.ylim(0,1)

## Eval

In [None]:
# if __name__ == "__main__":
#     train_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/TRAIN_FILE.TXT"
#     valid_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/FULL_TEST.txt"
#     train_ids, train_concepts, train_comments, train_content, train_labels = read_dataset(train_path)
#     valid_ids, valid_concepts, valid_comments, valid_content, valid_labels = read_dataset(valid_path)
#     label_mapping = {label:ids for ids, label in enumerate(np.unique(train_labels).tolist())}
#     train_input_ids, train_att_mask, train_seg_ids, train_labels = convert_input(train_concepts, train_content, train_labels,train_comments, label_mapping)
#     valid_input_ids, valid_att_mask, valid_seg_ids, valid_labels = convert_input(valid_concepts, valid_content,valid_labels,valid_comments, label_mapping)

#     train_set = Relation_cls(train_input_ids, train_att_mask,train_seg_ids,train_labels)
#     valid_set = Relation_cls(valid_input_ids, valid_att_mask,valid_seg_ids,valid_labels)
#     batch_size = 16
#     valid_iter = torch.utils.data.DataLoader(valid_set, batch_size = batch_size)
#     model = Text_model("roberta-large")
#     dic = torch.load("../input/relation-cls-params/checkpoint.params")
#     model.load_state_dict(dic["model_state_dict"])
#     prediction_ls, labels_ls = prediction(valid_iter, model, torch.device("cuda"))
#     pred = torch.cat(prediction_ls).cpu().numpy()
#     labels_true = torch.cat(labels_ls).cpu().numpy()
#     print("Mirco score for final model:", f1_score(labels_true, pred, average="micro"))
#     print("Macro score for final model:", f1_score(labels_true, pred, average="macro"))

## Train from initial

In [None]:
# if __name__ == "__main__":
#     train_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/TRAIN_FILE.TXT"
#     valid_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/FULL_TEST.txt"
#     train_ids, train_concepts, train_comments, train_content, train_labels = read_dataset(train_path)
#     valid_ids, valid_concepts, valid_comments, valid_content, valid_labels = read_dataset(valid_path)
#     label_mapping = {label:ids for ids, label in enumerate(np.unique(train_labels).tolist())}
#     train_input_ids, train_att_mask, train_seg_ids, train_labels = convert_input(train_concepts, train_content, train_labels,train_comments, label_mapping)
#     valid_input_ids, valid_att_mask, valid_seg_ids, valid_labels = convert_input(valid_concepts, valid_content,valid_labels,valid_comments, label_mapping)
#     train_set = Relation_cls(train_input_ids, train_att_mask,train_seg_ids,train_labels)
#     valid_set = Relation_cls(valid_input_ids, valid_att_mask,valid_seg_ids,valid_labels)
#     criterion = nn.CrossEntropyLoss()
#     batch_size = 16
#     lr = 2e-6
#     num_gpu = 1
#     model = Text_model("roberta-large")
#     optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
#     scheduler = get_cosine_schedule_with_warmup(optimizer= optimizer, num_warmup_steps = 0, 
#                                                 num_training_steps= len(torch.utils.data.DataLoader(train_set, batch_size = batch_size)), num_cycles = 0.5)
#     train_with_amp(model, train_set, valid_set, criterion, optimizer, epochs=6, batch_size = batch_size, scheduler=scheduler, gradient_accumulate_step=1,
#                   max_grad_norm=1000, num_gpu=num_gpu)

## Fine tune

In [None]:
# if __name__ == "__main__":
#     train_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/TRAIN_FILE.TXT"
#     valid_path = "../input/dpw3-project-dataset/DPW3-RelationExtraction/FULL_TEST.txt"
#     train_ids, train_concepts, train_comments, train_content, train_labels = read_dataset(train_path)
#     valid_ids, valid_concepts, valid_comments, valid_content, valid_labels = read_dataset(valid_path)
#     label_mapping = {label:ids for ids, label in enumerate(np.unique(train_labels).tolist())}
#     train_input_ids, train_att_mask, train_seg_ids, train_labels = convert_input(train_concepts, train_content, train_labels,train_comments, label_mapping)
#     valid_input_ids, valid_att_mask, valid_seg_ids, valid_labels = convert_input(valid_concepts, valid_content,valid_labels,valid_comments, label_mapping)

#     train_set = Relation_cls(train_input_ids, train_att_mask,train_seg_ids,train_labels)
#     valid_set = Relation_cls(valid_input_ids, valid_att_mask,valid_seg_ids,valid_labels)
#     criterion = nn.CrossEntropyLoss()
#     batch_size = 16
#     lr = 1e-6
#     num_gpu = 1
#     model = Text_model("roberta-large")
#     dic = torch.load("../input/relation-cls-params/checkpoint.params")
#     model.load_state_dict(dic["model_state_dict"])
#     optimizer = torch.optim.Adam(model.parameters(), lr = lr)
#     scheduler = get_cosine_schedule_with_warmup(optimizer= optimizer, num_warmup_steps = 0, 
#                                                 num_training_steps= len(torch.utils.data.DataLoader(train_set, batch_size = batch_size)), num_cycles = 0.5)
#     train_with_amp(model, train_set, valid_set, criterion, optimizer, epochs=2, batch_size = batch_size, scheduler=scheduler, gradient_accumulate_step=1,
#                   max_grad_norm=1000, num_gpu=num_gpu)