## Data Processing

Generally, there are several steps as following:

1. Transform raw data to standard format(CSV), some supported format lists as following(When dealing with the raw data, several exceptions should be considered):
    1. There exists blank lines, inconsistent columns(!=3)
    2. Head, relation, tail is nan or string that only contains space
    3. Some string will be parsed as nan, exp.NAN, N/A, NA and so on
2. Use data of standard format to generate entity and relation dict
3. Split train, evaluation and test data if necessary.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# import sys

# sys.path.append("../../")

import re
import os
import json
import codecs
import numpy as np
import pandas as pd
from collections import Counter


import torch
import pickle
from config import Config

from utils import utils
from models import TransE, TransH, TransA, TransD, KG2E
from utils import evaluation
from dataloader.dataloader import tripleDataset
from torch.utils.data import DataLoader

from torch.autograd import Variable
from tensorboardX import SummaryWriter

## Hyper-parameters

In [3]:
from config import Config
default_conf = Config()

## Helpers

In [4]:
def csv_process(raw_path, save_path, names=None, header=None, sep="\t", encoding="utf-8", compression="infer"):
    print("INFO : Loading data of type %s" % os.path.splitext(raw_path)[-1])
    raw_df = pd.read_csv(raw_path,
                        sep=sep,
                        encoding=encoding,
                        names=names,
                        header=header,
                        keep_default_na=False,  # ==> Solve default nan
                        compression=compression,# ==> Solve ZIP and TAR
                        error_bad_lines=False,  # ==> Solve inconsistent lines
                        warn_bad_lines=False,   # ==> Solve inconsistent lines
                        skip_blank_lines=True)  # ==> Solve blank lines
    print("INFO : Remove the space from the head and tail of entity.")
    raw_df = raw_df.applymap(lambda x: x.strip())  # Rid of the space in head and tail of entity
    print("INFO : Drop line with nan value.")    # Attention: " " should be removed.
    raw_df.replace({'': np.nan}, inplace=True)
    raw_df.dropna(axis=0, how='any', inplace=True)

    print("INFO : Save standard data to file path : %s" % save_path)
    raw_df.to_csv(save_path, sep="\t", header=None, index=None, encoding="utf-8")
    print("INFO : Successfully saving!")
    return raw_df


def generate_dict(data_dfs, dict_save_dir):

    raw_df = pd.concat(data_dfs, axis=0)
    raw_df.reset_index(drop=True, inplace=True)

    head_counter = Counter(raw_df["head"])
    tail_counter = Counter(raw_df["tail"])
    rela_counter = Counter(raw_df["relation"])

    # Generate entity and relation list
    entity_list = list((head_counter + tail_counter).keys())
    rela_list = list(rela_counter.keys())

    # Transform to index dict
    print("INFO : Transform to index dict")
    entity_dict = dict([(word, ind) for ind, word in enumerate(entity_list)])
    rela_dict = dict([(word, ind) for ind, word in enumerate(rela_list)])

    # Save path
    entity_dict_path = os.path.join(dict_save_dir, "entity_dict.json")
    rela_dict_path = os.path.join(dict_save_dir, "relation_dict.json")

    # Saving dicts
    json.dump({"stoi": entity_dict, "itos": entity_list}, open(entity_dict_path, "w"))
    json.dump({"stoi": rela_dict, 'itos': rela_list}, open(rela_dict_path, "w"))

    return {"Entity": {"stoi": entity_dict, "itos": entity_list}, 
            "Rela": {"stoi": rela_dict, 'itos': rela_list} }

In [5]:

dict_dir = "../../data/TransX"
train_file = "../../data/TransX/train.txt"
valid_file = "../../data/TransX/valid.txt"
test_file = "../../data/TransX/test.txt"
# Step1: Transform raw data to standard format
train_df = csv_process(raw_path=train_file,
                save_path=default_conf.pos_path,
                names=["head", "relation", "tail"],
                header=None,
                sep="\t",
                encoding="utf-8")
valid_df = csv_process(raw_path=valid_file,
                save_path=default_conf.valid_path,
                names=["head", "relation", "tail"],
                header=None,
                sep="\t",
                encoding="utf-8")
test_df = csv_process(raw_path=test_file,
                save_path=default_conf.test_path,
                names=["head", "relation", "tail"],
                header=None,
                sep="\t",
                encoding="utf-8")

entity_rela_dict = generate_dict(data_dfs=[train_df, valid_df, test_df],
                 dict_save_dir=dict_dir)

INFO : Loading data of type .txt




  raw_df = pd.read_csv(raw_path,


  raw_df = pd.read_csv(raw_path,


INFO : Remove the space from the head and tail of entity.
INFO : Drop line with nan value.
INFO : Save standard data to file path : ../../data/TransX/train.txt
INFO : Successfully saving!
INFO : Loading data of type .txt
INFO : Remove the space from the head and tail of entity.
INFO : Drop line with nan value.
INFO : Save standard data to file path : ../../data/TransX/valid.txt




  raw_df = pd.read_csv(raw_path,


  raw_df = pd.read_csv(raw_path,


INFO : Successfully saving!
INFO : Loading data of type .txt
INFO : Remove the space from the head and tail of entity.
INFO : Drop line with nan value.
INFO : Save standard data to file path : ../../data/TransX/valid.txt




  raw_df = pd.read_csv(raw_path,


  raw_df = pd.read_csv(raw_path,


INFO : Successfully saving!
INFO : Transform to index dict


In [6]:
entity_rela_dict.keys()

dict_keys(['Entity', 'Rela'])

In [7]:
len(entity_rela_dict["Rela"]["stoi"]), len(entity_rela_dict["Entity"]["stoi"])

(1345, 14951)

## dataloaders

In [8]:

# Initialize dataset and dataloader
# If print(dataset[:]), you can get the result like:
#   (np.array(N, 3, dtype=int64), np.array(N, 3, dtype=int64))
# The first array represents the positive triples, while
#   the second array represents the negtive ones.
#   N is the size of all data.
repSeed = 0
exSeed = 0
headSeed = 0
tailSeed = 0

train_dataset = tripleDataset(posDataPath=default_conf.pos_path,
                    entityDictPath=default_conf.ent_path,
                    relationDictPath=default_conf.rel_path)

train_dataset.generateNegSamples(repProba=default_conf.rep_proba,
                        exProba=default_conf.ex_proba,
                        repSeed=repSeed,
                        exSeed=exSeed,
                        headSeed=headSeed,
                        tailSeed=tailSeed)
train_dataloader = DataLoader(train_dataset,
                    batch_size=default_conf.batch_size,
                    shuffle=default_conf.shuffle,
                    num_workers=default_conf.num_workers,
                    drop_last=default_conf.drop_last)


valid_dataset = tripleDataset(posDataPath=default_conf.valid_path,
                        entityDictPath=default_conf.ent_path,
                        relationDictPath=default_conf.rel_path)
valid_dataloader = DataLoader(valid_dataset,
                        batch_size=len(valid_dataset),
                        shuffle=False,
                        drop_last=False)


INFO : Load entity and relation dict.
INFO : Loading positive triples and transform to index.
INFO : Generate negtive samples from positive samples.
INFO : Load entity and relation dict.
INFO : Loading positive triples and transform to index.


In [9]:
batch = next(iter(train_dataloader))

In [10]:
batch[0] 

tensor([[12986,   114, 12305],
        [  863,   261,  3825],
        [ 9493,    46,   427],
        ...,
        [ 4272,    73,   202],
        [ 6031,    79,   496],
        [  574,     8,   758]])

In [11]:
batch[1]

tensor([[  758,   114, 12305],
        [  863,   261,  5635],
        [ 9493,    46,  1693],
        ...,
        [ 4272,    73,  5533],
        [ 6031,    79,   341],
        [  574,     8,  4331]])

In [12]:
def prepare_train_dataloader(seed):
    epSeed = seed
    exSeed = seed
    headSeed = seed
    tailSeed = seed

    train_dataset = tripleDataset(posDataPath=default_conf.pos_path,
                        entityDictPath=default_conf.ent_path,
                        relationDictPath=default_conf.rel_path)

    train_dataset.generateNegSamples(repProba=default_conf.rep_proba,
                            exProba=default_conf.ex_proba,
                            repSeed=repSeed,
                            exSeed=exSeed,
                            headSeed=headSeed,
                            tailSeed=tailSeed)
    train_dataloader = DataLoader(train_dataset,
                        batch_size=default_conf.batch_size,
                        shuffle=default_conf.shuffle,
                        num_workers=default_conf.num_workers,
                        drop_last=default_conf.drop_last)
    
    return train_dataloader


def prepare_val_datset():
    valid_dataset = tripleDataset(posDataPath=default_conf.valid_path,
                        entityDictPath=default_conf.ent_path,
                        relationDictPath=default_conf.rel_path)
    valid_dataloader = DataLoader(valid_dataset,
                            batch_size=len(valid_dataset),
                            shuffle=False,
                            drop_last=False)

    return valid_dataloader

## Modeling

In [13]:
model = TransE.TransE(entityNum=len(entity_rela_dict["Entity"]["stoi"]),
                    relationNum=len(entity_rela_dict["Rela"]["stoi"]),
                    embeddingDim=default_conf.TransE["EmbeddingDim"],
                    margin=default_conf.TransE["Margin"],
                    L=default_conf.TransE["L"])
model.to(default_conf.device)
                    
optimizer = torch.optim.Adam(model.parameters(),
                                weight_decay= default_conf.weight_decay,
                                lr=default_conf.learning_rate)

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=default_conf.lr_decay, patience=default_conf.lr_decay_epoch, threshold=0.0001)

## Training

In [14]:
sumWriter = SummaryWriter(log_dir=default_conf.summary_dir)

# Training, GLOBALSTEP and GLOBALEPOCH are used for summary
minLoss = float("inf")
bestMR = float("inf")

GLOBALSTEP = 0
GLOBALEPOCH = 0

for seed in range(1):
    print("INFO : Using seed %d" % seed)

    train_dataloader = prepare_train_dataloader(seed)

    for epoch in range(default_conf.epochs):
        GLOBALEPOCH += 1
        STEP = 0
        print("="*20+"EPOCHS(%d/%d)"%(epoch+1, default_conf.epochs)+"="*20)
        for batch_i, (posX, negX) in enumerate(train_dataloader):
            # Allocate tensor to devices
               
            posX = torch.LongTensor(posX).to(default_conf.device)
            negX = torch.LongTensor(negX).to(default_conf.device)
            # Normalize the embedding if neccessary
            model.normalizeEmbedding()

            # Calculate the loss from the model
            loss = model(posX, negX)
            lossVal = loss.cpu().item()

            # Calculate the gradient and step down
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print infomation and add to summary
            if minLoss > lossVal:
                minLoss = lossVal
            print("[TRAIN-EPOCH(%d/%d)-STEP(%d/%d)]Loss:%.4f, minLoss:%.4f"%(epoch+1, default_conf.epochs, STEP,len(train_dataloader.dataset),lossVal, minLoss))
            STEP += 1
            GLOBALSTEP += 1
            sumWriter.add_scalar('train/loss', lossVal, global_step=GLOBALSTEP)
            
            if batch_i == 5:
                break

        # if GLOBALEPOCH % default_conf.eval_epoch == 0:
        #     MR = evaluation.MREvaluation(evalloader=valid_dataloader,
        #                                     model=default_conf.model_name,
        #                                     simMeasure=default_conf.sim_measure,
        #                                     **model.retEvalWeights())
        #     sumWriter.add_scalar('train/eval', MR, global_step=GLOBALEPOCH)
        #     print("[EVALUATION-EPOCH(%d/%d)]Measure method %s, eval %.4f"% \
        #             (epoch+1, default_conf.epochs, default_conf.evalmethod, MR))
        #     # Save the model if new MR is better
        #     if MR < bestMR:
        #         bestMR = MR
        #         model_path = os.path.join(default_conf.model_path, "TransE_Model_MR_{}.pt".format(round(bestMR,4)))
        #         pkl_path = os.path.join(default_conf.embed_path, "TransE_Embedding_MR_{}.pkl".format(round(bestMR,4)))
        #         torch.save(model.state_dict(), model_path)
        #         with open(pkl_path, "wb") as fp:
        #             pickle.dump({"entlist" : entity_rela_dict["Entity"]["stoi"],
        #                             "rellist" : entity_rela_dict["Rela"]["stoi"],
        #                             "weights" : model.retEvalWeights()}, fp)
        
        

INFO : Using seed 0
INFO : Load entity and relation dict.
INFO : Loading positive triples and transform to index.
INFO : Generate negtive samples from positive samples.
[TRAIN-EPOCH(1/5)-STEP(0/483142)]Loss:1.0075, minLoss:1.0075
[TRAIN-EPOCH(1/5)-STEP(1/483142)]Loss:0.9963, minLoss:0.9963
[TRAIN-EPOCH(1/5)-STEP(2/483142)]Loss:0.9981, minLoss:0.9963
[TRAIN-EPOCH(1/5)-STEP(3/483142)]Loss:0.9885, minLoss:0.9885
[TRAIN-EPOCH(1/5)-STEP(4/483142)]Loss:0.9849, minLoss:0.9849
[TRAIN-EPOCH(1/5)-STEP(5/483142)]Loss:0.9815, minLoss:0.9815
[TRAIN-EPOCH(2/5)-STEP(0/483142)]Loss:0.9589, minLoss:0.9589
[TRAIN-EPOCH(2/5)-STEP(1/483142)]Loss:0.9682, minLoss:0.9589
[TRAIN-EPOCH(2/5)-STEP(2/483142)]Loss:0.9582, minLoss:0.9582
[TRAIN-EPOCH(2/5)-STEP(3/483142)]Loss:0.9401, minLoss:0.9401
[TRAIN-EPOCH(2/5)-STEP(4/483142)]Loss:0.9452, minLoss:0.9401
[TRAIN-EPOCH(2/5)-STEP(5/483142)]Loss:0.9492, minLoss:0.9401
[TRAIN-EPOCH(3/5)-STEP(0/483142)]Loss:0.9315, minLoss:0.9315
[TRAIN-EPOCH(3/5)-STEP(1/483142)]Loss: