**Knowledge Graph Embedding, model training, and evaluation using the TorchKGE library**

In [34]:
############################################################
# imports
############################################################

from torch import cuda
from torch.optim import Adam
import pickle 

from torchkge.models import TransEModel, ComplExModel, ConvKBModel
from torchkge.sampling import BernoulliNegativeSampler, UniformNegativeSampler
from torchkge.utils import Trainer, MarginLoss, DataLoader
from torchkge.evaluation import LinkPredictionEvaluator
from torchkge.evaluation import RelationPredictionEvaluator
from torchkge.evaluation import TripletClassificationEvaluator

import json
import networkx as nx
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torchkge.data_structures import KnowledgeGraph
from tqdm.autonotebook import tqdm

import pandas as pd


In [2]:
############################################################
# load files
############################################################

def _load_pickle(path):
    """Return the object pickled at ``path``; the file is closed before returning.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by this project.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


#list_of_lists = pickle.load(open('data/all_KGs_as_lists2.pkl', 'rb'))
list_of_lists = _load_pickle('data/all_rows.pkl')
#df_list
df_list_old = _load_pickle('data/all_Dataframes.pkl')
df_list = _load_pickle('data/all_Dataframes2.pkl')
#all Knowledge Graph objects
KGs_old = _load_pickle('data/all_Knowledge_Graphs.pkl')
KGs = _load_pickle('data/all_Knowledge_Graphs2.pkl')

In [9]:
############################################################
# 1. making KG objects from dataframes
# note: no need to run again unless changes were made to 'list_of_lists', just load 'KGs' instead
############################################################
# One KnowledgeGraph per non-empty entry of list_of_lists; empty entries are skipped.
# Each entry is turned into a DataFrame with the columns 'from', 'to', 'rel'.
KGs = [
    KnowledgeGraph(df=pd.DataFrame(row, columns=['from', 'to', 'rel']))
    for row in list_of_lists
    if row
]

print(len(KGs)) # total number of dataframes: 8471 (8469)

# Context manager so the file is flushed and closed as soon as the dump finishes.
with open('data/all_Knowledge_Graphs2.pkl', 'wb') as fh:
    pickle.dump(KGs, fh)

8471


In [10]:
############################################################
# collect KGs with 'Handlung: Kreuzigen'
# idea: mimics what a user has previously looked at & uses that as base for recommendation
############################################################

with open('data/all_Knowledge_Graphs2.pkl', 'rb') as fh:
    KGs = pickle.load(fh)

graphs = []  # triple DataFrames of every KG whose 'to' column mentions 'kreuzigen'
sets = []
for kg in KGs:
    df = kg.get_df()
    # Substring match is case-sensitive (pandas default); na=False treats
    # missing 'to' values as "no match" instead of propagating NaN.
    # NOTE(review): the heading spells it 'Kreuzigen' -- confirm the data is lower-case.
    if df['to'].str.contains('kreuzigen', na=False).any():
        graphs.append(df)




In [26]:
############################################################
# preparing kg_train, kg_val for model and eval
############################################################
print(len(graphs))

# Stack the first five matching graphs row-wise and wrap the result in a single
# KnowledgeGraph; this is the graph all models below are trained on.
#complete_KG = graphs[:2]
complete_KG = KnowledgeGraph(df=pd.concat(graphs[:5], axis=0))

# NOTE(review): `sizes` is computed without a validation flag and then passed
# together with validation=True -- confirm in the torchkge docs which of the two
# wins and whether sets[1] is really a validation split (it may be the test split).
sizes = complete_KG.get_sizes(complete_KG.n_facts, share=0.8)
sets = complete_KG.split_kg(share=0.8, sizes=sizes, validation=True)
kg_train, kg_val = sets[0], sets[1]

126


In [27]:

# Persist the triples of the training graph as a DataFrame.
datafr = complete_KG.get_df()

# Context manager guarantees the file is flushed and closed after the dump.
with open('results/data/model_input_df.pkl', 'wb') as fh:
    pickle.dump(datafr, fh)


In [28]:
############################################################
# Model Training TransE
############################################################
# The imports cell binds the name `DataLoader` twice (torchkge.utils first, then
# torch_geometric.data), so the bare name refers to the torch_geometric class.
# Import the torchkge loader under an unambiguous alias for this cell.
from torchkge.utils import DataLoader as KGDataLoader

# Define some hyper-parameters for training
emb_dim = 10
lr = 0.0004
n_epochs = 1000
b_size = 128
margin = 0.5

# NOTE(review): this trains on the complete graph, i.e. including the facts that
# were split off into kg_val earlier -- confirm that this is intended.
kg_train = complete_KG

# Define the model and criterion
model = TransEModel(emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
criterion = MarginLoss(margin)

# Move everything to CUDA if available
if cuda.is_available():
    cuda.empty_cache()
    model.cuda()
    criterion.cuda()

# Device the model parameters live on; every batch is moved there before the
# forward pass, otherwise CPU index tensors meet CUDA embeddings and the call fails.
device = next(model.parameters()).device

# Define the torch optimizer to be used
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

sampler = BernoulliNegativeSampler(kg_train) #UniformNegativeSampler(kg_train, kg_val=kg_val, kg_test=kg_test)
dataloader = KGDataLoader(kg_train, batch_size=b_size)

iterator = tqdm(range(n_epochs), unit='epoch')
for epoch in iterator:
    running_loss = 0.0
    for batch in dataloader:
        h, t, r = batch[0], batch[1], batch[2]
        # Negative sampling happens on the loader's device; only afterwards are
        # the positive and corrupted indices moved to the model's device.
        n_h, n_t = sampler.corrupt_batch(h, t, r)
        h, t, r, n_h, n_t = (idx.to(device) for idx in (h, t, r, n_h, n_t))

        optimizer.zero_grad()

        # forward + backward + optimize
        pos, neg = model(h, t, r, n_h, n_t)
        loss = criterion(pos, neg)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    iterator.set_description(
        'Epoch {} | mean loss: {:.5f}'.format(epoch + 1,
                                              running_loss / len(dataloader)))
    model.normalize_parameters()
model.normalize_parameters()

# Context manager so the pickle is flushed and the handle closed right away.
with open('results/data/model.pkl', 'wb') as fh:
    pickle.dump(model, fh)

Epoch 1000 | mean loss: 1.62740: 100%|██████████| 1000/1000 [00:54<00:00, 18.31epoch/s]


In [43]:

# Triplet classification evaluation on test set by learning thresholds on validation set
# Requires `kg_val` from the train/val preparation cell.
# NOTE(review): every entry of KGs was built by its own KnowledgeGraph(df=...) call,
# so presumably each has its own entity/relation index mapping that need not line up
# with the embeddings of the loaded model -- confirm before interpreting the scores.
with open('results/deep/model.pkl', 'rb') as fh:  #change file if needed
    model = pickle.load(fh)
with open('data/all_Knowledge_Graphs2.pkl', 'rb') as fh:
    KGs = pickle.load(fh)

err_count = 0
eval_errors = []   # (index in KGs, repr of the exception) for every failed evaluation
eval_results = []  # [accuracy, kg_test] per successfully evaluated graph
for index, kg_test in enumerate(KGs):
    if not isinstance(kg_test, KnowledgeGraph):
        continue
    try:
        evaluator = TripletClassificationEvaluator(model, kg_val, kg_test)
        evaluator.evaluate(b_size=264)
        eval_res = evaluator.accuracy(b_size=264)
    except Exception as exc:
        # Best-effort: keep going, but remember which graph failed and why.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        err_count += 1
        eval_errors.append((index, repr(exc)))
        continue
    eval_results.append([eval_res, kg_test])

print("Error Count: ",err_count,"/",len(KGs))
print(len(eval_results))

with open("results/deep/Triplet_Classification.pkl", "wb") as fh:  #change file if needed
    pickle.dump(eval_results, fh)


Error Count:  27 / 8471
8444


In [31]:
# Store the current evaluation results and model under results/transE/.
# Context managers make sure both files are flushed and closed.
with open("results/transE/Triplet_Classification.pkl", "wb") as fh:
    pickle.dump(eval_results, fh)
with open("results/transE/model.pkl", "wb") as fh:
    pickle.dump(model, fh)

In [None]:
# Reference snippets: how to run each of the evaluators
#eval = RelationPredictionEvaluator(model, KGs[9], directed=True)
#eval.evaluate(b_size=b_size)
#eval.print_results()

# Link prediction evaluation on test set.
#evaluator = LinkPredictionEvaluator(model, KGs[9])
#evaluator.evaluate(b_size=32)
#evaluator.print_results()

# Triplet classification evaluation on test set by learning thresholds on validation set
#evaluator = TripletClassificationEvaluator(model, kg_val, kg_test)
#evaluator.evaluate(b_size=128)
#print('Accuracy on test set: {}'.format(evaluator.accuracy(b_size=128)))

In [49]:

# relation prediction evaluation on test set
# NOTE(review): every entry of KGs was built by its own KnowledgeGraph(df=...) call,
# so presumably each has its own entity/relation index mapping that need not line up
# with the embeddings of the loaded model -- confirm before interpreting the scores.
with open('results/bilinear/model.pkl', 'rb') as fh:
    model = pickle.load(fh)
with open('data/all_Knowledge_Graphs2.pkl', 'rb') as fh:
    KGs = pickle.load(fh)

err_count = 0
eval_errors = []   # (index in KGs, repr of the exception) for every failed evaluation
eval_results = []  # [[mean_rank, hit_at_k, mrr], kg_test] per evaluated graph
for index, kg_test in enumerate(KGs):
    if not isinstance(kg_test, KnowledgeGraph):
        continue
    try:
        # Named `evaluator` rather than `eval` so the builtin is not shadowed.
        evaluator = RelationPredictionEvaluator(model, kg_test, directed=True)
        evaluator.evaluate(b_size=264, verbose=False)
        mean = evaluator.mean_rank()
        hit = evaluator.hit_at_k()  # library default for k
        mrr = evaluator.mrr()
    except Exception as exc:
        # Best-effort: keep going, but remember which graph failed and why.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        err_count += 1
        eval_errors.append((index, repr(exc)))
        continue
    eval_results.append([[mean, hit, mrr], kg_test])

print("Error Count: ",err_count,"/",len(KGs))
print(len(eval_results))

with open("results/bilinear/Relation_Prediction.pkl", "wb") as fh:
    pickle.dump(eval_results, fh)

Error Count:  27 / 8471
8444


In [None]:
# Write the current `eval_results` to the first_try folder; the context manager
# closes the file once the dump is complete.
with open("graphs/first_try/eval_TransE_Rel_Predict_2.pkl", "wb") as fh:
    pickle.dump(eval_results, fh)

In [46]:

# link prediction evaluation on test set
# NOTE(review): every entry of KGs was built by its own KnowledgeGraph(df=...) call,
# so presumably each has its own entity/relation index mapping that need not line up
# with the embeddings of the loaded model -- confirm before interpreting the scores.
with open('results/bilinear/model.pkl', 'rb') as fh:
    model = pickle.load(fh)
with open('data/all_Knowledge_Graphs2.pkl', 'rb') as fh:
    KGs = pickle.load(fh)

err_count = 0
eval_errors = []   # (index in KGs, repr of the exception) for every failed evaluation
eval_results = []  # [[mean_rank, hit_at_k, mrr], kg_test] per evaluated graph
for index, kg_test in enumerate(KGs):
    if not isinstance(kg_test, KnowledgeGraph):
        continue
    try:
        # Named `evaluator` rather than `eval` so the builtin is not shadowed.
        evaluator = LinkPredictionEvaluator(model, kg_test)
        evaluator.evaluate(b_size=264, verbose=False)
        mean = evaluator.mean_rank()
        hit = evaluator.hit_at_k()  # library default for k
        mrr = evaluator.mrr()
    except Exception as exc:
        # Best-effort: keep going, but remember which graph failed and why.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        err_count += 1
        eval_errors.append((index, repr(exc)))
        continue
    eval_results.append([[mean, hit, mrr], kg_test])

print("Error Count: ",err_count,"/",len(KGs))
print(len(eval_results))

with open("results/bilinear/Link_Prediction.pkl", "wb") as fh:
    pickle.dump(eval_results, fh)

Error Count:  27 / 8471
8444


**Bilinear model (ComplEx)**

In [35]:
############################################################
# Model Training ComplEx
############################################################
# The imports cell binds the name `DataLoader` twice (torchkge.utils first, then
# torch_geometric.data), so the bare name refers to the torch_geometric class.
# Import the torchkge loader under an unambiguous alias for this cell.
from torchkge.utils import DataLoader as KGDataLoader

# Define some hyper-parameters for training
emb_dim = 10
lr = 0.0004
n_epochs = 1000
b_size = 128
margin = 0.5

# NOTE(review): this trains on the complete graph, i.e. including the facts that
# were split off into kg_val earlier -- confirm that this is intended.
kg_train = complete_KG

# Define the model and criterion
model = ComplExModel(emb_dim, kg_train.n_ent, kg_train.n_rel)
criterion = MarginLoss(margin)

# Move everything to CUDA if available
if cuda.is_available():
    cuda.empty_cache()
    model.cuda()
    criterion.cuda()

# Device the model parameters live on; every batch is moved there before the
# forward pass, otherwise CPU index tensors meet CUDA embeddings and the call fails.
device = next(model.parameters()).device

# Define the torch optimizer to be used
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

sampler = BernoulliNegativeSampler(kg_train) #UniformNegativeSampler(kg_train, kg_val=kg_val, kg_test=kg_test)
dataloader = KGDataLoader(kg_train, batch_size=b_size)

iterator = tqdm(range(n_epochs), unit='epoch')
for epoch in iterator:
    running_loss = 0.0
    for batch in dataloader:
        h, t, r = batch[0], batch[1], batch[2]
        # Negative sampling happens on the loader's device; only afterwards are
        # the positive and corrupted indices moved to the model's device.
        n_h, n_t = sampler.corrupt_batch(h, t, r)
        h, t, r, n_h, n_t = (idx.to(device) for idx in (h, t, r, n_h, n_t))

        optimizer.zero_grad()

        # forward + backward + optimize
        pos, neg = model(h, t, r, n_h, n_t)
        loss = criterion(pos, neg)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    iterator.set_description(
        'Epoch {} | mean loss: {:.5f}'.format(epoch + 1,
                                              running_loss / len(dataloader)))
    model.normalize_parameters()
model.normalize_parameters()

# Context manager so the pickle is flushed and the handle closed right away.
with open('results/bilinear/model.pkl', 'wb') as fh:
    pickle.dump(model, fh)

Epoch 1000 | mean loss: 0.47335: 100%|██████████| 1000/1000 [00:54<00:00, 18.26epoch/s]


In [39]:
############################################################
# Model Training ConvKB
############################################################
# The imports cell binds the name `DataLoader` twice (torchkge.utils first, then
# torch_geometric.data), so the bare name refers to the torch_geometric class.
# Import the torchkge loader under an unambiguous alias for this cell.
from torchkge.utils import DataLoader as KGDataLoader

# Define some hyper-parameters for training
emb_dim = 10
n_filters = 12  # second positional argument of ConvKBModel (was an inline literal)
lr = 0.0004
n_epochs = 1000
b_size = 128
margin = 0.5

# NOTE(review): this trains on the complete graph, i.e. including the facts that
# were split off into kg_val earlier -- confirm that this is intended.
kg_train = complete_KG

# Define the model and criterion
model = ConvKBModel(emb_dim, n_filters, kg_train.n_ent, kg_train.n_rel)
criterion = MarginLoss(margin)

# Move everything to CUDA if available
if cuda.is_available():
    cuda.empty_cache()
    model.cuda()
    criterion.cuda()

# Device the model parameters live on; every batch is moved there before the
# forward pass, otherwise CPU index tensors meet CUDA embeddings and the call fails.
device = next(model.parameters()).device

# Define the torch optimizer to be used
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

sampler = BernoulliNegativeSampler(kg_train) #UniformNegativeSampler(kg_train, kg_val=kg_val, kg_test=kg_test)
dataloader = KGDataLoader(kg_train, batch_size=b_size)

iterator = tqdm(range(n_epochs), unit='epoch')
for epoch in iterator:
    running_loss = 0.0
    for batch in dataloader:
        h, t, r = batch[0], batch[1], batch[2]
        # Negative sampling happens on the loader's device; only afterwards are
        # the positive and corrupted indices moved to the model's device.
        n_h, n_t = sampler.corrupt_batch(h, t, r)
        h, t, r, n_h, n_t = (idx.to(device) for idx in (h, t, r, n_h, n_t))

        optimizer.zero_grad()

        # forward + backward + optimize
        pos, neg = model(h, t, r, n_h, n_t)
        loss = criterion(pos, neg)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    iterator.set_description(
        'Epoch {} | mean loss: {:.5f}'.format(epoch + 1,
                                              running_loss / len(dataloader)))
    model.normalize_parameters()
model.normalize_parameters()

# Context manager so the pickle is flushed and the handle closed right away.
with open('results/deep/model.pkl', 'wb') as fh:
    pickle.dump(model, fh)

Epoch 1000 | mean loss: 5.29200: 100%|██████████| 1000/1000 [01:57<00:00,  8.50epoch/s]
