We downloaded [DBpedia](https://www.dbpedia.org/blog/dbpedia-snapshot-2022-09-release/) and saved it in Parquet format (dbpedia_09_2022.parquet). We also downloaded [Wikidata](https://dumps.wikimedia.org/wikidatawiki/) and saved it as data.txt.

In [None]:
# Convert the DBpedia snapshot from Parquet to a tab-separated text file.
from fastparquet import ParquetFile
filename = "dbpedia_09_2022.parquet"
pf = ParquetFile(filename)
# Materialise the Parquet file as a DataFrame so it can be dumped with to_csv.
df = pf.to_pandas()
# Tab-separated, without the index column. No header option is passed, so
# presumably the first output line holds the column names (the later sed cell
# removes such a line) -- confirm against the pandas defaults.
df.to_csv("dbpedia_09_2022.txt",sep='\t',index=False)
print("OK")

In [None]:
# Paths used to split the raw DBpedia dump.
file_rel_DB ='dbpedia_09_2022.txt' # dbpedia initial file
file_rel_write_sameas = 'sameas.txt' # file to write and store sameAs links
file_rel_DB_without_sameas ='dbpedia_09_2022_clean.txt' # to store well-formatted triples in dbpedia

# Module-level output handles that read_triple_dbp_raw writes to.
# Mode 'w' truncates any existing file of the same name.
dbpedia_sameas_file = open(file_rel_write_sameas, 'w', encoding='utf-8')
dbpedia_clean_file = open(file_rel_DB_without_sameas, 'w', encoding='utf-8')

def read_triple_dbp_raw(file_path, *, sameas_out=None, clean_out=None):
    """Split a raw DBpedia dump into Wikidata sameAs links and other triples.

    Every line of ``file_path`` is split on whitespace. Lines that do not
    yield exactly three tokens are skipped. Surrounding angle brackets are
    removed from each token, and the triple is re-emitted as four
    tab-separated fields: ``<s>``, ``<p>``, ``<o>`` and a final ``.``.

    * A triple whose predicate contains ``owl#sameAs`` and whose object
      contains ``http://www.wikidata.org/entity`` goes to ``sameas_out``.
    * A triple whose predicate does not contain ``owl#sameAs`` goes to
      ``clean_out``.
    * Any other sameAs triple (pointing somewhere else than Wikidata) is
      dropped.

    Finally the number of sameAs links written and the number of clean
    triples written are printed, in that order.

    Args:
        file_path: Path of the raw, UTF-8 encoded triple file.
        sameas_out: Writable text handle for the sameAs links. Defaults to
            the module-level ``dbpedia_sameas_file``.
        clean_out: Writable text handle for the remaining triples. Defaults
            to the module-level ``dbpedia_clean_file``.
    """
    same_as = 'http://www.w3.org/2002/07/owl#sameAs'
    wikidata_entity = 'http://www.wikidata.org/entity'

    # Fall back to the notebook-level handles so existing calls keep working.
    if sameas_out is None:
        sameas_out = dbpedia_sameas_file
    if clean_out is None:
        clean_out = dbpedia_clean_file

    num_sameas = 0
    num_clean = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip('\n').split()
            if len(tokens) != 3:
                continue
            s, p, o = (token.lstrip('<').rstrip('>') for token in tokens)
            if same_as not in p:
                num_clean += 1
                clean_out.write(f'<{s}>\t<{p}>\t<{o}>\t.\n')
            elif wikidata_entity in o:
                num_sameas += 1
                sameas_out.write(f'<{s}>\t<{p}>\t<{o}>\t.\n')

    print(num_sameas)
    print(num_clean)
                

            
try:
    read_triple_dbp_raw(file_rel_DB)
finally:
    # Closing flushes the buffered output. Without it, later steps that read
    # or edit sameas.txt / dbpedia_09_2022_clean.txt may see truncated files.
    dbpedia_sameas_file.close()
    dbpedia_clean_file.close()

In [None]:
# Delete the 1st line of dbpedia_09_2022_clean.txt because it contains the column names, i.e., <subject> <relation> <object>, rather than a triple.
sed -i '1d' dbpedia_09_2022_clean.txt

Define functions to merge DBpedia and Wikidata 

In [None]:
# Inputs of the merge step.
file_rel_in_WIKI = 'data.txt' # relation triples in wikidata
file_rel_in_DB = 'dbpedia_09_2022_clean.txt' # well formatted triples in dbpedia
sameas_links = 'sameas.txt' # sameAs links
# Output of the merge step. NOTE: the file is opened in append mode ('a'), so
# running this step again adds to whatever mergeentities.txt already contains.
filewrite = open('mergeentities.txt', 'a', encoding='utf-8') # file to store the fusion of dbpedia and wikidata triples via sameAs links; we replace matching
# entity names by their corresponding names in dbpedia

# Read the sameAs links and turn them into a Wikidata -> DBpedia lookup table.
def read_sameAs_and_build_dict(file_path):
    """Return a dict mapping Wikidata entity URIs to DBpedia entity URIs.

    Each line is expected to look like ``<dbpedia> <sameAs> <wikidata> .``.
    The trailing tab/dot/newline characters are stripped, the rest is split
    on whitespace, and lines that do not yield exactly three tokens are
    ignored. When a Wikidata URI occurs more than once, the last DBpedia URI
    seen for it is kept.
    """
    wiki_to_dbp = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.strip('\t.\n').split()
            if len(fields) == 3:
                # First token is the DBpedia side, last one the Wikidata side.
                dbp_uri, _, wiki_uri = (
                    field.lstrip('<').rstrip('>') for field in fields
                )
                wiki_to_dbp[wiki_uri] = dbp_uri
    return wiki_to_dbp


def replace_wikidata_labels_by_corr_dbpedia_names(file_path, mapping, *, out=None):
    """Append Wikidata triples to the merge file, renaming matched entities.

    Each line of ``file_path`` is stripped of trailing tab/dot/newline
    characters and split on whitespace. Lines that do not yield exactly three
    tokens are skipped. For every remaining triple:

    * If the subject or the object is a key of ``mapping``, each of the two
      is rewritten. A mapped entity becomes its DBpedia name with the
      ``http://dbpedia.org/`` prefix replaced by ``http://embedding.cc/``.
      An unmapped one keeps its name with the ``http://www.wikidata.org/``
      prefix replaced by ``http://embedding.cc/``.
    * Otherwise only the ``http://dbpedia.org/`` prefix is replaced in
      subject and object.

    The predicate is never rewritten. Each triple is written as four
    tab-separated fields: ``<s>``, ``<p>``, ``<o>`` and a final ``.``.

    Args:
        file_path: Path of the UTF-8 encoded Wikidata triple file.
        mapping: Dict from Wikidata entity URI to DBpedia entity URI.
        out: Writable text handle that receives the triples. Defaults to the
            module-level ``filewrite``.
    """
    wikidata_prefix = "http://www.wikidata.org/"
    dbpedia_prefix = "http://dbpedia.org/"
    merged_prefix = "http://embedding.cc/"

    # Fall back to the notebook-level handle so existing calls keep working.
    if out is None:
        out = filewrite

    def rename(term):
        """Return the merged-KG name of ``term`` for a triple with a match."""
        mapped = mapping.get(term)
        if mapped is None:
            return term.replace(wikidata_prefix, merged_prefix)
        return str(mapped).replace(dbpedia_prefix, merged_prefix)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.strip('\t.\n').split()
            if len(tokens) != 3:
                continue
            s, p, o = (token.lstrip('<').rstrip('>') for token in tokens)
            if s in mapping or o in mapping:
                subject = rename(s)
                objectt = rename(o)
            else:
                # NOTE(review): only the DBpedia prefix is replaced here, so
                # Wikidata URIs pass through unchanged in this branch, unlike
                # unmatched entities in the branch above. Kept as is; confirm
                # whether this is intended.
                subject = s.replace(dbpedia_prefix, merged_prefix)
                objectt = o.replace(dbpedia_prefix, merged_prefix)
            out.write(f'<{subject}>\t<{p}>\t<{objectt}>\t.\n')

First, write the DBpedia triples into the merge file.

In [None]:
def read_write_dbpedia_triples_to_merge_kg(dbpedia_file):
    """Copy every line of ``dbpedia_file`` unchanged into the merge file."""
    with open(dbpedia_file, 'r', encoding='utf-8') as source:
        # The file object yields its lines one by one; each is written as is.
        filewrite.writelines(source)

In [None]:
# First write the DBpedia triples into the merge file. The Wikidata triples are added below using Algorithm 1 of our paper.
read_write_dbpedia_triples_to_merge_kg(file_rel_in_DB)

Now build the mapping dictionary from the sameAs links, then add the Wikidata triples to the merge file using Algorithm 1.

In [None]:
# Map each Wikidata entity URI to its DBpedia counterpart using the sameAs links.
wikidata_to_dbpedia = read_sameAs_and_build_dict(sameas_links) # sameAs links
# Write the Wikidata triples, with matched entities renamed, to the merge file.
replace_wikidata_labels_by_corr_dbpedia_names(file_rel_in_WIKI, wikidata_to_dbpedia) # add wikidata triples
# Close the merge file so that everything written to it is flushed to disk.
filewrite.close()

Data statistics and average degree in KGs

In [None]:
# Count the triples stored in a KG file.
def readline_count(file_name):
    """Print how many lines of ``file_name`` hold exactly three tokens.

    Trailing tab/dot/newline characters are stripped from each line before it
    is split on whitespace. Lines with any other number of tokens are not
    counted.
    """
    with open(file_name, 'r', encoding='utf-8') as handle:
        total = sum(
            1 for raw in handle if len(raw.strip('\t.\n').split()) == 3
        )
    print(total)
# Print the number of three-token triple lines in the merged KG file.
readline_count('mergeentities.txt')

In [None]:
#compute the average degree
from collections import defaultdict
def degree(file_name):
    """Count, for every entity, the number of triple slots it occupies.

    Trailing tab/dot/newline characters are stripped from each line before it
    is split on whitespace. Lines that do not yield exactly three tokens
    (blank or malformed lines) are skipped, as in the other readers of this
    file, instead of raising ``IndexError``. The subject and the object of a
    triple each add one to the count of the corresponding entity.

    Args:
        file_name: Path of the UTF-8 encoded triple file.

    Returns:
        A ``defaultdict`` mapping entity name (angle brackets removed) to its
        count; entities that were never seen default to 0.
    """
    kg_degree = defaultdict(int)
    # Explicit encoding: the KG files are written as UTF-8 by the merge step.
    with open(file_name, 'r', encoding='utf-8') as file:
        for triple in file:
            fields = triple.strip('\t.\n').split()
            if len(fields) != 3:
                continue
            e1 = fields[0].lstrip('<').rstrip('>')
            e2 = fields[2].lstrip('<').rstrip('>')
            kg_degree[e1] += 1
            kg_degree[e2] += 1
    return kg_degree

import numpy as np
# Per-entity counts for one KG file.
degrees = degree("mergeentities.txt") ## Replace by KG file name here. In our case, you will do it for 3 KGs: mergeentities.txt, dbpedia_09_2022_clean.txt, data.txt (wikidata kg)
# Average of the per-entity counts.
print("Avg. degree:", np.array(list(degrees.values())).mean())

In [None]:
def kg_size(file_name):
    """Print the number of distinct entities and relations in a KG file.

    Trailing tab/dot/newline characters are stripped from each line before it
    is split on whitespace. Lines that do not yield exactly three tokens
    (blank or malformed lines) are skipped, as in the other readers of this
    file, instead of raising ``IndexError``. Subjects and objects are counted
    together as entities; predicates are counted as relations.

    Args:
        file_name: Path of the UTF-8 encoded triple file.
    """
    entities = set()
    relations = set()
    # Explicit encoding: the KG files are written as UTF-8 by the merge step.
    with open(file_name, 'r', encoding='utf-8') as file:
        for triple in file:
            fields = triple.strip('\t.\n').split()
            if len(fields) != 3:
                continue
            e1, r, e2 = (field.lstrip('<').rstrip('>') for field in fields)
            entities.update((e1, e2))
            relations.add(r)
    print(f"#Entities: {len(entities)}, #Relations: {len(relations)}")
# Print entity/relation counts for the Wikidata KG and for the merged KG.
kg_size('data.txt')
kg_size('mergeentities.txt')