# NELL-995 data preprocessing for KG-BERT
#### Written by: Elize BU

In [1]:
import csv
import numpy as np
import pandas as pd

In [2]:
# Load the training triples from the space-separated OpenKE file.
# Every field is kept as a string; the column names follow the file's
# field order as labelled here (head, tail, relation).
train = []
with open('./OpenKE_NELL-995/train2id.txt', 'r') as f:
    # Skip the first line (presumably a row-count header -- confirm against
    # the dataset). Iterating the handle streams the file instead of
    # materialising every line and then slicing a copy.
    next(f, None)
    for line in f:
        line = line.strip('\n')
        if not line:
            # A blank line (e.g. an extra newline at EOF) would otherwise
            # become a malformed, mostly-empty row in the DataFrame.
            continue
        train.append(line.split(' '))
train2id = pd.DataFrame(train, columns=['head_entity_id','tail_entity_id','relation_id'])
train2id

Unnamed: 0,head_entity_id,tail_entity_id,relation_id
0,0,1,0
1,2,3,1
2,4,3,1
3,4,5,2
4,4,6,2
...,...,...,...
149673,74425,74426,10
149674,74427,44747,10
149675,55394,19051,73
149676,74428,74429,10


In [3]:
# Load the validation triples from the space-separated OpenKE file.
# Every field is kept as a string; the column names follow the file's
# field order as labelled here (head, tail, relation).
val = []
with open('./OpenKE_NELL-995/valid2id.txt', 'r') as f:
    # Skip the first line (presumably a row-count header -- confirm against
    # the dataset). Iterating the handle streams the file instead of
    # materialising every line and then slicing a copy.
    next(f, None)
    for line in f:
        line = line.strip('\n')
        if not line:
            # A blank line (e.g. an extra newline at EOF) would otherwise
            # become a malformed, mostly-empty row in the DataFrame.
            continue
        val.append(line.split(' '))
valid2id = pd.DataFrame(val, columns=['head_entity_id','tail_entity_id','relation_id'])
valid2id

Unnamed: 0,head_entity_id,tail_entity_id,relation_id
0,8224,6036,96
1,55254,8405,155
2,7697,6426,97
3,7047,6076,97
4,8663,6033,91
...,...,...,...
538,6772,6380,99
539,74535,1552,25
540,53380,6072,91
541,34643,2294,25


In [4]:
# Load the test triples from the space-separated OpenKE file.
# Every field is kept as a string; the column names follow the file's
# field order as labelled here (head, tail, relation).
test = []
with open('./OpenKE_NELL-995/test2id.txt', 'r') as f:
    # Skip the first line (presumably a row-count header -- confirm against
    # the dataset). Iterating the handle streams the file instead of
    # materialising every line and then slicing a copy.
    next(f, None)
    for line in f:
        line = line.strip('\n')
        if not line:
            # A blank line (e.g. an extra newline at EOF) would otherwise
            # become a malformed, mostly-empty row in the DataFrame.
            continue
        test.append(line.split(' '))
test2id = pd.DataFrame(test, columns=['head_entity_id','tail_entity_id','relation_id'])
test2id

Unnamed: 0,head_entity_id,tail_entity_id,relation_id
0,13933,13932,27
1,10977,10975,27
2,11620,74536,27
3,20868,28290,27
4,74537,74538,27
...,...,...,...
3987,29716,3188,126
3988,20609,3188,126
3989,29280,14617,126
3990,14793,14521,126


In [5]:
# Load the relation vocabulary from the tab-separated OpenKE file.
# Each row is (relation name, id); both fields are kept as strings so the
# ids compare equal to the string ids read from the triple files.
rel = []
with open('./OpenKE_NELL-995/relation2id.txt', 'r') as f:
    # Skip the first line (presumably a row-count header -- confirm against
    # the dataset). Iterating the handle streams the file instead of
    # materialising every line and then slicing a copy.
    next(f, None)
    for line in f:
        line = line.strip('\n')
        if not line:
            # A blank line (e.g. an extra newline at EOF) would otherwise
            # become a malformed, mostly-empty row in the DataFrame.
            continue
        rel.append(line.split('\t'))
relation2id = pd.DataFrame(rel, columns=['relation','id'])
relation2id

Unnamed: 0,relation,id
0,concept:agriculturalproductgrowinginstateorpro...,62
1,concept:motherofperson,106
2,concept:teamalsoknownas,156
3,concept:proxyfor,1
4,concept:newspaperincity,151
...,...,...
195,concept:weaponmadeincountry,199
196,concept:specializationof,59
197,concept:thinghascolor,53
198,concept:professionusestool,171


In [6]:
# Substitute each relation_id in train2id with its relation name.
#
# One dictionary lookup per row replaces the nested loop over every
# (relation, triple) pair. Assigning the whole column also avoids chained
# indexing (`df[col][j] = value`), which pandas does not guarantee to write
# through to the original frame.
id_to_relation = {}
for rel_id, rel_name in zip(relation2id['id'], relation2id['relation']):
    # Keep the first name listed for an id, matching the top-to-bottom scan
    # of relation2id that the previous loop performed.
    id_to_relation.setdefault(rel_id, rel_name)

# Ids that have no entry in relation2id are left unchanged.
train2id['relation_id'] = train2id['relation_id'].map(
    lambda rel_id: id_to_relation.get(rel_id, rel_id)
)
train2id

Unnamed: 0,head_entity_id,tail_entity_id,relation_id
0,0,1,concept:academicprogramatuniversity
1,2,3,concept:proxyfor
2,4,3,concept:proxyfor
3,4,5,concept:atdate
4,4,6,concept:atdate
...,...,...,...
149673,74425,74426,concept:latitudelongitude
149674,74427,44747,concept:latitudelongitude
149675,55394,19051,concept:locationlocatedwithinlocation
149676,74428,74429,concept:latitudelongitude


In [7]:
# Substitute each relation_id in valid2id with its relation name.
#
# One dictionary lookup per row replaces the nested loop over every
# (relation, triple) pair. Assigning the whole column also avoids chained
# indexing (`df[col][j] = value`), which pandas does not guarantee to write
# through to the original frame.
id_to_relation = {}
for rel_id, rel_name in zip(relation2id['id'], relation2id['relation']):
    # Keep the first name listed for an id, matching the top-to-bottom scan
    # of relation2id that the previous loop performed.
    id_to_relation.setdefault(rel_id, rel_name)

# Ids that have no entry in relation2id are left unchanged.
valid2id['relation_id'] = valid2id['relation_id'].map(
    lambda rel_id: id_to_relation.get(rel_id, rel_id)
)

valid2id

Unnamed: 0,head_entity_id,tail_entity_id,relation_id
0,8224,6036,concept:athleteplaysinleague
1,55254,8405,concept:teamplaysinleague
2,7697,6426,concept:athleteplaysforteam
3,7047,6076,concept:athleteplaysforteam
4,8663,6033,concept:athleteplayssport
...,...,...,...
538,6772,6380,concept:athletehomestadium
539,74535,1552,concept:worksfor
540,53380,6072,concept:athleteplayssport
541,34643,2294,concept:worksfor


In [8]:
# Substitute each relation_id in test2id with its relation name.
#
# One dictionary lookup per row replaces the nested loop over every
# (relation, triple) pair. Assigning the whole column also avoids chained
# indexing (`df[col][j] = value`), which pandas does not guarantee to write
# through to the original frame.
id_to_relation = {}
for rel_id, rel_name in zip(relation2id['id'], relation2id['relation']):
    # Keep the first name listed for an id, matching the top-to-bottom scan
    # of relation2id that the previous loop performed.
    id_to_relation.setdefault(rel_id, rel_name)

# Ids that have no entry in relation2id are left unchanged.
test2id['relation_id'] = test2id['relation_id'].map(
    lambda rel_id: id_to_relation.get(rel_id, rel_id)
)
test2id

Unnamed: 0,head_entity_id,tail_entity_id,relation_id
0,13933,13932,concept:personleadsorganization
1,10977,10975,concept:personleadsorganization
2,11620,74536,concept:personleadsorganization
3,20868,28290,concept:personleadsorganization
4,74537,74538,concept:personleadsorganization
...,...,...,...
3987,29716,3188,concept:organizationheadquarteredincity
3988,20609,3188,concept:organizationheadquarteredincity
3989,29280,14617,concept:organizationheadquarteredincity
3990,14793,14521,concept:organizationheadquarteredincity


### train.tsv, dev.tsv, test.tsv

In [9]:
# Rearrange the columns of all three splits so the relation sits between
# the head and the tail entity: (head, relation, tail).
order = ['head_entity_id', 'relation_id', 'tail_entity_id']
train2id, valid2id, test2id = (
    frame[order] for frame in (train2id, valid2id, test2id)
)

In [10]:
# Write train.tsv: one triple per line, tab-separated, no header or index.

# The comma-separated copy is still produced so that the set of files this
# cell creates stays the same as before.
path='./output/train.csv'
train2id.to_csv(path,sep=',',index=False,header=False)

# Emit the TSV directly from the DataFrame. Re-reading the CSV text and
# replacing every ',' with a tab would also rewrite commas that belong to a
# field value and would carry CSV quoting characters over into the TSV.
train2id.to_csv('./output/train.tsv',sep='\t',index=False,header=False)

In [11]:
# Write dev.tsv: one triple per line, tab-separated, no header or index.

# The comma-separated copy is still produced so that the set of files this
# cell creates stays the same as before.
path='./output/dev.csv'
valid2id.to_csv(path,sep=',',index=False,header=False)

# Emit the TSV directly from the DataFrame. Re-reading the CSV text and
# replacing every ',' with a tab would also rewrite commas that belong to a
# field value and would carry CSV quoting characters over into the TSV.
valid2id.to_csv('./output/dev.tsv',sep='\t',index=False,header=False)

In [12]:
# Write test.tsv: one triple per line, tab-separated, no header or index.

# The comma-separated copy is still produced so that the set of files this
# cell creates stays the same as before.
path='./output/test.csv'
test2id.to_csv(path,sep=',',index=False,header=False)

# Emit the TSV directly from the DataFrame. Re-reading the CSV text and
# replacing every ',' with a tab would also rewrite commas that belong to a
# field value and would carry CSV quoting characters over into the TSV.
test2id.to_csv('./output/test.tsv',sep='\t',index=False,header=False)

### entities.txt

In [13]:
# Load the entity vocabulary from the tab-separated OpenKE file.
# Each row is (entity name, id); both fields are kept as strings.
ent = []
with open('./OpenKE_NELL-995/entity2id.txt', 'r') as f:
    # Skip the first line (presumably a row-count header -- confirm against
    # the dataset). Iterating the handle streams the file instead of
    # materialising every line and then slicing a copy.
    next(f, None)
    for line in f:
        line = line.strip('\n')
        if not line:
            # A blank line (e.g. an extra newline at EOF) would otherwise
            # become a malformed, mostly-empty row in the DataFrame.
            continue
        ent.append(line.split('\t'))
entity2id = pd.DataFrame(ent, columns=['entity','id'])
entity2id

Unnamed: 0,entity,id
0,concept_personcanada_matthew_bellamy,53990
1,concept_visualizablescene_ottawa_gatineau,70655
2,concept_physicalaction_formation,56854
3,concept_personcanada_johnny_carson,53948
4,concept_book_in_the_heart_of_the_country,73461
...,...,...
75487,concept_movie_united_93,1713
75488,concept_organization_faculty_of_law,49323
75489,concept_sportsteam_kennesaw_state_owls,55281
75490,concept_date_mid_june,47190


In [14]:
# Keep everything except the entity-name column; entity2id itself is not
# modified because drop() returns a new frame.
entities_id = entity2id.drop('entity', axis='columns')
entities_id

Unnamed: 0,id
0,53990
1,70655
2,56854
3,53948
4,73461
...,...
75487,1713
75488,49323
75489,55281
75490,47190


In [15]:
# Write entities.txt with one entity id per line.
# A plain text write is used instead of to_csv(sep='\n'): a line break is
# not a meaningful CSV field delimiter for a single-column file, and csv
# dialect validation in recent Python versions rejects it
# (NOTE(review): confirm against the Python version in use).
# UTF-8 matches the default encoding to_csv used for this file.
with open('./output/entities.txt', 'w', encoding='utf-8') as entities_file:
    entities_file.writelines(f'{entity_id}\n' for entity_id in entities_id['id'])

### relations.txt

In [16]:
# Display the relation table (columns: relation, id) for reference; the
# relation list written to relations.txt is taken from this frame.
relation2id

Unnamed: 0,relation,id
0,concept:agriculturalproductgrowinginstateorpro...,62
1,concept:motherofperson,106
2,concept:teamalsoknownas,156
3,concept:proxyfor,1
4,concept:newspaperincity,151
...,...,...
195,concept:weaponmadeincountry,199
196,concept:specializationof,59
197,concept:thinghascolor,53
198,concept:professionusestool,171


In [17]:
# Keep everything except the id column, leaving only the relation names;
# relation2id itself is not modified because drop() returns a new frame.
relations = relation2id.drop('id', axis='columns')
relations

Unnamed: 0,relation
0,concept:agriculturalproductgrowinginstateorpro...
1,concept:motherofperson
2,concept:teamalsoknownas
3,concept:proxyfor
4,concept:newspaperincity
...,...
195,concept:weaponmadeincountry
196,concept:specializationof
197,concept:thinghascolor
198,concept:professionusestool


In [18]:
# Write relations.txt with one relation name per line.
# A plain text write is used instead of to_csv(sep='\n'): a line break is
# not a meaningful CSV field delimiter for a single-column file, and csv
# dialect validation in recent Python versions rejects it
# (NOTE(review): confirm against the Python version in use).
# UTF-8 matches the default encoding to_csv used for this file.
with open('./output/relations.txt', 'w', encoding='utf-8') as relations_file:
    relations_file.writelines(f'{relation}\n' for relation in relations['relation'])

### entity2text.txt

In [19]:
# Display the entity table (columns: entity, id) for reference; the
# id/entity pairs written to entity2text.txt are taken from this frame.
entity2id

Unnamed: 0,entity,id
0,concept_personcanada_matthew_bellamy,53990
1,concept_visualizablescene_ottawa_gatineau,70655
2,concept_physicalaction_formation,56854
3,concept_personcanada_johnny_carson,53948
4,concept_book_in_the_heart_of_the_country,73461
...,...,...
75487,concept_movie_united_93,1713
75488,concept_organization_faculty_of_law,49323
75489,concept_sportsteam_kennesaw_state_owls,55281
75490,concept_date_mid_june,47190


In [20]:
# Put the id column ahead of the entity name: (id, entity).
order = ['id', 'entity']
entitytext = entity2id.loc[:, order]
entitytext

Unnamed: 0,id,entity
0,53990,concept_personcanada_matthew_bellamy
1,70655,concept_visualizablescene_ottawa_gatineau
2,56854,concept_physicalaction_formation
3,53948,concept_personcanada_johnny_carson
4,73461,concept_book_in_the_heart_of_the_country
...,...,...
75487,1713,concept_movie_united_93
75488,49323,concept_organization_faculty_of_law
75489,55281,concept_sportsteam_kennesaw_state_owls
75490,47190,concept_date_mid_june


In [21]:
# Convert the (id, entity) frame into a plain list of [id, entity] rows.
entity_rows = entitytext.values
entitylist = entity_rows.tolist()

In [22]:
# Write entity2text.txt: one "<id>\t<entity>" pair per line.
# The `with` block guarantees the file is closed even if a row fails to
# format or write; each row is emitted with a single write call.
with open("./output/entity2text.txt", 'w') as file_write_obj:
    for entity in entitylist:
        entityid, entityname = entity[0], entity[1]
        file_write_obj.write(f'{entityid}\t{entityname}\n')