In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split


## Generate ID1 and TD2 datasets

In [None]:
!unzip spo_triples_iter1.zip

Archive:  spo_triples_iter1.zip
  inflating: spo_triples_iter1.csv   


In [None]:
# load all matched s-p-o triples
# read_csv is handed the .zip path itself; pandas infers the compression from
# the file extension, so the archive is parsed directly.
data = pd.read_csv('spo_triples_iter1.zip')
# preview the first rows (subject / predicate / object columns)
data.head()

Unnamed: 0,subject,predicate,object
0,wkg:656260805,wkgs:addrPlace,wkg:1586413209
1,wkg:8423662676,wkgs:addrPlace,wkg:362658024
2,wkg:3722193229,wkgs:addrPlace,wkg:3009719004
3,wkg:2995637431,wkgs:addrPlace,wkg:3009701150
4,wkg:4063046696,wkgs:addrPlace,wkg:31345707


In [None]:
# (rows, columns) of the raw triple table
data.shape

(392730, 3)

In [None]:
# number of triples per predicate, most frequent first
data['predicate'].value_counts()

wkgs:isInCountry        247521
wkgs:isInContinent       73965
wkgs:isIn                31935
wkgs:addrPlace           22186
wkgs:country              7460
wkgs:addrState            3104
wkgs:isInCounty           3066
wkgs:addrSuburb           2284
wkgs:addrProvince          797
wkgs:capitalCity           166
wkgs:addrSubdistrict       104
wkgs:addrCountry            52
wkgs:addrDistrict           48
wkgs:addrHamlet             38
Name: predicate, dtype: int64

In [None]:
# How often each object entity occurs among the wkgs:isInContinent triples
data.loc[data['predicate'] == 'wkgs:isInContinent', 'object'].value_counts()

wkg:36966065     46397
wkg:36966057     17592
wkg:36966069      9641
wkg:36966063       296
wkg:36966060        22
wkg:249399679       17
Name: object, dtype: int64

## WikiData matched triples

In [None]:
# load the Wikidata-matched triples (read straight from the zip archive)
wd_data = pd.read_csv('wikidata_matched_triples.zip')
# (rows, columns)
wd_data.shape

(172394, 3)

In [None]:
# display the frame (the rendered output is a truncated head/tail view)
wd_data

Unnamed: 0,subject,predicate,object
0,wkg:6595803911,wkgs:addrCountry,wkg:424317935
1,wkg:663103163,wkgs:addrCountry,wkg:424317935
2,wkg:1438239658,wkgs:addrCountry,wkg:424314830
3,wkg:1438239675,wkgs:addrCountry,wkg:424314830
4,wkg:1438239686,wkgs:addrCountry,wkg:424314830
...,...,...,...
172389,wkg:594445489,wkgs:isIn,wkg:304951006
172390,wkg:594664488,wkgs:isIn,wkg:304951006
172391,wkg:59628055,wkgs:isIn,wkg:304951009
172392,wkg:59633024,wkgs:isIn,wkg:304951009


In [None]:
# number of Wikidata-matched triples per predicate, most frequent first
wd_data['predicate'].value_counts()

wkgs:addrCountry      101171
wkgs:isInCountry       47312
wkgs:isIn              20101
wkgs:country            2877
wkgs:addrPlace           449
wkgs:isInContinent       261
wkgs:capitalCity         223
Name: predicate, dtype: int64

## Merging WKG and Wikidata to create stratified split

In [None]:
# load the WorldKG triples from the "cleaned" archive
# NOTE(review): how this file was derived from spo_triples_iter1 is not shown
# in this notebook.
wkg_data = pd.read_csv('spo_triples_iter1_cleaned.zip')
# (rows, columns)
wkg_data.shape

(392726, 3)

In [None]:
# number of cleaned WorldKG triples per predicate, most frequent first
wkg_data['predicate'].value_counts()

wkgs:isInCountry        247521
wkgs:isInContinent       73965
wkgs:isIn                31935
wkgs:addrPlace           22186
wkgs:country              7460
wkgs:addrState            3104
wkgs:isInCounty           3066
wkgs:addrSuburb           2284
wkgs:addrProvince          797
wkgs:capitalCity           166
wkgs:addrSubdistrict       104
wkgs:addrCountry            52
wkgs:addrDistrict           48
wkgs:addrHamlet             38
Name: predicate, dtype: int64

In [None]:
# NOTE(review): same file as the earlier `wd_data` load; this rebinds
# `wd_data` to a fresh read of that archive.
wd_data = pd.read_csv('wikidata_matched_triples.zip')
# (rows, columns)
wd_data.shape

(172394, 3)

In [None]:
# number of Wikidata-matched triples per predicate, most frequent first
wd_data['predicate'].value_counts()

wkgs:addrCountry      101171
wkgs:isInCountry       47312
wkgs:isIn              20101
wkgs:country            2877
wkgs:addrPlace           449
wkgs:isInContinent       261
wkgs:capitalCity         223
Name: predicate, dtype: int64

In [None]:
# Share of the combined data taken by the Wikidata-only triples.
# 392726 equals the wkg_data row count and 130402 equals the unique_wd_data
# row count reported further down in this notebook.
# NOTE(review): both numbers are hard-coded, so this ratio does not follow the
# data if the inputs change.
130402/(392726 + 130402)

0.24927360034255477

In [None]:
# Left-join the Wikidata triples onto the de-duplicated WorldKG triples.
# indicator=True adds a `_merge` column; rows marked 'left_only' are Wikidata
# triples with no identical WorldKG triple.
wd_left_join_result = pd.merge(
    wd_data,
    wkg_data.drop_duplicates(),
    how='left',
    on=['subject', 'predicate', 'object'],
    indicator=True,
)

In [None]:
# triples in wikidata match which are not present in worldkg
# Select from the merge result itself instead of masking `wd_data` with a
# Series built from a different frame: that only lines up while both frames
# happen to share the same index. Only wd_data's own columns are kept, which
# drops the helper `_merge` column.
unique_wd_data = wd_left_join_result.loc[
    wd_left_join_result['_merge'] == 'left_only',
    wd_data.columns,
]
unique_wd_data.shape

(130402, 3)

In [None]:
# per-predicate counts of the Wikidata-only triples
unique_wd_data['predicate'].value_counts()

wkgs:addrCountry      101171
wkgs:isIn              20068
wkgs:isInCountry        5750
wkgs:country            2843
wkgs:addrPlace           448
wkgs:capitalCity          84
wkgs:isInContinent        38
Name: predicate, dtype: int64

In [None]:
# Split the Wikidata-only addrCountry triples 90/10 with a fixed seed.
# The 10% share (wd_addrCountry_wkg_merge) is the part later added to the
# training/validation pool; the 90% share stays on the Wikidata side.
wd_addrCountry_rows = unique_wd_data.loc[unique_wd_data['predicate'] == 'wkgs:addrCountry']

wd_remaining_addrCountry, wd_addrCountry_wkg_merge = train_test_split(
    wd_addrCountry_rows,
    random_state=0,
    test_size=0.1,
)

In [None]:
# All Wikidata-only triples except addrCountry ...
not_wd_addrCountry_rows = unique_wd_data[unique_wd_data['predicate'] != 'wkgs:addrCountry']

# ... plus the 90% addrCountry remainder (wd_remaining_addrCountry).
# ignore_index=True already yields a fresh 0..n-1 index, so no additional
# reset_index(drop=True) copy is needed just to display the frame.
unique_wd_data_2 = pd.concat([not_wd_addrCountry_rows, wd_remaining_addrCountry], ignore_index = True)
unique_wd_data_2

Unnamed: 0,subject,predicate,object
0,wkg:8727785383,wkgs:country,wkg:424317935
1,wkg:8728315675,wkgs:country,wkg:424317935
2,wkg:8728472554,wkgs:country,wkg:424317935
3,wkg:8728472555,wkgs:country,wkg:424317935
4,wkg:8728700807,wkgs:country,wkg:424313760
...,...,...,...
120279,wkg:1135414237,wkgs:addrCountry,wkg:424314830
120280,wkg:1197106724,wkgs:addrCountry,wkg:424314830
120281,wkg:945163900,wkgs:addrCountry,wkg:424314830
120282,wkg:961947169,wkgs:addrCountry,wkg:424314830


In [None]:
# (rows, columns) after removing the 10% addrCountry hold-out
unique_wd_data_2.shape

(120284, 3)

In [None]:
# per-predicate counts of the remaining Wikidata-only triples
unique_wd_data_2['predicate'].value_counts()

wkgs:addrCountry      91053
wkgs:isIn             20068
wkgs:isInCountry       5750
wkgs:country           2843
wkgs:addrPlace          448
wkgs:capitalCity         84
wkgs:isInContinent       38
Name: predicate, dtype: int64

In [None]:
# size of the 10% addrCountry hold-out
wd_addrCountry_wkg_merge.shape

(10118, 3)

In [None]:
# (rows, columns) of the cleaned WorldKG triples
wkg_data.shape

(392726, 3)

In [None]:
# WorldKG rows whose predicate is NOT one of the five listed here are split
# 88/12, stratified by predicate, with a fixed seed. The 12% share
# (wd_merge_predicates) is later concatenated into the Wikidata-side frame.
wkg_data_new_predicates = wkg_data.loc[
    ~wkg_data['predicate'].isin([
        'wkgs:addrCountry',
        'wkgs:isIn',
        'wkgs:country',
        'wkgs:addrPlace',
        'wkgs:capitalCity',
    ])
]

wkg_remaining_predicates, wd_merge_predicates = train_test_split(
    wkg_data_new_predicates,
    test_size=0.12,
    random_state=0,
    stratify=wkg_data_new_predicates['predicate'],
)

In [None]:
# WorldKG rows carrying one of these five predicates; they are kept whole
# (not split) when the WorldKG-side frame is assembled.
wkg_data_old_predicates = wkg_data.loc[
    wkg_data['predicate'].isin([
        'wkgs:addrCountry',
        'wkgs:isIn',
        'wkgs:country',
        'wkgs:addrPlace',
        'wkgs:capitalCity',
    ])
]

In [None]:
# per-predicate counts of the 88% share that stays on the WorldKG side
wkg_remaining_predicates['predicate'].value_counts()

wkgs:isInCountry        217818
wkgs:isInContinent       65089
wkgs:addrState            2732
wkgs:isInCounty           2698
wkgs:addrSuburb           2010
wkgs:addrProvince          701
wkgs:addrSubdistrict        92
wkgs:addrDistrict           42
wkgs:addrHamlet             33
Name: predicate, dtype: int64

In [None]:
# Final merging of the respective predicates in wikidata and wkg.
# WorldKG side: the 88% share of the split predicates, all rows of the five
# unsplit predicates, and the 10% Wikidata addrCountry hold-out.
# ignore_index=True already gives a fresh 0..n-1 index, so the stand-alone
# reset_index(drop=True) call (whose result was discarded) is not needed.
wkg_final_data = pd.concat([wkg_remaining_predicates, wkg_data_old_predicates, wd_addrCountry_wkg_merge], ignore_index = True)

# Wikidata side: the 12% WorldKG share plus the remaining Wikidata-only triples.
wd_final_data = pd.concat([wd_merge_predicates, unique_wd_data_2], ignore_index = True)
wd_final_data

Unnamed: 0,subject,predicate,object
0,wkg:4426850370,wkgs:isInCountry,wkg:424298311
1,wkg:1923575883,wkgs:isInContinent,wkg:36966065
2,wkg:1308683858,wkgs:isInCountry,wkg:6677037562
3,wkg:1308609726,wkgs:isInCountry,wkg:6677037562
4,wkg:3068656670,wkgs:isInCountry,wkg:424313730
...,...,...,...
159991,wkg:1135414237,wkgs:addrCountry,wkg:424314830
159992,wkg:1197106724,wkgs:addrCountry,wkg:424314830
159993,wkg:945163900,wkgs:addrCountry,wkg:424314830
159994,wkg:961947169,wkgs:addrCountry,wkg:424314830


In [None]:
# (rows, columns) of the WorldKG-side frame (later split into train/validation)
wkg_final_data.shape

(363132, 3)

In [None]:
# (rows, columns) of the Wikidata-side frame (later used as the test set)
wd_final_data.shape

(159996, 3)

In [None]:
# per-predicate counts in the Wikidata-side frame
wd_final_data['predicate'].value_counts()

wkgs:addrCountry        91053
wkgs:isInCountry        35453
wkgs:isIn               20068
wkgs:isInContinent       8914
wkgs:country             2843
wkgs:addrPlace            448
wkgs:addrState            372
wkgs:isInCounty           368
wkgs:addrSuburb           274
wkgs:addrProvince          96
wkgs:capitalCity           84
wkgs:addrSubdistrict       12
wkgs:addrDistrict           6
wkgs:addrHamlet             5
Name: predicate, dtype: int64

In [None]:
# per-predicate counts in the WorldKG-side frame
wkg_final_data['predicate'].value_counts()

wkgs:isInCountry        217818
wkgs:isInContinent       65089
wkgs:isIn                31935
wkgs:addrPlace           22186
wkgs:addrCountry         10170
wkgs:country              7460
wkgs:addrState            2732
wkgs:isInCounty           2698
wkgs:addrSuburb           2010
wkgs:addrProvince          701
wkgs:capitalCity           166
wkgs:addrSubdistrict        92
wkgs:addrDistrict           42
wkgs:addrHamlet             33
Name: predicate, dtype: int64

In [None]:
# object distribution of the addrHamlet triples on the WorldKG side
wkg_final_data.loc[wkg_final_data['predicate'] == 'wkgs:addrHamlet', 'object'].value_counts()

wkg:1656941148    7
wkg:332919393     5
wkg:332919807     4
wkg:338530778     4
wkg:2250235979    2
wkg:5647390271    2
wkg:1988949569    1
wkg:2813257804    1
wkg:278747172     1
wkg:1727107318    1
wkg:3478883571    1
wkg:845838237     1
wkg:598328860     1
wkg:3424289501    1
wkg:6113866143    1
Name: object, dtype: int64

In [None]:
# object distribution of the addrHamlet triples on the Wikidata side
wd_final_data.loc[wd_final_data['predicate'] == 'wkgs:addrHamlet', 'object'].value_counts()

wkg:338530778     1
wkg:1656941148    1
wkg:1727107318    1
wkg:5647390271    1
wkg:3255267706    1
Name: object, dtype: int64

In [None]:
# Carve an 11% validation set out of the WorldKG-side frame, stratified by
# predicate and with a fixed seed.
stratify_values = wkg_final_data['predicate']
train_triples, validation_triples = train_test_split(
    wkg_final_data,
    test_size=0.11,
    random_state=0,
    stratify=stratify_values,
)

In [None]:
# Display only: reset_index returns a new frame that is not assigned, so
# train_triples itself keeps the index it received from the split.
train_triples.reset_index(drop = True)

Unnamed: 0,subject,predicate,object
0,wkg:501484122,wkgs:isInCountry,wkg:424316663
1,wkg:1308600217,wkgs:isInCountry,wkg:6677037562
2,wkg:2016097408,wkgs:isInCountry,wkg:248120384
3,wkg:1070163307,wkgs:isInCountry,wkg:6677037562
4,wkg:309872497,wkgs:isIn,wkg:424298326
...,...,...,...
323182,wkg:1070187198,wkgs:isInCountry,wkg:6677037562
323183,wkg:501520502,wkgs:isInCountry,wkg:424316663
323184,wkg:1308321070,wkgs:isInCountry,wkg:6677037562
323185,wkg:1070158119,wkgs:isInCountry,wkg:6677037562


In [None]:
# Display only: reset_index returns a new frame that is not assigned, so
# validation_triples itself keeps the index it received from the split.
validation_triples.reset_index(drop = True)

Unnamed: 0,subject,predicate,object
0,wkg:275386081,wkgs:addrCountry,wkg:249399300
1,wkg:588051169,wkgs:isInCountry,wkg:432425064
2,wkg:1260516834,wkgs:isIn,wkg:424298326
3,wkg:3220274002,wkgs:isInCountry,wkg:424313730
4,wkg:243042169,wkgs:addrCountry,wkg:249399300
...,...,...,...
39940,wkg:931541074,wkgs:isInCountry,wkg:6677037562
39941,wkg:29621202,wkgs:isInCountry,wkg:1683325355
39942,wkg:501484841,wkgs:isInCountry,wkg:424316663
39943,wkg:501529793,wkgs:isInCountry,wkg:424316663


In [None]:
# test_triples is a second name for wd_final_data (plain assignment, no copy)
test_triples = wd_final_data
# display only; the frame returned by reset_index is not assigned
test_triples.reset_index(drop = True)

Unnamed: 0,subject,predicate,object
0,wkg:4426850370,wkgs:isInCountry,wkg:424298311
1,wkg:1923575883,wkgs:isInContinent,wkg:36966065
2,wkg:1308683858,wkgs:isInCountry,wkg:6677037562
3,wkg:1308609726,wkgs:isInCountry,wkg:6677037562
4,wkg:3068656670,wkgs:isInCountry,wkg:424313730
...,...,...,...
159991,wkg:1135414237,wkgs:addrCountry,wkg:424314830
159992,wkg:1197106724,wkgs:addrCountry,wkg:424314830
159993,wkg:945163900,wkgs:addrCountry,wkg:424314830
159994,wkg:961947169,wkgs:addrCountry,wkg:424314830


In [None]:
# Sanity check: train and test must not share any triple.
# Left-join test onto the de-duplicated training triples; a `_merge` value of
# 'both' would mark a test triple that also exists in training.
left_join_result = pd.merge(
    test_triples,
    train_triples.drop_duplicates(),
    how='left',
    on=['subject', 'predicate', 'object'],
    indicator=True,
)

# rows shown here (if any) are the overlapping triples
test_triples.loc[left_join_result['_merge'] == 'both']

Unnamed: 0,subject,predicate,object


In [None]:
# Sanity check: validation and test must not share any triple.
# Left-join test onto the de-duplicated validation triples; a `_merge` value
# of 'both' would mark a test triple that also exists in validation.
left_join_result = pd.merge(
    test_triples,
    validation_triples.drop_duplicates(),
    how='left',
    on=['subject', 'predicate', 'object'],
    indicator=True,
)

# rows shown here (if any) are the overlapping triples
test_triples.loc[left_join_result['_merge'] == 'both']

Unnamed: 0,subject,predicate,object


In [None]:
# save final triples as headerless, tab-separated files without the index
# NOTE(review): 'train_.txt' has a trailing underscore unlike the other two
# file names - confirm this is the name downstream loaders expect.
for split_frame, out_name in (
    (train_triples, 'train_.txt'),
    (test_triples, 'test.txt'),
    (validation_triples, 'valid.txt'),
):
    split_frame.to_csv(out_name, sep='\t', index=False, header=False)

## Generate TD2 dataset

In [None]:
# Distinct entity ids that occur as subject or object in the training triples,
# together with how often each one occurs.
training_entities, train_entities_counts = np.unique(
    train_triples[['subject', 'object']].to_numpy().ravel(),
    return_counts=True,
)

In [None]:
# number of distinct training entities
training_entities.shape

(284355,)

In [None]:
# test filtered triples: keep a test triple only when both its subject and
# its object also occur among the training entities
test_filtered_triples = (
    test_triples[
        test_triples['subject'].isin(training_entities)
        & test_triples['object'].isin(training_entities)
    ]
    .reset_index(drop=True)
)
test_filtered_triples.shape

(16489, 3)

In [None]:
# val filtered triples: keep a validation triple only when both its subject
# and its object also occur among the training entities
validation_filtered_triples = (
    validation_triples[
        validation_triples['subject'].isin(training_entities)
        & validation_triples['object'].isin(training_entities)
    ]
    .reset_index(drop=True)
)
validation_filtered_triples.shape

(10497, 3)

In [None]:
# write the filtered (TD2) validation and test triples as headerless,
# tab-separated files without the index
for td2_frame, td2_path in (
    (validation_filtered_triples, 'TD2_valid_triples.txt'),
    (test_filtered_triples, 'TD2_test_triples.txt'),
):
    td2_frame.to_csv(td2_path, sep='\t', index=False, header=False)