Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# FB15k-237

## Triple pre-processing


Install dependencies

In [None]:
!pip install -q wheel
!pip install -q git+https://github.com/graphcore-research/bess-kge.git

In [1]:
import pickle
import zipfile
import requests
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from besskge.dataset import KGDataset

Download dataset from source

In [2]:
# Directory that receives the extracted archive; created on first run.
data_path = Path("../data/fb15k-237/")
data_path.mkdir(parents=True, exist_ok=True)

# Fetch the FB15k-237 archive. The timeout keeps the cell from hanging
# indefinitely if the server stops responding.
res = requests.get(
    url="https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip",
    timeout=60,
)
# Fail with a clear HTTP error instead of handing an error page to ZipFile,
# which would otherwise surface as an opaque BadZipFile exception.
res.raise_for_status()

# The archive is unpacked straight from memory into data_path.
with zipfile.ZipFile(BytesIO(res.content)) as zip_f:
    zip_f.extractall(path=data_path)

In [3]:
# The raw files are tab-separated with no header row, so the column names
# (head label, relation label, tail label) are supplied explicitly.
column_names = ["h_label", "r_label", "t_label"]

# Read the three splits with identical parser settings, one DataFrame per file.
train_triples, valid_triples, test_triples = (
    pd.read_csv(
        data_path.joinpath(split_file),
        sep="\t",
        names=column_names,
        header=None,
    )
    for split_file in (
        "Release/train.txt",
        "Release/valid.txt",
        "Release/test.txt",
    )
)

In [4]:
# Collect the per-split DataFrames under their split names, then preview the train split.
df_triples = dict(train=train_triples, valid=valid_triples, test=test_triples)
df_triples["train"]

Unnamed: 0,h_label,r_label,t_label
0,/m/027rn,/location/country/form_of_government,/m/06cx9
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8
...,...,...,...
272110,/m/016mhd,/film/film/release_date_s./film/film_regional_...,/m/029j_
272111,/m/01g888,/music/genre/artists,/m/01vv126
272112,/m/0djvzd,/soccer/football_player/current_team./sports/s...,/m/02b16p
272113,/m/0gmblvq,/award/award_winning_work/awards_won./award/aw...,/m/07kjk7c


We can use `besskge.dataset.KGDataset.from_dataframe` to preprocess the triples and build the `KGDataset` directly from the DataFrames.

In [5]:
# Tell from_dataframe which DataFrame column holds the head, relation and tail labels.
label_columns = {
    "head_column": "h_label",
    "relation_column": "r_label",
    "tail_column": "t_label",
}
fb15k_237 = KGDataset.from_dataframe(df_triples, **label_columns)

# Report the size of the resulting entity and relation vocabularies.
print(f"Number of entities: {fb15k_237.n_entity:,}\n")
print(f"Number of relation types: {fb15k_237.n_relation_type}\n")

Number of entities: 14,541

Number of relation types: 237



## Save

Save the concatenated triples, the entity and relation dictionaries, and the type offsets.

In [6]:
# Stack the triple arrays of every split into a single array along the first axis;
# rows follow the iteration order of the `triples` mapping.
all_triples = np.concatenate(list(fb15k_237.triples.values()), axis=0)
all_triples.shape

(310116, 3)

In [7]:
# Pull the lookup structures off the dataset so they can be inspected and saved.
ent_dict = fb15k_237.entity_dict
rel_dict = fb15k_237.relation_dict
type_offset = fb15k_237.type_offsets

# Sanity check: map the IDs of one triple back to their original labels.
triple_id = 14500  # row of all_triples to inspect

sample_triple = all_triples[triple_id]
(
    ent_dict[sample_triple[0]],
    rel_dict[sample_triple[1]],
    ent_dict[sample_triple[2]],
)

('/m/0262x6',
 '/award/award_category/winners./award/award_honor/award_winner',
 '/m/05jm7')

In [8]:
# Cross-check: look up the same head/tail labels in the original train DataFrame.
head_label = ent_dict[all_triples[triple_id, 0]]
tail_label = ent_dict[all_triples[triple_id, 2]]

train_df = df_triples["train"]
# First narrow to rows with the sampled head, then to those that also share the tail.
part = train_df.loc[train_df["h_label"] == head_label]
part.loc[part["t_label"] == tail_label]

Unnamed: 0,h_label,r_label,t_label
14500,/m/0262x6,/award/award_category/winners./award/award_hon...,/m/05jm7


In [9]:
# Persist the concatenated triples next to the raw data.
torch.save(all_triples, data_path.joinpath("triples.pt"))

# Pickle each lookup object into its own file in the same directory.
for file_name, payload in (
    ("entity_dict.pkl", ent_dict),
    ("relation_dict.pkl", rel_dict),
    ("type_offset.pkl", type_offset),
):
    with open(data_path.joinpath(file_name), "wb") as f:
        pickle.dump(payload, f)