Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# PrimeKG

## Triple pre-processing

In [1]:
import requests
import numpy as np
import pandas as pd
import random
import pickle
from pathlib import Path
import torch

In [3]:
URL = "https://dataverse.harvard.edu/api/access/datafile/6180617"

res = requests.get(url=URL)
with open("../data/primekg/nodes.csv", "wb") as output:
    output.write(res.content)

In [4]:
URL = "https://dataverse.harvard.edu/api/access/datafile/6180620"

res = requests.get(url=URL)
with open("../data/primekg/edges.csv", "wb") as output:
    output.write(res.content)

In [2]:
# load the PrimeKG edge and node files
raw_edge_df = pd.read_csv("../data/primekg/edges.csv", low_memory=False)
raw_node_df = pd.read_csv("../data/primekg/nodes.csv", delimiter="\t")

In [3]:
# generate random node ids as the node ids provided are not unique
random.seed(42)
raw_node_df["node_id"] = [
    random.randint(1, 1000000000000) for _ in range(len(raw_node_df))
]

modified_node_df = pd.DataFrame()
modified_node_df["uuid"] = raw_node_df["node_id"]
modified_node_df["type"] = raw_node_df["node_type"]
modified_node_df["name"] = raw_node_df["node_name"]
modified_node_df["id"] = raw_node_df["node_index"]

modified_node_df = modified_node_df.astype({"uuid": "str"})

In [4]:
print("num unique node_index:", raw_node_df.node_index.nunique())
print("num unique node_id:", raw_node_df.node_id.nunique())
print("num unique node_name:", raw_node_df.node_name.nunique())

num unique node_index: 129375
num unique node_id: 129375
num unique node_name: 129262


In [5]:
df_source = pd.merge(
    raw_edge_df, raw_node_df, how="left", left_on="x_index", right_on="node_index"
)
df_target = pd.merge(
    raw_edge_df, raw_node_df, how="left", left_on="y_index", right_on="node_index"
)

In [6]:
concat_df = pd.concat([raw_edge_df["x_index"], raw_edge_df["y_index"]])
print("PrimeKG unique index:", concat_df.nunique())

concat_df = pd.concat([raw_edge_df["x_id"], raw_edge_df["y_id"]])
print("PrimeKG unique id:", concat_df.nunique())

concat_df = pd.concat([raw_edge_df["x_name"], raw_edge_df["y_name"]])
print("PrimeKG unique name:", concat_df.nunique())

PrimeKG unique index: 129375
PrimeKG unique id: 90067
PrimeKG unique name: 129262


In [7]:
edge_df = pd.DataFrame()

edge_df["source"] = df_source["node_id"]
edge_df["target"] = df_target["node_id"]

edge_df["source_type"] = raw_edge_df["x_type"]
edge_df["target_type"] = raw_edge_df["y_type"]

edge_df["label"] = raw_edge_df["relation"]

edge_df["source_id"] = raw_edge_df["x_index"]
edge_df["target_id"] = raw_edge_df["y_index"]

edge_df["source_name"] = raw_edge_df["x_name"]
edge_df["target_name"] = raw_edge_df["y_name"]

edge_df["label_id"] = edge_df.groupby("label").ngroup(ascending=False)

In [8]:
label_list = edge_df.label.unique().tolist()
label_list_split = [label.split("_")[:2] for label in label_list]

As in PrimeKG all triples are treated as undirected, we discard one direction of each edge.

In [9]:
modified_edge_df = pd.DataFrame()

for idx, label_split in enumerate(label_list_split):
    label_temp = label_list[idx]
    if len(label_split) == 2:
        if label_split[0] != label_split[1]:
            temp_df = edge_df[
                (edge_df.label == label_temp)
                & (edge_df.source_type.str.contains(label_split[0]))
            ]
            if idx == 0:
                modified_edge_df = temp_df.copy()
            else:
                modified_edge_df = pd.concat([modified_edge_df, temp_df])

In [10]:
modified_edge_df = pd.concat(
    [
        edge_df[
            (edge_df.label == "molfunc_protein")
            & (edge_df.source_type == "molecular_function")
        ],
        modified_edge_df,
    ]
)
modified_edge_df = pd.concat(
    [
        edge_df[
            (edge_df.label == "cellcomp_protein")
            & (edge_df.source_type == "cellular_component")
        ],
        modified_edge_df,
    ]
)
modified_edge_df = pd.concat(
    [
        edge_df[
            (edge_df.label == "bioprocess_protein")
            & (edge_df.source_type == "biological_process")
        ],
        modified_edge_df,
    ]
)

In [11]:
for idx, relation in enumerate(label_list):
    df_relation = edge_df[edge_df.label == relation]

    if len(label_list_split[idx]) > 1:
        df_swapped = df_relation.copy()[["target_id", "source_id"]]
        df_swapped.columns = ["source_id", "target_id"]

        # Concatenate the original dataframe and the swapped dataframe
        df_concat = pd.concat([df_relation, df_swapped])

        # Drop duplicate rows and keep the first occurrence of each unique edge
        df_deduplicated = df_concat.drop_duplicates(subset=["source_id", "target_id"])

        # Remove edges that were reversed during deduplication
        mask = df_deduplicated["source_id"] < df_deduplicated["target_id"]
        df_final = df_deduplicated[mask]

        # Reset the index and return the final dataframe
        df_final.reset_index(drop=True)

        if label_list_split[idx][0] == label_list_split[idx][1]:
            modified_edge_df = pd.concat([modified_edge_df, df_final])

In [12]:
modified_edge_df = pd.concat(
    [
        modified_edge_df,
        edge_df[
            (edge_df.label == "contraindication") & (edge_df.source_type == "drug")
        ],
    ]
)
modified_edge_df = pd.concat(
    [
        modified_edge_df,
        edge_df[(edge_df.label == "indication") & (edge_df.source_type == "drug")],
    ]
)
modified_edge_df = pd.concat(
    [
        modified_edge_df,
        edge_df[(edge_df.label == "off-label use") & (edge_df.source_type == "drug")],
    ]
)

In [13]:
modified_edge_df = modified_edge_df.drop_duplicates()

modified_edge_df = modified_edge_df.astype({"label_id": "int"})
modified_edge_df = modified_edge_df.astype({"source": "str"})
modified_edge_df = modified_edge_df.astype({"target": "str"})

In [14]:
modified_edge_df

Unnamed: 0,source,target,source_type,target_type,label,source_id,target_id,source_name,target_name,label_id
6351194,427809572530.0,47962102762.0,biological_process,gene/protein,bioprocess_protein,112487,7097,neutrophil degranulation,A1BG,25
6351195,427809572530.0,811505526209.0,biological_process,gene/protein,bioprocess_protein,112487,6931,neutrophil degranulation,SERPINA3,25
6351196,427809572530.0,84419768413.0,biological_process,gene/protein,bioprocess_protein,112487,1114,neutrophil degranulation,AOC1,25
6351197,427809572530.0,571566665436.0,biological_process,gene/protein,bioprocess_protein,112487,4852,neutrophil degranulation,ACAA1,25
6351198,427809572530.0,924927297102.0,biological_process,gene/protein,bioprocess_protein,112487,6561,neutrophil degranulation,ACLY,25
...,...,...,...,...,...,...,...,...,...,...
389266,643646437093.0,423516769392.0,drug,disease,off-label use,14286,84318,Rifampicin,paucibacillary leprosy,5
389289,314216603150.0,467565167408.0,drug,disease,off-label use,20454,38011,Mupirocin,epidermolysis bullosa,5
389290,314216603150.0,950677384996.0,drug,disease,off-label use,20454,27434,Mupirocin,epidermolysis bullosa simplex,5
389291,314216603150.0,159662652330.0,drug,disease,off-label use,20454,84323,Mupirocin,ecthyma,5


In [15]:
# generate triples from the PrimeKG dataframe
triples = modified_edge_df[["source_id", "label_id", "target_id"]].values

# sort triples according to head id
sorted_triples = triples[triples[:, 0].argsort()]

In [16]:
modified_edge_df.label.unique()

array(['bioprocess_protein', 'cellcomp_protein', 'molfunc_protein',
       'drug_protein', 'phenotype_protein', 'disease_phenotype_negative',
       'disease_phenotype_positive', 'disease_protein', 'drug_effect',
       'exposure_protein', 'exposure_disease', 'exposure_bioprocess',
       'exposure_molfunc', 'exposure_cellcomp', 'pathway_protein',
       'anatomy_protein_present', 'anatomy_protein_absent',
       'protein_protein', 'drug_drug', 'phenotype_phenotype',
       'disease_disease', 'bioprocess_bioprocess', 'molfunc_molfunc',
       'cellcomp_cellcomp', 'exposure_exposure', 'pathway_pathway',
       'anatomy_anatomy', 'contraindication', 'indication',
       'off-label use'], dtype=object)

In [35]:
part1 = modified_edge_df[["source_id", "source_type"]]
part2 = modified_edge_df[["target_id", "target_type"]]
new_cols = ["ent_id", "ent_type"]
part1.columns = new_cols
part2.columns = new_cols
# Aggregated head and tail entities
ent_all = pd.concat([part1, part2], ignore_index=True)
ent_all

Unnamed: 0,ent_id,ent_type
0,112487,biological_process
1,112487,biological_process
2,112487,biological_process
3,112487,biological_process
4,112487,biological_process
...,...,...
8100123,84318,disease
8100124,38011,disease
8100125,27434,disease
8100126,84323,disease


In [42]:
uniq_ent = (ent_all.groupby("ent_id")[["ent_id", "ent_type"]].first())["ent_type"]
uniq_ent

ent_id
0         gene/protein
1         gene/protein
2         gene/protein
3         gene/protein
4         gene/protein
              ...     
129370         pathway
129371         pathway
129372         anatomy
129373         anatomy
129374         anatomy
Name: ent_type, Length: 129375, dtype: object

In [46]:
ent_type, type_id = np.unique(uniq_ent, return_inverse=True)

In [47]:
ent_type

array(['anatomy', 'biological_process', 'cellular_component', 'disease',
       'drug', 'effect/phenotype', 'exposure', 'gene/protein',
       'molecular_function', 'pathway'], dtype=object)

In [50]:
data_path = Path("../data/primekg")
with open(data_path.joinpath("type_ids.pkl"), "wb") as f:
    pickle.dump(uniq_ent.values, f)

In [67]:
# rel_dict, rel_id = np.unique(modified_edge_df["label"].values, return_inverse=True)
rel_dict = modified_edge_df.groupby("label_id")["label"].first().values
with open("../data/primekg/relation_dict.pkl", "wb") as f:
    pickle.dump(rel_dict.tolist(), f)

In [None]:
uniq_ent = modified_node_df.groupby("id")[["type", "name"]].first().sort_values("type")
ent_dict = (uniq_ent["name"] + " (" + uniq_ent["type"] + ")").values.tolist()
ent_dict[:20]

['splenius (anatomy)',
 'lamina I of gray matter of spinal cord (anatomy)',
 'accessory nerve fiber bundle (anatomy)',
 'vagal nerve fiber bundle (anatomy)',
 'posterior column of fornix (anatomy)',
 'lateral occipital cortex (anatomy)',
 'corticomedial nuclear complex (anatomy)',
 'basolateral amygdaloid nuclear complex (anatomy)',
 'cochlear canal (anatomy)',
 'Brodmann (1909) area 36 (anatomy)',
 'Brodmann (1909) area 35 (anatomy)',
 'Brodmann (1909) area 24 (anatomy)',
 'Brodmann (1909) area 3 (anatomy)',
 'Brodmann (1909) area 1 (anatomy)',
 'basal nuclear complex (anatomy)',
 'ventral external arcuate fiber bundle (anatomy)',
 'posterior transverse temporal area 42 (anatomy)',
 'anterior transverse temporal area 41 (anatomy)',
 'superior parietal cortex (anatomy)',
 'precuneus cortex (anatomy)']