Copyright (c) 2023 Graphcore Ltd. All rights reserved.

# PharmKG

## Triple pre-processing

Install dependencies

In [4]:
!pip install -q wheel
!pip install -q git+https://github.com/graphcore-research/bess-kge.git

In [5]:
import pickle
import urllib.request
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from besskge.dataset import KGDataset

Download dataset from source

In [6]:
data_path = Path("../data/pharmkg")
data_path.mkdir(parents=True, exist_ok=True)
filename = data_path.joinpath("raw_PharmKG-180k.zip")

# Only hit the network when the archive is not cached yet, so that
# "Restart & Run All" does not re-download the dataset every time.
if not filename.exists():
    # Download to a temporary name and rename on success: an interrupted
    # transfer then never leaves a truncated archive at the final path.
    partial_download = filename.with_name(filename.name + ".part")
    urllib.request.urlretrieve(
        "https://zenodo.org/record/4077338/files/raw_PharmKG-180k.zip",
        filename=partial_download,
    )
    partial_download.replace(filename)

# Unpack the archive next to the zip file
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall(data_path)

In [7]:
# Load the extracted raw triples table and display it
raw_csv = data_path / "raw_PharmKG-180k.csv"
df_triples = pd.read_csv(raw_csv)
df_triples

Unnamed: 0,Entity1_name,Entity1_type,relationship_type,Entity2_name,Entity2_type,PubMed_ID,Sentence_tokenized
0,htr1a,Gene,Gene-Chemical,sdz mar 327,Chemical,,
1,cyp21a2,gene,H,star,chemical,"26515592.0, 28800627.0, 16835396.0, 22396488.0...","'Additionally , in NCI-H295 cells , PNU-74654 ..."
2,interleukin 1 receptor antagonist protein,gene,L,shock,disease,,
3,bps,chemical,T,hypertension,disease,"9304231.0, 17616452.0, 17616452.0, 24283596.0,...",'According to BP load -LRB- the percentage of ...
4,vitamin k,chemical,T,cystic fibrosis,disease,"nan, nan","nan, nan"
...,...,...,...,...,...,...,...
1093231,amino acid,chemical,O,mstn,gene,"28322766.0, 17541833.0, 28355117.0, 17617213.0...",'FmMstn is most similar to the Mstn of Litopen...
1093232,fe,chemical,T,lupus erythematosus systemic,disease,,
1093233,interleukin 13 receptor subunit alpha 2,gene,L,neoplasms,disease,"nan, nan, nan, nan, nan","nan, nan, nan, nan, nan"
1093234,draxin,gene,Rg,map1b,gene,"25775433.0, 25775433.0, 25775433.0","'This study , for the first time elucidates mo..."


Capitalization in entity types appears inconsistent ("chemical" and "Chemical" are to be treated as the same type).

In [8]:
# Distinct type labels appearing in the head-entity column
head_types = df_triples["Entity1_type"]
head_types.unique()

array(['Gene', 'gene', 'chemical', 'Chemical', 'disease', 'Disease'],
      dtype=object)

In [9]:
# Common column names for the (name, type) pairs of heads and tails
new_cols = ["ent_name", "ent_type"]
part1 = df_triples[["Entity1_name", "Entity1_type"]].set_axis(new_cols, axis=1)
part2 = df_triples[["Entity2_name", "Entity2_type"]].set_axis(new_cols, axis=1)

# Stack head entities on top of tail entities, with a fresh 0..n-1 index
ent_all = pd.concat([part1, part2], ignore_index=True)
ent_all

Unnamed: 0,ent_name,ent_type
0,htr1a,Gene
1,cyp21a2,gene
2,interleukin 1 receptor antagonist protein,gene
3,bps,chemical
4,vitamin k,chemical
...,...,...
2186467,mstn,gene
2186468,lupus erythematosus systemic,disease
2186469,neoplasms,disease
2186470,map1b,gene


In [10]:
# Keep a single type label per entity name: the first non-null label found
# among all of its occurrences (the result is indexed by entity name).
first_type_per_ent = ent_all.groupby("ent_name")["ent_type"].first()

# Normalise capitalisation so that e.g. "chemical" and "Chemical" coincide
uniq_ent = first_type_per_ent.str.capitalize()

uniq_ent

ent_name
+                                                    Chemical
-                                                    Chemical
0                                                        Gene
0 dimethyl 0 4 nitrophenyl phosphorothioate          Chemical
1 0 alkyl 2 acetyl sn glycero 3 phosphorylcholine    Chemical
                                                       ...   
zyz 803                                              Chemical
zyz451                                               Chemical
zz                                                   Chemical
zz 122                                               Chemical
zz1 61c                                              Chemical
Name: ent_type, Length: 188296, dtype: object

In [11]:
# Check which distinct type labels remain after normalisation
type_labels = uniq_ent.values
np.unique(type_labels)

array(['Chemical', 'Disease', 'Gene'], dtype=object)

No apparent capitalization problems for relation type labels.

In [12]:
# Distinct relation-type labels appearing in the triples table
relation_labels = df_triples["relationship_type"]
relation_labels.unique()

array(['Gene-Chemical', 'H', 'L', 'T', 'B', 'Q', 'J', 'E', 'Z', 'N', 'C',
       'V+', 'Sa', 'Gene-Disease', 'K', 'Rg', 'O', 'Pr', 'E+', 'X', 'I',
       'Md', 'Pa', 'U', 'Te', 'Chemical-Disease', 'Ud', 'D', 'Y', 'E-',
       'G', 'A-', 'Mp', 'A+', 'W', 'Gene-Gene', 'Disease-Disease',
       'Chemical-Chemical', 'T.ind'], dtype=object)

We can just use `besskge.dataset.KGDataset.from_dataframe` to preprocess and build the KGDataset.

In [13]:
# Which columns of df_triples play the role of head, relation and tail
triple_columns = {
    "head_column": "Entity1_name",
    "relation_column": "relationship_type",
    "tail_column": "Entity2_name",
}

# Build the dataset, passing the per-entity type labels computed above
pharmkg = KGDataset.from_dataframe(
    df_triples, entity_types=uniq_ent, **triple_columns
)

# Report the size of the entity and relation vocabularies
print(f"Number of entities: {pharmkg.n_entity:,}\n")
print(f"Number of relation types: {pharmkg.n_relation_type}\n")

Number of entities: 188,296

Number of relation types: 39



## Save

Save triples and dictionaries.

In [20]:
# Stack every triple array held in pharmkg.triples into one array (row-wise)
triple_parts = list(pharmkg.triples.values())
all_triples = np.concatenate(triple_parts, axis=0)
all_triples.shape

(1093236, 3)

In [23]:
ent_dict, rel_dict = pharmkg.entity_dict, pharmkg.relation_dict
type_offset = pharmkg.type_offsets

# Sanity check: translate the IDs of one arbitrary triple back to labels,
# to be compared by eye with the raw table.
triple_id = 14500
head_id, rel_id, tail_id = all_triples[triple_id]

ent_dict[head_id], rel_dict[rel_id], ent_dict[tail_id]

('chloramphenicol acetyl', 'E', 'beta glucuronidase')

In [25]:
# Show all raw rows whose head entity is the head of the sampled triple
head_name = ent_dict[all_triples[triple_id, 0]]
df_triples.loc[df_triples["Entity1_name"] == head_name]

Unnamed: 0,Entity1_name,Entity1_type,relationship_type,Entity2_name,Entity2_type,PubMed_ID,Sentence_tokenized
68559,chloramphenicol acetyl,chemical,O,cyp1a1,gene,"11226373.0, 9890552.0, 8562336.0, 9890552.0, 8...","'In the present study , we investigated the tr..."
109687,chloramphenicol acetyl,chemical,E,beta glucuronidase,gene,8112307.0,'We describe the construction of a plasmid -LR...
179039,chloramphenicol acetyl,chemical,E,tgf alpha,gene,"8752656.0, 1501890.0, 8752656.0","'A 238-bp fragment and a 123-bp fragment , bot..."
438370,chloramphenicol acetyl,chemical,B,upa,chemical,"1905804.0, 11115541.0, 3205721.0, 11115541.0",'A synthetic copy of this element confers supe...
452295,chloramphenicol acetyl,chemical,Z,cat,chemical,"2102836.0, 3359914.0, 9886828.0, 18601211.0, 2...",'A monoclonal antibody to chloramphenicol_acet...
510064,chloramphenicol acetyl,chemical,Sa,roberts syndrome,disease,,
540366,chloramphenicol acetyl,chemical,C,carcinoma hepatocellular,disease,,
593353,chloramphenicol acetyl,chemical,E,endothelin 1,chemical,,
879806,chloramphenicol acetyl,chemical,T,cat,chemical,"18601211.0, 2102836.0, 8972775.0, 7511098.0, 9...",'An expression system has been designed in whi...
919074,chloramphenicol acetyl,chemical,O,vegfa,gene,,


In [27]:
# Triples array goes through torch.save
torch.save(all_triples, data_path / "triples.pt")

# The lookup structures are pickled one file each
for pkl_name, payload in (
    ("entity_dict.pkl", ent_dict),
    ("relation_dict.pkl", rel_dict),
    ("type_offset.pkl", type_offset),
):
    with open(data_path / pkl_name, "wb") as f:
        pickle.dump(payload, f)