# Disease Associates Genes Edge Prediction

This notebook is designed to take the next step moving from predicted sentences to edge predictions. After training the discriminator model, each sentences contains a confidence score for the likelihood of mentioning a relationship. Multiple relationships contain multiple sentences, which makes establishing an edge unintuitive. Is taking the max score appropriate for determining existence of an edge? Does taking the mean of each relationship make more sense? The answer towards these questions are shown below.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from pathlib import Path

import pandas as pd
import plydata as ply
import plydata.tidy as ply_tdy
from sqlalchemy import create_engine

In [2]:
# Set up the environment
username = "danich1"
password = "snorkel"
dbname = "pubmed_central_db"

# Path subject to change for different os
database_str = (
    f"postgresql+psycopg2://{username}:{password}@/{dbname}?host=/var/run/postgresql"
)
conn = create_engine(database_str)

# Disease associates Gene

## Disease and Gene Info URLs for Hetionet

In [3]:
disease_url = "https://raw.githubusercontent.com/dhimmel/disease-ontology/052ffcc960f5897a0575f5feff904ca84b7d2c1d/data/xrefs-prop-slim.tsv"
gene_url = "https://raw.githubusercontent.com/dhimmel/entrez-gene/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv"
dag_url = "https://github.com/dhimmel/integrate/raw/93feba1765fbcd76fd79e22f25121f5399629148/compile/DaG-association.tsv"

# Not used but url for disease up/down regulates gene
# Leaving for future work if someone picks this project up
drg_url = "https://raw.githubusercontent.com/dhimmel/stargeo/08b126cc1f93660d17893c4a3358d3776e35fd84/data/diffex.tsv"

## Load DataFrames for each URL

In [4]:
disease_ontology_df = (
    pd.read_csv(disease_url, sep="\t")
    >> ply.distinct(["doid_code", "doid_name"])
    >> ply.rename(doid_id="doid_code")
    >> ply.define(merge_key=1)
)
disease_ontology_df >> ply.slice_rows(5)

Unnamed: 0,doid_id,doid_name,resource,resource_id,merge_key
0,DOID:2531,hematologic cancer,CSP,2004-1600,1
619,DOID:1319,brain cancer,CSP,2006-2736,1
898,DOID:1324,lung cancer,EFO,0000571,1
1051,DOID:263,kidney cancer,EFO,0000681,1
1193,DOID:1793,pancreatic cancer,EFO,0002618,1


In [5]:
entrez_gene_df = (
    pd.read_csv(gene_url, sep="\t")
    >> ply.rename(dict(entrez_gene_id="GeneID", gene_symbol="Symbol"))
    >> ply.define(merge_key=1)
)
entrez_gene_df >> ply.slice_rows(5)

Unnamed: 0,tax_id,entrez_gene_id,gene_symbol,chromosome,map_location,type_of_gene,description,merge_key
0,9606,1,A1BG,19,19q13.4,protein-coding,alpha-1-B glycoprotein,1
1,9606,2,A2M,12,12p13.31,protein-coding,alpha-2-macroglobulin,1
2,9606,3,A2MP1,12,12p13.31,pseudo,alpha-2-macroglobulin pseudogene 1,1
3,9606,9,NAT1,8,8p22,protein-coding,N-acetyltransferase 1 (arylamine N-acetyltrans...,1
4,9606,10,NAT2,8,8p22,protein-coding,N-acetyltransferase 2 (arylamine N-acetyltrans...,1


In [6]:
disease_gene_map_df = (
    entrez_gene_df
    >> ply.select("entrez_gene_id", "gene_symbol", "merge_key")
    >> ply.inner_join(
        disease_ontology_df >> ply.select("doid_id", "doid_name", "merge_key"),
        on="merge_key",
    )
    >> ply.select("-merge_key")
)
disease_gene_map_df >> ply.slice_rows(5)

Unnamed: 0,entrez_gene_id,gene_symbol,doid_id,doid_name
0,1,A1BG,DOID:2531,hematologic cancer
1,1,A1BG,DOID:1319,brain cancer
2,1,A1BG,DOID:1324,lung cancer
3,1,A1BG,DOID:263,kidney cancer
4,1,A1BG,DOID:1793,pancreatic cancer


In [7]:
hetionet_dag_df = pd.read_csv(
    dag_url, sep="\t", dtype={"entrez_gene_id": int}
) >> ply.define(merge_key=1)
hetionet_dag_df >> ply.slice_rows(5)

Unnamed: 0,doid_id,doid_name,entrez_gene_id,gene_symbol,sources,license,merge_key
0,DOID:2531,Hematologic cancer,25,ABL1,DISEASES|DisGeNET,,1
1,DOID:2531,Hematologic cancer,27,ABL2,DisGeNET,ODbL 1.0,1
2,DOID:2531,Hematologic cancer,54,ACP5,DISEASES,CC BY 4.0,1
3,DOID:2531,Hematologic cancer,113,ADCY7,DisGeNET,ODbL 1.0,1
4,DOID:2531,Hematologic cancer,142,PARP1,DISEASES|DisGeNET,,1


In [8]:
query = """
SELECT "disease_cid" AS doid_id, "gene_cid" AS  entrez_gene_id, count(*) AS n_sentences
FROM disease_gene
GROUP BY "disease_cid", "gene_cid"
"""
disease_gene_sentence_df = (
    pd.read_sql(query, conn)
    >> ply_tdy.separate_rows("entrez_gene_id", sep=";")
    >> ply.call(".astype", {"entrez_gene_id": int})
)
disease_gene_sentence_df >> ply.slice_rows(5)

Unnamed: 0,doid_id,entrez_gene_id,n_sentences
0,DOID:0050156,10014,1
1,DOID:0050156,10046,1
2,DOID:0050156,100507436,1
3,DOID:0050156,100508689,12
4,DOID:0050156,102157402,5


## Merge all dataframes into One

In [9]:
disease_gene_associations_df = (
    disease_gene_map_df
    >> ply.left_join(
        hetionet_dag_df >> ply.select("doid_id", "entrez_gene_id", "sources"),
        on=["doid_id", "entrez_gene_id"],
    )
    >> ply.left_join(disease_gene_sentence_df, on=["doid_id", "entrez_gene_id"])
    >> ply.call(".fillna", {"n_sentences": 0})
    >> ply.call(".astype", {"n_sentences": int})
    >> ply.define(
        hetionet="sources.notnull().astype(int)",
        has_sentence="(n_sentences > 0).astype(int)",
    )
)
(disease_gene_associations_df >> ply.slice_rows(5) >> ply.call(".transpose"))

Unnamed: 0,0,1,2,3,4
entrez_gene_id,1,1,1,1,1
gene_symbol,A1BG,A1BG,A1BG,A1BG,A1BG
doid_id,DOID:2531,DOID:1319,DOID:1324,DOID:263,DOID:1793
doid_name,hematologic cancer,brain cancer,lung cancer,kidney cancer,pancreatic cancer
sources,,,,,
n_sentences,4,0,0,0,0
hetionet,0,0,0,0,0
has_sentence,1,0,0,0,0


In [10]:
outfile = "disease_associates_gene.tsv.xz"
if not Path(outfile).exists():
    (
        disease_gene_associations_df
        >> ply.call(".to_csv", outfile, sep="\t", index=False, compression="xz")
    )

In [11]:
# free memory for rest of notebook
del disease_ontology_df
del entrez_gene_df
del disease_gene_map_df
del disease_gene_sentence_df
del hetionet_dag_df
del disease_gene_associations_df

# Compound treats Disease

## Compound and Disease Info URLs for Hetionet

In [12]:
disease_url = "https://raw.githubusercontent.com/dhimmel/disease-ontology/052ffcc960f5897a0575f5feff904ca84b7d2c1d/data/xrefs-prop-slim.tsv"
compound_url = "https://raw.githubusercontent.com/dhimmel/drugbank/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank.tsv"
ctpd_url = "https://raw.githubusercontent.com/dhimmel/indications/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv"

## Load DataFrames for each URL

In [13]:
disease_ontology_df = (
    pd.read_csv(disease_url, sep="\t")
    >> ply.distinct(["doid_code", "doid_name"])
    >> ply.rename(doid_id="doid_code")
    >> ply.define(merge_key=1)
)
disease_ontology_df >> ply.slice_rows(5)

Unnamed: 0,doid_id,doid_name,resource,resource_id,merge_key
0,DOID:2531,hematologic cancer,CSP,2004-1600,1
619,DOID:1319,brain cancer,CSP,2006-2736,1
898,DOID:1324,lung cancer,EFO,0000571,1
1051,DOID:263,kidney cancer,EFO,0000681,1
1193,DOID:1793,pancreatic cancer,EFO,0002618,1


In [14]:
drugbank_df = (
    pd.read_csv(compound_url, sep="\t")
    >> ply.rename(drug_name="name")
    >> ply.define(merge_key=1)
)
drugbank_df >> ply.slice_rows(5)

Unnamed: 0,drugbank_id,drug_name,type,groups,atc_codes,categories,inchikey,inchi,description,merge_key
0,DB00001,Lepirudin,biotech,approved,B01AE02,Antithrombins|Fibrinolytic Agents,,,Lepirudin is identical to natural hirudin exce...,1
1,DB00002,Cetuximab,biotech,approved,L01XC06,Antineoplastic Agents,,,Epidermal growth factor receptor binding FAB. ...,1
2,DB00003,Dornase alfa,biotech,approved,R05CB13,Enzymes,,,Dornase alfa is a biosynthetic form of human d...,1
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,Antineoplastic Agents,,,A recombinant DNA-derived cytotoxic protein co...,1
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,Immunosuppressive Agents,,,Dimeric fusion protein consisting of the extra...,1


In [16]:
compound_disease_map_df = (
    drugbank_df
    >> ply.select("drugbank_id", "drug_name", "merge_key")
    >> ply.inner_join(
        disease_ontology_df >> ply.select("doid_id", "doid_name", "merge_key"),
        on="merge_key",
    )
    >> ply.select("-merge_key")
)
compound_disease_map_df >> ply.slice_rows(5)

Unnamed: 0,drugbank_id,drug_name,doid_id,doid_name
0,DB00001,Lepirudin,DOID:2531,hematologic cancer
1,DB00001,Lepirudin,DOID:1319,brain cancer
2,DB00001,Lepirudin,DOID:1324,lung cancer
3,DB00001,Lepirudin,DOID:263,kidney cancer
4,DB00001,Lepirudin,DOID:1793,pancreatic cancer


In [17]:
hetionet_ctpd_df = (
    pd.read_csv(ctpd_url, sep="\t")
    >> ply.define(sources='"pharmacotherapydb"')
    >> ply.select("-n_curators", "-n_resources")
    >> ply.rename(dict(drug_name="drug", doid_name="disease"))
)
hetionet_ctpd_df >> ply.slice_rows(5)

Unnamed: 0,doid_id,drugbank_id,doid_name,drug_name,category,sources
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM,pharmacotherapydb
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM,pharmacotherapydb
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM,pharmacotherapydb
3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM,pharmacotherapydb
4,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,SYM,pharmacotherapydb


In [18]:
query = """
SELECT "compound_cid" as drugbank_id, "disease_cid" as doid_id, count(*) AS n_sentences
FROM compound_disease
GROUP BY "compound_cid", "disease_cid";
"""
compound_disease_sentence_df = pd.read_sql(query, conn)
compound_disease_sentence_df >> ply.slice_rows(5)

Unnamed: 0,drugbank_id,doid_id,n_sentences
0,DB00001,DOID:13241,2
1,DB00001,DOID:3393,8
2,DB00001,DOID:784,1
3,DB00001,DOID:9074,1
4,DB00006,DOID:10283,1


## Merge all dataframes into One

In [19]:
compound_treats_disease_df = (
    compound_disease_map_df
    >> ply.left_join(
        hetionet_ctpd_df
        >> ply.query("category=='DM'")
        >> ply.select("doid_id", "drugbank_id", "category", "sources"),
        on=["drugbank_id", "doid_id"],
    )
    >> ply.left_join(compound_disease_sentence_df, on=["drugbank_id", "doid_id"])
    >> ply.call(".fillna", {"n_sentences": 0})
    >> ply.call(".astype", {"n_sentences": int})
    >> ply.define(
        hetionet="sources.notnull().astype(int)",
        has_sentence="(n_sentences > 0).astype(int)",
    )
)
compound_treats_disease_df >> ply.slice_rows(5)

Unnamed: 0,drugbank_id,drug_name,doid_id,doid_name,category,sources,n_sentences,hetionet,has_sentence
0,DB00001,Lepirudin,DOID:2531,hematologic cancer,,,0,0,0
1,DB00001,Lepirudin,DOID:1319,brain cancer,,,0,0,0
2,DB00001,Lepirudin,DOID:1324,lung cancer,,,0,0,0
3,DB00001,Lepirudin,DOID:263,kidney cancer,,,0,0,0
4,DB00001,Lepirudin,DOID:1793,pancreatic cancer,,,0,0,0


In [20]:
outfile = "compound_treats_disease.tsv.xz"
if not Path(outfile).exists():
    (
        compound_treats_disease_df
        >> ply.call(".to_csv", outfile, sep="\t", index=False, compression="xz")
    )

In [21]:
# free memory for rest of notebook
del drugbank_df
del disease_ontology_df
del compound_disease_map_df
del compound_disease_sentence_df
del hetionet_ctpd_df
del compound_treats_disease_df

# Compound binds Gene

## Compound and Gene Info URLs for Hetionet

In [22]:
compound_url = "https://raw.githubusercontent.com/dhimmel/drugbank/7b94454b14a2fa4bb9387cb3b4b9924619cfbd3e/data/drugbank.tsv"
gene_url = "https://raw.githubusercontent.com/dhimmel/entrez-gene/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv"
cbg_url = "https://raw.githubusercontent.com/dhimmel/integrate/93feba1765fbcd76fd79e22f25121f5399629148/compile/CbG-binding.tsv"

## Load DataFrames for each URL

In [23]:
entrez_gene_df = (
    pd.read_csv(gene_url, sep="\t")
    >> ply.rename(dict(entrez_gene_id="GeneID", gene_symbol="Symbol"))
    >> ply.define(merge_key=1)
)
entrez_gene_df >> ply.slice_rows(5)

Unnamed: 0,tax_id,entrez_gene_id,gene_symbol,chromosome,map_location,type_of_gene,description,merge_key
0,9606,1,A1BG,19,19q13.4,protein-coding,alpha-1-B glycoprotein,1
1,9606,2,A2M,12,12p13.31,protein-coding,alpha-2-macroglobulin,1
2,9606,3,A2MP1,12,12p13.31,pseudo,alpha-2-macroglobulin pseudogene 1,1
3,9606,9,NAT1,8,8p22,protein-coding,N-acetyltransferase 1 (arylamine N-acetyltrans...,1
4,9606,10,NAT2,8,8p22,protein-coding,N-acetyltransferase 2 (arylamine N-acetyltrans...,1


In [24]:
drugbank_df = (
    pd.read_csv(compound_url, sep="\t")
    >> ply.rename(dict(drug_name="name"))
    >> ply.define(merge_key=1)
)
drugbank_df >> ply.slice_rows(5) >> ply.call(".transpose")

Unnamed: 0,0,1,2,3,4
drugbank_id,DB00001,DB00002,DB00003,DB00004,DB00005
drug_name,Lepirudin,Cetuximab,Dornase alfa,Denileukin diftitox,Etanercept
type,biotech,biotech,biotech,biotech,biotech
groups,approved,approved,approved,approved|investigational,approved|investigational
atc_codes,B01AE02,L01XC06,R05CB13,L01XX29,L04AB01
categories,Antithrombins|Fibrinolytic Agents,Antineoplastic Agents,Enzymes,Antineoplastic Agents,Immunosuppressive Agents
inchikey,,,,,
inchi,,,,,
description,Lepirudin is identical to natural hirudin exce...,Epidermal growth factor receptor binding FAB. ...,Dornase alfa is a biosynthetic form of human d...,A recombinant DNA-derived cytotoxic protein co...,Dimeric fusion protein consisting of the extra...
merge_key,1,1,1,1,1


In [25]:
hetionet_cbg_df = pd.read_csv(cbg_url, sep="\t") >> ply.call(
    ".astype", {"entrez_gene_id": int}
)
hetionet_cbg_df.head(2)

Unnamed: 0,drugbank_id,entrez_gene_id,sources,pubmed_ids,actions,affinity_nM,license,urls
0,DB00001,2147,DrugBank (target),10505536|10912644|11055889|11467439|11752352|1...,inhibitor,,CC BY-NC 4.0,
1,DB00002,712,DrugBank (target),17016423|17139284,,,CC BY-NC 4.0,


In [26]:
query = """
SELECT "compound_cid" AS drugbank_id, "gene_cid" AS entrez_gene_id, count(*) AS n_sentences
FROM compound_gene
GROUP BY "compound_cid", "gene_cid";
"""

compound_gene_sentence_df = (
    pd.read_sql(query, database_str)
    >> ply_tdy.separate_rows("entrez_gene_id", sep=";")
    >> ply.call(".astype", {"entrez_gene_id": int})
)
compound_gene_sentence_df.head(2)

Unnamed: 0,drugbank_id,entrez_gene_id,n_sentences
0,DB00001,100187907,3
1,DB00001,1511,2


## Merge all dataframes into One

In [27]:
compound_binds_gene_df = (
    compound_gene_sentence_df
    >> ply.inner_join(
        drugbank_df >> ply.select("drugbank_id", "drug_name"), on="drugbank_id"
    )
    >> ply.inner_join(
        entrez_gene_df >> ply.select("entrez_gene_id", "gene_symbol"),
        on="entrez_gene_id",
    )
    >> ply.left_join(
        hetionet_cbg_df >> ply.select("drugbank_id", "entrez_gene_id", "sources"),
        on=["drugbank_id", "entrez_gene_id"],
    )
    >> ply.call(".fillna", {"n_sentences": 0})
    >> ply.call(".astype", {"n_sentences": int})
    >> ply.define(
        hetionet="sources.notnull().astype(int)",
        has_sentence="(n_sentences > 0).astype(int)",
    )
)
compound_binds_gene_df >> ply.slice_rows(5)

Unnamed: 0,drugbank_id,entrez_gene_id,n_sentences,drug_name,gene_symbol,sources,hetionet,has_sentence
0,DB00001,100187907,3,Lepirudin,TRAP,,0,1
1,DB00063,100187907,1,Eptifibatide,TRAP,,0,1
2,DB00091,100187907,4,Cyclosporine,TRAP,,0,1
3,DB00118,100187907,1,S-Adenosylmethionine,TRAP,,0,1
4,DB00121,100187907,4,Biotin,TRAP,,0,1


In [28]:
outfile = "compound_binds_gene.tsv.xz"
if not Path(outfile).exists():
    (
        compound_binds_gene_df
        >> ply.call(".to_csv", outfile, sep="\t", index=False, compression="xz")
    )

In [29]:
# free memory for rest of notebook
del drugbank_df
del entrez_gene_df
del compound_gene_sentence_df
del hetionet_cbg_df
del compound_binds_gene_df

# Gene Interacts Gene

## Gene Info URLs for Hetionet

In [30]:
gene_url = "https://raw.githubusercontent.com/dhimmel/entrez-gene/a7362748a34211e5df6f2d185bb3246279760546/data/genes-human.tsv"
ppi_url = "https://raw.githubusercontent.com/dhimmel/ppi/f6a7edbc8de6ba2d7fe1ef3fee4d89e5b8d0b900/data/ppi-hetio-ind.tsv"

## Load DataFrames for each URL

In [31]:
entrez_gene_df = (
    pd.read_csv(gene_url, sep="\t")
    >> ply.rename(dict(entrez_gene_id="GeneID", gene_symbol="Symbol"))
    >> ply.define(merge_key=1)
)
entrez_gene_df >> ply.slice_rows(5)

Unnamed: 0,tax_id,entrez_gene_id,gene_symbol,chromosome,map_location,type_of_gene,description,merge_key
0,9606,1,A1BG,19,19q13.4,protein-coding,alpha-1-B glycoprotein,1
1,9606,2,A2M,12,12p13.31,protein-coding,alpha-2-macroglobulin,1
2,9606,3,A2MP1,12,12p13.31,pseudo,alpha-2-macroglobulin pseudogene 1,1
3,9606,9,NAT1,8,8p22,protein-coding,N-acetyltransferase 1 (arylamine N-acetyltrans...,1
4,9606,10,NAT2,8,8p22,protein-coding,N-acetyltransferase 2 (arylamine N-acetyltrans...,1


In [32]:
hetionet_gig_df = pd.read_csv(ppi_url, sep="\t") >> ply.rename(
    dict(gene1_id="gene_0", gene2_id="gene_1")
)
hetionet_gig_df >> ply.slice_rows(5)

Unnamed: 0,gene1_id,gene2_id,sources,unbiased
0,1,310,hetio-dag,0
1,1,1026,hetio-dag,0
2,1,2886,hetio-dag,0
3,1,3958,hetio-dag,0
4,1,6606,hetio-dag,0


In [33]:
query = """
SELECT "gene1_cid" AS gene1_id, "gene2_cid" AS gene2_id, count(*) AS n_sentences
FROM gene_gene
GROUP BY "gene1_cid", "gene2_cid";
"""

gene_gene_sentence_df = (
    pd.read_sql(query, database_str)
    >> ply_tdy.separate_rows("gene1_id", sep=";")
    >> ply_tdy.separate_rows("gene2_id", sep=";")
    >> ply.call(".astype", {"gene1_id": int, "gene2_id": int})
)
gene_gene_sentence_df >> ply.slice_rows(5)

Unnamed: 0,gene1_id,gene2_id,n_sentences
0,1,1,44
1,1,10321,3
2,1,12,1
3,1,139716,1
4,1,197,1


## Merge all dataframes into One

In [37]:
gene_interacts_gene_df = (
    gene_gene_sentence_df
    >> ply.left_join(hetionet_gig_df, on=["gene1_id", "gene2_id"])
    >> ply.call(".fillna", {"n_sentences": 0})
    >> ply.call(".astype", {"n_sentences": int})
    >> ply.define(
        hetionet="sources.notnull().astype(int)",
        has_sentence="(n_sentences > 0).astype(int)",
    )
    >> ply.inner_join(
        entrez_gene_df
        >> ply.select("entrez_gene_id", "gene_symbol")
        >> ply.rename(dict(gene1_id="entrez_gene_id", gene1_name="Symbol")),
        on="gene1_id",
    )
    >> ply.inner_join(
        entrez_gene_df
        >> ply.select("entrez_gene_id", "gene_symbol")
        >> ply.rename(dict(gene2_id="entrez_gene_id", gene2_name="Symbol")),
        on="gene2_id",
    )
)
gene_interacts_gene_df >> ply.slice_rows(5)

Unnamed: 0,gene1_id,gene2_id,n_sentences,sources,unbiased,hetionet,has_sentence,gene_symbol_x,gene_symbol_y
0,1,1,44,,,0,1,A1BG,A1BG
1,12,1,2,,,0,1,SERPINA3,A1BG
2,1398,1,1,,,0,1,CRK,A1BG
3,1719,1,2,,,0,1,DHFR,A1BG
4,197,1,6,,,0,1,AHSG,A1BG


In [38]:
outfile = "gene_interacts_gene.tsv.xz"
if not Path(outfile).exists():
    (
        gene_interacts_gene_df
        >> ply.call(".to_csv", outfile, sep="\t", index=False, compression="xz")
    )

In [39]:
# free memory for rest of notebook
del entrez_gene_df
del gene_gene_sentence_df
del hetionet_gig_df
del gene_interacts_gene_df