<a href="https://colab.research.google.com/github/fbelleau/kibio-bi/blob/main/Demo_case_2_public.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Eland librairy compatible with actual Kibio.science version

In [None]:
!pip install eland==7.14.0b1

import eland as ed

## Install most recent version of Pandas (default on Collab is 1.3.5)

In [2]:


import pandas as pd

pd.__version__

'1.3.5'

## Import Elasticsearch's needed librairies

In [3]:
from elasticsearch import Elasticsearch, helpers
import matplotlib.pyplot as plt

## Create connection to Kibio.science's Elastiseach REST endpoint

In [4]:
es = Elasticsearch("http://es.kibio.science:80", verify_certs=False, ssl_show_warn=False)
es

<Elasticsearch([{'host': 'es.kibio.science', 'port': 80}])>

## Dowload needed dataframes from Kibio.science using Elasticsearch's Eland librairy

In [20]:
# download nodes

disease = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_disease_201811'))
print("disease", disease.shape)
gene = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_gene_201811'))
print("gene", gene.shape)
compound = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_compound_201811'))
print("compound", compound.shape)
biological_process = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_biological_process_201811'))
print("biological_process", biological_process.shape)

disease (137, 17)
gene (20945, 17)
compound (1552, 20)
biological_process (11381, 11)


In [24]:
# edges_disease_association

disease_association = disease[['@id','associates.@id']]
disease_association_edges = disease_association.explode('associates.@id')
disease_association_edges.shape

(12626, 2)

In [22]:
# edges_gene_regulates

gene_regulates = gene[['@id','regulates.@id']]
gene_regulates_edges = gene_regulates.explode('regulates.@id')
gene_regulates_edges.shape

(281983, 2)

In [51]:
# edges_gene_interacts

gene_interacts = gene[['@id','interacts.@id']]
gene_interacts_edges = gene_interacts.explode('interacts.@id')
gene_interacts_edges.shape

(158583, 2)

In [52]:
# edges_gene_participates

gene_participates = gene[['@id','participates.@id']]
gene_participates_edges = gene_participates.explode('participates.@id')
gene_participates_edges.shape

(819199, 2)

In [54]:
# edges_compound_binds

compound_binds = compound[['@id','binds.@id']]
compound_binds_edges = compound_binds.explode('binds.@id')
compound_binds_edges.shape

(11734, 2)

In [36]:
# download edges

edges_disease_associates = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_edges_disease_associates'))
print("edges_disease_associates", edges_disease_associates.shape)
edges_gene_interacts = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_edges_gene_interacts'))
print("edges_gene_interacts", edges_gene_interacts.shape)
edges_gene_participates = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_edges_gene_participates'))
print("edges_gene_participates", edges_gene_participates.shape)
edges_gene_regulates = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_edges_gene_regulates'))
print("edges_gene_regulates", edges_gene_regulates.shape)
edges_compound_binds = ed.eland_to_pandas(ed.DataFrame(es_client= es, es_index_pattern='hetionet_edges_compound_binds'))
print("edges_compound_binds", edges_compound_binds.shape)

edges_disease_associates (12626, 6)
edges_gene_interacts (158583, 6)
edges_gene_participates (819199, 6)
edges_gene_regulates (281983, 6)
edges_compound_binds (11734, 6)


## Question 1

In [61]:
answer_question_1_1 = gene[gene['regulates.@id'] == gene['regulates.@id']]
print(answer_question_1_1.shape)

answer_question_1_2 = answer_question_1_1[['@id', 'regulates.@id']]
print(answer_question_1_2.shape)

answer_question_1_3 = answer_question_1_2[answer_question_1_2['regulates.@id'].apply(len) > 15]

answer_question_1_3.shape

(4634, 17)
(4634, 2)


(3599, 2)

In [62]:
# version Pascal

gene_regulates_gene = edges_gene_regulates[['@id', '@id_to']].groupby(['@id'],as_index=False).agg({'@id_to': lambda x: x.tolist()})
gene_regulates_gene.columns = ['gene_1', 'gene_2_list']
answer_question_1 = gene_regulates_gene[gene_regulates_gene['gene_2_list'].apply(len) > 15]

answer_question_1.shape

(3599, 2)

## Question 2

In [65]:
compound_gene = pd.merge(compound[['@id', 'name']], edges_compound_binds[['@id','@id_to']], on='@id')
compound_gene.columns = ['compound_id', 'compound_name', 'gene_id']

compound_gene_disease = pd.merge(compound_gene, edges_disease_associates[['@id', '@id_to']], left_on='gene_id', right_on='@id_to').drop(['@id_to'], axis=1)
compound_gene_disease.columns = ['compound_id', 'compound_name', 'gene_id', 'disease_id']

compound_gene_disease = pd.merge(compound_gene_disease, disease[['@id','name']], left_on='disease_id', right_on='@id', how='inner').drop(['@id'], axis=1)
compound_gene_disease.columns = ['compound_id', 'compound_name', 'gene_id', 'disease_id', 'disease_name']

compound_gene_disease.shape


(47305, 5)

In [66]:

compound_gene_disease = compound_gene_disease.query('compound_name.str.contains("Valproic Acid")').groupby(['disease_name'],as_index=False).agg({'gene_id': lambda x: x.tolist()})
compound_gene_disease['num_of_genes'] = compound_gene_disease['gene_id'].str.len()

answer_question_2 = compound_gene_disease.sort_values(by=['num_of_genes'], ascending=False).head(n=3)

answer_question_2

TypeError: ignored

## Download edges

## Question 3

In [78]:
disease_1 = disease.query('name == "multiple sclerosis"')

In [77]:
disease

Unnamed: 0,@context,@id,@label,@namespace,@type,DOID.@id,associates.@id,data.license,data.source,data.url,downregulates.@id,kind,localizes.@id,name,presents.@id,resembles.@id,upregulates.@id
DOID:363,http://schema.org/,DOID:363,uterine cancer [DOID:363],DOID,hetionet:Disease,DOID:363,"[ncbigene:4436, ncbigene:7869, ncbigene:3480, ...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_363,,Disease,"[UBERON:0000992, UBERON:0002512, UBERON:000099...",uterine cancer,"[Symptom:D000860, Symptom:D014549, Symptom:D00...","[DOID:1245, DOID:119, DOID:175, DOID:13223]",
DOID:14268,http://schema.org/,DOID:14268,sclerosing cholangitis [DOID:14268],DOID,hetionet:Disease,DOID:14268,"[ncbigene:6925, ncbigene:251, ncbigene:2805, n...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_14268,,Disease,"[UBERON:0002110, UBERON:0001193, UBERON:000117...",sclerosing cholangitis,"[Symptom:D007565, Symptom:D041781, Symptom:D00...",DOID:8577,
DOID:9352,http://schema.org/,DOID:9352,type 2 diabetes mellitus [DOID:9352],DOID,hetionet:Disease,DOID:9352,"[ncbigene:200186, ncbigene:6647, ncbigene:1112...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_9352,,Disease,"[UBERON:0002103, UBERON:0001021, UBERON:000238...",type 2 diabetes mellitus,"[Symptom:D005483, Symptom:D054058, Symptom:D00...",,ncbigene:57524
DOID:8778,http://schema.org/,DOID:8778,Crohn's disease [DOID:8778],DOID,hetionet:Disease,DOID:8778,"[ncbigene:1238, ncbigene:120892, ncbigene:3576...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_8778,"[ncbigene:10206, ncbigene:5243, ncbigene:37519...",Disease,"[UBERON:0002378, UBERON:0002110, UBERON:000201...",Crohn's disease,"[Symptom:D003967, Symptom:D015746, Symptom:D00...","[DOID:14268, DOID:7147, DOID:13499, DOID:10608...","[ncbigene:30817, ncbigene:9601, ncbigene:211, ..."
DOID:1612,http://schema.org/,DOID:1612,breast cancer [DOID:1612],DOID,hetionet:Disease,DOID:1612,"[ncbigene:3190, ncbigene:8871, ncbigene:4904, ...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_1612,"[ncbigene:55084, ncbigene:4609, ncbigene:1756,...",Disease,"[UBERON:0002066, UBERON:0003889, UBERON:000191...",breast cancer,"[Symptom:D059373, Symptom:D001247, Symptom:D01...","[DOID:219, DOID:2394]","[ncbigene:23753, ncbigene:26585, ncbigene:7989..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DOID:784,http://schema.org/,DOID:784,chronic kidney failure [DOID:784],DOID,hetionet:Disease,DOID:784,"[ncbigene:5243, ncbigene:83879, ncbigene:1543,...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_784,,Disease,"[UBERON:0002386, UBERON:0001007, UBERON:000140...",chronic kidney failure,"[Symptom:D054058, Symptom:D000787, Symptom:D00...","[DOID:10976, DOID:10763, DOID:0050425, DOID:1312]",
DOID:986,http://schema.org/,DOID:986,alopecia areata [DOID:986],DOID,hetionet:Disease,DOID:986,"[ncbigene:79465, ncbigene:4283, ncbigene:920, ...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_986,"[ncbigene:84445, ncbigene:9632, ncbigene:8774,...",Disease,"[UBERON:0001037, UBERON:0002073, UBERON:000182...",alopecia areata,"[Symptom:D011538, Symptom:D006311, Symptom:D00...","[DOID:8893, DOID:12306, DOID:3310]","[ncbigene:5551, ncbigene:2581, ncbigene:53344,..."
DOID:10534,http://schema.org/,DOID:10534,stomach cancer [DOID:10534],DOID,hetionet:Disease,DOID:10534,"[ncbigene:5886, ncbigene:84634, ncbigene:1674,...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_10534,,Disease,"[UBERON:0002017, UBERON:0002466, UBERON:000113...",stomach cancer,"[Symptom:D006356, Symptom:D056865, Symptom:D00...",DOID:13223,
DOID:2841,http://schema.org/,DOID:2841,asthma [DOID:2841],DOID,hetionet:Disease,DOID:2841,"[ncbigene:10419, ncbigene:3135, ncbigene:4049,...",CC BY 3.0,Disease Ontology,http://purl.obolibrary.org/obo/DOID_2841,,Disease,"[UBERON:0002365, UBERON:0001037, UBERON:000001...",asthma,"[Symptom:D006985, Symptom:D006685, Symptom:D00...","[DOID:3310, DOID:3083]","[ncbigene:22905, ncbigene:1469, ncbigene:40142..."


In [82]:
disease_1 = disease.query('name == "multiple sclerosis"')
biological_process_1 = biological_process.query('name == "retina layer formation"')

disease_gene1 = pd.merge(disease_1[['@id','name']], edges_disease_associates[['@id','@id_to']], on='@id')
disease_gene1.columns = ['disease_id', 'disease_name', 'gene_1_id']

disease_gene1_gene_2 = pd.merge(disease_gene1, edges_gene_interacts[['@id', '@id_to']], left_on = 'gene_1_id', right_on = '@id').drop(['@id'], axis=1)
disease_gene1_gene_2.columns = ['disease_id', 'disease_name', 'gene_1_id', 'gene_2_id']

disease_gene1_gene_2_process = pd.merge(disease_gene1_gene_2, edges_gene_participates[['@id', '@id_to']], left_on = 'gene_2_id', right_on = '@id').drop(['@id'], axis=1)
disease_gene1_gene_2_process.columns = ['disease_id', 'disease_name', 'gene_1_id', 'gene_2_id', 'process_id']
disease_gene1_gene_2_process = pd.merge(disease_gene1_gene_2_process, biological_process_1[['@id', 'name']], left_on = 'process_id', right_on = '@id').drop(['@id'], axis=1)

disease_gene1_gene_2_process.columns = ['disease_id', 'disease_name', 'gene_1_id', 'gene_2_id', 'process_id', 'process_name']

disease_gene1_gene_2_process

Unnamed: 0,disease_id,disease_name,gene_1_id,gene_2_id,process_id,process_name
0,DOID:2377,multiple sclerosis,ncbigene:5796,ncbigene:5797,GO:0010842,retina layer formation
1,DOID:2377,multiple sclerosis,ncbigene:4609,ncbigene:7021,GO:0010842,retina layer formation
2,DOID:2377,multiple sclerosis,ncbigene:4609,ncbigene:7020,GO:0010842,retina layer formation


In [90]:
disease_1 = disease.query('name == "multiple sclerosis"')
biological_process_1 = biological_process.query('name == "retina layer formation"')

disease_gene1 = pd.merge(disease_1[['@id','name']], disease_association_edges[['@id','associates.@id']], on='@id')
disease_gene1.columns = ['disease_id', 'disease_name', 'gene_1_id']
disease_gene1.head

disease_gene1

disease_gene1_gene_2 = pd.merge(disease_gene1, gene_interacts_edges[['@id', '@id_to']], left_on = 'gene_1_id', right_on = '@id').drop(['@id'], axis=1)
disease_gene1_gene_2.columns = ['disease_id', 'disease_name', 'gene_1_id', 'gene_2_id']


<bound method NDFrame.head of     disease_id        disease_name      gene_1_id
0    DOID:2377  multiple sclerosis   ncbigene:959
1    DOID:2377  multiple sclerosis  ncbigene:7099
2    DOID:2377  multiple sclerosis  ncbigene:6723
3    DOID:2377  multiple sclerosis   ncbigene:958
4    DOID:2377  multiple sclerosis  ncbigene:3627
..         ...                 ...            ...
145  DOID:2377  multiple sclerosis   ncbigene:627
146  DOID:2377  multiple sclerosis  ncbigene:6774
147  DOID:2377  multiple sclerosis  ncbigene:6404
148  DOID:2377  multiple sclerosis  ncbigene:6352
149  DOID:2377  multiple sclerosis  ncbigene:7412

[150 rows x 3 columns]>

In [85]:
disease_association_edges

Unnamed: 0,@id,associates.@id
DOID:363,DOID:363,ncbigene:4436
DOID:363,DOID:363,ncbigene:7869
DOID:363,DOID:363,ncbigene:3480
DOID:363,DOID:363,ncbigene:332
DOID:363,DOID:363,ncbigene:3576
...,...,...
DOID:13189,DOID:13189,ncbigene:3557
DOID:13189,DOID:13189,ncbigene:5634
DOID:13189,DOID:13189,ncbigene:55867
DOID:13189,DOID:13189,ncbigene:6568
