# Biological Process and Gene Metapath Data Gathering - All

This notebook focuses on gathering **all** data related to the following requirements:

- Each value from `BP.csv` is a source and each value
from `Gene.csv` is a target.
- Each source + target pairing may have a metapath which
is found within `metapaths.csv`.
- For each source + target pairing and metapath, we need the DWPC and p-value
stored in a table for reference.
- Ignore metapaths found within
`metapaths_ignore.csv`.

In [2]:
import pathlib

import pandas as pd
import pyarrow as pa
from pyarrow import csv

from hetionet_utils.database import HetionetNeo4j

In [4]:
# Read the candidate metapaths and the ignore list, then keep only those
# metapaths which do not appear in metapaths_ignore.csv.
df_metapaths = pd.read_csv("data/sources/metapaths.csv")
df_metapaths_ignore = pd.read_csv("data/sources/metapaths_ignore.csv")
df_metapaths = df_metapaths.loc[
    # boolean mask: True for rows whose metapath is absent from the ignore list
    lambda frame: ~frame["metapath"].isin(df_metapaths_ignore["metapath"])
]
# preview the first few remaining metapaths
df_metapaths.head()

Unnamed: 0,metapath
5,BPpGdAdG
6,BPpGdAeG
7,BPpGdAuG
8,BPpGeAdG
9,BPpGeAeG


In [5]:
# Load the source (BP) and target (Gene) identifiers as single-column Arrow
# tables; only the "id" column of each CSV is kept.
table_bioprocesses = csv.read_csv("data/sources/BP.csv").select(["id"])
table_genes = csv.read_csv("data/sources/Gene.csv").select(["id"])

# df_metapaths was filtered with a boolean mask, so its index is no longer a
# default range; without preserve_index=False pyarrow would carry that index
# along as an extra column. Keep just "metapath" so all three tables share the
# same single-column shape.
table_metapaths = pa.Table.from_pandas(
    df_metapaths[["metapath"]], preserve_index=False
)

# One query is needed for every (bioprocess, gene, metapath) combination.
expected_queries = (
    table_bioprocesses.num_rows * table_genes.num_rows * table_metapaths.num_rows
)
print("Expected number of queries: ", expected_queries)

Expected number of queries:  11203627115


In [6]:
# Run a single example query to see what one result looks like: the first
# bioprocess, the first gene, and the first remaining metapath.
hetiocli = HetionetNeo4j()
sample_result = hetiocli.get_metapath_data(
    # Look columns up by name and unwrap the Arrow scalars with .as_py()
    # rather than depending on column position and on str() of a scalar.
    source_id=str(table_bioprocesses["id"][0].as_py()),
    target_id=int(table_genes["id"][0].as_py()),
    metapath=str(table_metapaths["metapath"][0].as_py()),
)
# display the returned sample
sample_result

Unnamed: 0,metapath,node_ids,rel_ids,PDP,percent_of_DWPC,score,PC,DWPC
0,BPpGdAdG,"[40731, 21753, 36969, 16764]","[1390556, 865578, 2128874]",4e-06,16.997932,-0.0,,
1,BPpGdAdG,"[40731, 32551, 36969, 16764]","[1196717, 11354, 2128874]",4e-06,16.778595,-0.0,12.0,2.5e-05
2,BPpGdAdG,"[40731, 39858, 36969, 16764]","[1220136, 2096090, 2128874]",2e-06,9.334748,-0.0,12.0,2.5e-05
3,BPpGdAdG,"[40731, 8043, 36969, 16764]","[1153859, 1995416, 2128874]",2e-06,8.169678,-0.0,12.0,2.5e-05
4,BPpGdAdG,"[40731, 30743, 11763, 16764]","[1837396, 1950404, 1188329]",2e-06,7.757325,-0.0,12.0,2.5e-05
5,BPpGdAdG,"[40731, 10526, 36969, 16764]","[132194, 85560, 2128874]",2e-06,7.532643,-0.0,12.0,2.5e-05
6,BPpGdAdG,"[40731, 45057, 36969, 16764]","[1985929, 450350, 2128874]",2e-06,6.985486,-0.0,12.0,2.5e-05
7,BPpGdAdG,"[40731, 12891, 11763, 16764]","[1079500, 2053410, 1188329]",2e-06,6.54538,-0.0,12.0,2.5e-05
8,BPpGdAdG,"[40731, 20458, 36969, 16764]","[18158, 962402, 2128874]",1e-06,6.009676,-0.0,12.0,2.5e-05
9,BPpGdAdG,"[40731, 9350, 36969, 16764]","[2091178, 285298, 2128874]",1e-06,4.833247,-0.0,12.0,2.5e-05


In [15]:
# Write the sample result to parquet so its on-disk size can be measured.
filepath = "example_output.parquet"
sample_result.to_parquet(filepath)

# Extrapolate total storage: size in bytes of the single sample file times
# the number of expected queries, scaled down by 1024**3 (bytes -> KB -> MB
# -> GB as labelled below).
# NOTE(review): the whole file size, including whatever fixed per-file
# overhead it contains, is multiplied by the query count, so treat this as a
# rough estimate.
print(
    "Expected storage: ",
    pathlib.Path(filepath).stat().st_size * expected_queries / 1024**3,
    "GB",
)

Expected storage:  63617.26161106955 GB
