# Explore search

In [1]:
import collections

import pandas
import hetio.readwrite
import numpy

from hetmech.degree_weight import dwpc
from hetmech.matrix import get_node_to_position

In [2]:
repo_url = 'https://github.com/dhimmel/hetionet'
commit = '6d26d15e9055b33b4fd97a180fa288e4f2060b96'
names = ['hetionet-v1.0'] + [f'hetionet-v1.0-perm-{i + 1}' for i in range(5)]    
paths = ['hetnet/json/hetionet-v1.0.json.bz2'] + [
    f'hetnet/permuted/json/{name}.json.bz2' for name in names[1:]
]
hetnets = collections.OrderedDict()
for name, path in zip(names, paths):
    url = f'{repo_url}/raw/{commit}/{path}'
    hetnets[name] = hetio.readwrite.read_graph(url)

In [3]:
list(hetnets)

['hetionet-v1.0',
 'hetionet-v1.0-perm-1',
 'hetionet-v1.0-perm-2',
 'hetionet-v1.0-perm-3',
 'hetionet-v1.0-perm-4',
 'hetionet-v1.0-perm-5']

In [25]:
DWPCs = collections.OrderedDict()
for name, graph in hetnets.items():
    metapath = graph.metagraph.metapath_from_abbrev('GiGpBP')
    rows, cols, dwpc_matrix, seconds = dwpc(graph, metapath, damping=0.4)
    DWPCs_GiGpPW[name] = dwpc_matrix
    print(f'Computing DWPC matrix for the {metapath} metapath in {name} took {seconds:.1f} seconds')

Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0 took 128.2 seconds
Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-1 took 163.4 seconds
Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-2 took 129.1 seconds
Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-3 took 130.4 seconds
Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-4 took 124.2 seconds
Computing DWPC matrix for the GiGpBP metapath in hetionet-v1.0-perm-5 took 121.0 seconds


In [26]:
metapath.get_unicode_str()

'Gene–interacts–Gene–participates–Biological Process'

## Read diffex

In [27]:
# Differentially expressed blood pressure genes from https://doi.org/10.1371/journal.pgen.1005035
url = 'https://doi.org/10.1371/journal.pgen.1005035.s006'
bp_df = (
    pandas.read_excel(url, skiprows=[0, 2])
    .rename(columns={
        'EntrezGeneID_FHS': 'entrez_gene_id',
    })
    .dropna(subset=['entrez_gene_id'])
    .drop_duplicates(subset=['entrez_gene_id'])
    .query("BP_sixCohort_meta_p < 0.001")
    [['entrez_gene_id', 'BP_sixCohort_meta_TE', 'BP_sixCohort_meta_p']]
)

# Entrez Genes should be ints
bp_df.entrez_gene_id = bp_df.entrez_gene_id.astype(int)

# Replace p-values that are zero
bp_df.loc[bp_df.BP_sixCohort_meta_p == 0, 'BP_sixCohort_meta_p'] = 1e-15
bp_df['weight'] = bp_df.BP_sixCohort_meta_TE * -numpy.log10(bp_df.BP_sixCohort_meta_p)
bp_df['weight_down'] = numpy.maximum(-bp_df.weight, 0)
bp_df['weight_up'] = numpy.maximum(bp_df.weight, 0)

bp_df.head(2)

Unnamed: 0,entrez_gene_id,BP_sixCohort_meta_TE,BP_sixCohort_meta_p,weight,weight_down,weight_up
0,1318,0.002282,1e-15,0.034224,0.0,0.034224
1,91663,0.002578,1e-15,0.038671,0.0,0.038671


In [28]:
pandas.Series(numpy.sign(bp_df.weight)).value_counts()

 1.0    68
-1.0    65
Name: weight, dtype: int64

In [29]:
gene_df = (
    pandas.DataFrame({
        'entrez_gene_id': rows,
        'gene_symbol': [graph.get_node((metapath.source().identifier, x)).name for x in rows],
    })
    .merge(bp_df, how='left')
    [['entrez_gene_id', 'gene_symbol', 'weight', 'weight_down', 'weight_up']]
    .fillna(0)
)

gene_df.head(2)

Unnamed: 0,entrez_gene_id,gene_symbol,weight,weight_down,weight_up
0,1,A1BG,0.0,0.0,0.0
1,2,A2M,0.0,0.0,0.0


## Compute target node scores

In [30]:
target_df = pandas.DataFrame({
    'metapath': str(metapath),
    'target_id': cols,
    'target_name': [graph.get_node((metapath.target().identifier, x)).name for x in cols],
}).set_index(['metapath', 'target_id', 'target_name'])

for name, array in DWPCs.items():
    target_df[name] = gene_df.weight_up @ array

# Scaling as per https://think-lab.github.io/d/193/#4
dwpc_scaler = target_df['hetionet-v1.0'].mean()
target_df = numpy.arcsinh(target_df / dwpc_scaler)

perm_df = target_df.iloc[:, 1:]
target_df['z-score'] = (target_df.iloc[:, 0] - perm_df.mean(axis='columns')) / perm_df.std(axis='columns')

(
    target_df
    # Remove targets without sufficient nonzero DWPCs
    [(perm_df > 0).sum(axis='columns') >= 3]
    .sort_values('z-score', ascending=False)
    .head(20)
)

KeyError: 'hetionet-v1.0'