# Explore DWPC prediction of systematic protein interactions

https://github.com/greenelab/connectivity-search-analyses/issues/176

In [1]:
import pandas as pd
import numpy as np
import plotnine as pn
import random
from sklearn.metrics import roc_auc_score

from hetmatpy.hetmat import HetMat
from hetmatpy.pipeline import combine_dwpc_dgp


In [2]:
# Gene cross-reference table (xref_ncbigene) published on the output branch of
# related-sciences/ensembl-genes for homo_sapiens_core_105_38.
url = "https://github.com/related-sciences/ensembl-genes/raw/output/homo_sapiens_core_105_38/xref_ncbigene.json.gz"
ensembl_xref = pd.read_json(path_or_buf=url)
# Preview the first two records.
ensembl_xref.head(n=2)

Unnamed: 0,ensembl_representative_gene_id,ncbigene_id,gene_symbol,ncbigene_symbol
0,ENSG00000000003,7105,TSPAN6,TSPAN6
1,ENSG00000000005,64102,TNMD,TNMD


In [3]:
url = "https://github.com/related-sciences/ensembl-genes/raw/output/homo_sapiens_core_105_38/updates.json.gz"
# Pair each input Ensembl gene ID with its representative gene ID, then attach
# the cross-reference columns for that representative gene.
ensembl_updates = (
    pd.read_json(path_or_buf=url)
    .rename(columns={"ensembl_gene_id": "ensembl_representative_gene_id"})
    .loc[:, ["input_ensembl_gene_id", "ensembl_representative_gene_id"]]
    # ensembl_representative_gene_id is the only column the two frames share,
    # so naming it explicitly matches the default merge key.
    .merge(ensembl_xref, on="ensembl_representative_gene_id")
)
ensembl_updates.head(n=2)

Unnamed: 0,input_ensembl_gene_id,ensembl_representative_gene_id,ncbigene_id,gene_symbol,ncbigene_symbol
0,ASMPATCHG00000000170,ENSG00000256229,90649,ZNF486,ZNF486
1,ENSG00000256229,ENSG00000256229,90649,ZNF486,ZNF486


In [4]:
url = "http://www.interactome-atlas.org/data/HuRI.tsv"
# Column names are supplied explicitly, so every line of the file is read as data.
huri_columns = ["source__input_ensembl_gene_id", "target__input_ensembl_gene_id"]
huri_df = pd.read_csv(url, sep="\t", names=huri_columns)
huri_df.head(n=2)

Unnamed: 0,source__input_ensembl_gene_id,target__input_ensembl_gene_id
0,ENSG00000000005,ENSG00000061656
1,ENSG00000000005,ENSG00000099968


In [5]:
# Prefixed copies of the gene lookup table, one per side of the interaction.
source_lookup = ensembl_updates.add_prefix("source__")
target_lookup = ensembl_updates.add_prefix("target__")

# Annotate both interactors with NCBI gene information, then keep a single row
# per ordered (source, target) NCBI gene ID combination.
huri_ncbi_df = (
    huri_df
    .merge(source_lookup, on="source__input_ensembl_gene_id")
    .merge(target_lookup, on="target__input_ensembl_gene_id")
    .drop_duplicates(subset=["source__ncbigene_id", "target__ncbigene_id"])
)
huri_ncbi_df.head(n=2)

Unnamed: 0,source__input_ensembl_gene_id,target__input_ensembl_gene_id,source__ensembl_representative_gene_id,source__ncbigene_id,source__gene_symbol,source__ncbigene_symbol,target__ensembl_representative_gene_id,target__ncbigene_id,target__gene_symbol,target__ncbigene_symbol
0,ENSG00000000005,ENSG00000061656,ENSG00000000005,64102,TNMD,TNMD,ENSG00000061656,6676,SPAG4,SPAG4
1,ENSG00000000005,ENSG00000099968,ENSG00000000005,64102,TNMD,TNMD,ENSG00000099968,23786,BCL2L13,BCL2L13


In [6]:
# Show the interactions whose source gene ID differs from its representative
# Ensembl gene ID, i.e. rows where the lookup changed the source identifier.
huri_ncbi_df.query("source__input_ensembl_gene_id != source__ensembl_representative_gene_id")

Unnamed: 0,source__input_ensembl_gene_id,target__input_ensembl_gene_id,source__ensembl_representative_gene_id,source__ncbigene_id,source__gene_symbol,source__ncbigene_symbol,target__ensembl_representative_gene_id,target__ncbigene_id,target__gene_symbol,target__ncbigene_symbol
46234,ENSG00000203812,ENSG00000233822,ENSG00000288825,8337,H2AC18,H2AC18,ENSG00000233822,8341,H2BC15,H2BC15
46235,ENSG00000203812,ENSG00000233822,ENSG00000288859,723790,H2AC19,H2AC19,ENSG00000233822,8341,H2BC15,H2BC15


In [7]:
# Collect interactions as unordered pairs: each pair of NCBI gene IDs is sorted
# so that (a, b) and (b, a) collapse to the same set member.
huri_pairs = set()
for source_gene, target_gene in zip(huri_ncbi_df["source__ncbigene_id"], huri_ncbi_df["target__ncbigene_id"]):
    huri_pairs.add(tuple(sorted((source_gene, target_gene))))
# Peek at one arbitrary member of the set.
next(iter(huri_pairs))

(5027, 79666)

In [8]:
# Alternative HetMat location kept for reference (not used below):
# path = "../data/hetionet-v1.0.hetmat"
# Use the HetMat from a local connectivity-search-backend checkout, which has all
# path counts downloaded. The relative path assumes that repository sits next to
# this one.
path = "../../connectivity-search-backend/dj_hetmech_app/management/commands/downloads/hetionet-v1.0.hetmat"
hm = HetMat(path)
hm


<hetmatpy.hetmat.HetMat at 0x7fe3b745edc0>

In [9]:
# Look up the metapath object for the abbreviation "GpBPpGcG" in the HetMat's metagraph.
# NOTE(review): presumably Gene-participates-Biological Process-participates-Gene-covaries-Gene;
# verify against the metagraph's abbreviations.
metapath = hm.metagraph.get_metapath("GpBPpGcG")
metapath

GpBPpGcG

In [10]:
# Sampling parameters for this exploratory evaluation set.
NEGATIVE_KEEP_PROBABILITY = 0.001  # chance of keeping a gene pair that is not in HuRI
ROW_LIMIT = 10000  # iteration stops as soon as more than this many rows are kept

randomling = random.Random(0)  # fixed seed so the same non-HuRI pairs are drawn on re-run
sampled_rows = []
for row in combine_dwpc_dgp(hm, metapath, damping=0.5, ignore_zeros=False):
    # HuRI pairs are stored unordered, so compare against the sorted ID pair.
    pair = tuple(sorted((row["source_id"], row["target_id"])))
    is_huri = pair in huri_pairs
    # Every HuRI pair is kept. A random number is drawn only for non-HuRI pairs,
    # and such a pair is skipped unless the draw is within the keep probability.
    if not is_huri and randomling.random() > NEGATIVE_KEEP_PROBABILITY:
        continue
    row["huri"] = is_huri
    sampled_rows.append(row)
    # NOTE(review): stopping here means only the first pairs yielded by
    # combine_dwpc_dgp are represented in the sample — confirm this is acceptable.
    if len(sampled_rows) > ROW_LIMIT:
        break
dwpc_df = pd.DataFrame(sampled_rows)
del sampled_rows  # drop the row buffer once the frame is built
dwpc_df.head()


Unnamed: 0,source_id,target_id,source_name,target_name,source_degree,target_degree,path_count,dwpc,n,nnz,n_perms,mean_nz,sd_nz,p_value,huri
0,1,585,A1BG,BBS4,0,5,0,0.0,854343200,0,200,,,1.0,False
1,1,3848,A1BG,KRT1,0,1,0,0.0,3705034600,0,200,,,1.0,False
2,1,8676,A1BG,STX11,0,1,0,0.0,3705034600,0,200,,,1.0,False
3,1,9640,A1BG,ZNF592,0,10,0,0.0,325934400,0,200,,,1.0,False
4,1,9935,A1BG,MAFB,0,43,0,0.0,14815200,0,200,,,1.0,False


In [11]:
# Number of distinct unordered gene pairs in huri_pairs.
len(huri_pairs)

52182

In [12]:
# Class balance of the sampled rows: HuRI pairs (True) versus sampled non-HuRI pairs (False).
dwpc_df["huri"].value_counts()

False    8136
True     1865
Name: huri, dtype: int64

In [13]:
# AUROC when the raw path count is used as the score for HuRI membership.
roc_auc_score(dwpc_df["huri"], dwpc_df["path_count"])

0.512433305390137

In [14]:
# AUROC when the DWPC value is used as the score for HuRI membership.
roc_auc_score(dwpc_df["huri"], dwpc_df["dwpc"])

0.5161468507227005

In [15]:
# AUROC when -log10(p_value) is used as the score, so smaller p-values score higher.
# NOTE(review): a p_value of exactly 0 would give an infinite score here — confirm none occur.
roc_auc_score(dwpc_df["huri"], -np.log10(dwpc_df["p_value"]))

0.5203815959782887