# Identify candidate causal genes for pQTLs

In [1]:
import pandas
import numpy

import hetmech.hetmat

## Read pQTL associations and candidate genes

In [2]:
# Load the pQTL associations and the SNP-to-locus-gene table, then pair every
# pQTL association with the genes at its locus.
pqtl_df = pandas.read_table('pqtl.tsv')
loci_df = pandas.read_table('snps-to-locus-genes.tsv').drop_duplicates()
# No explicit key is given, so the merge joins on the columns the two tables share.
snp_df = pqtl_df.merge(loci_df)
# Keep only rows that have an Entrez ID for both the pQTL gene and the locus gene
snp_df = snp_df.dropna(subset=['pqtl_entrez_id', 'locus_entrez_id'])
# Drop protein complexes (pQTL IDs containing a comma); match the comma literally
is_complex = snp_df.pqtl_entrez_id.str.contains(',', regex=False)
# Cast on the filtered result in one step rather than assigning columns onto a
# slice, which would raise SettingWithCopyWarning.
snp_df = (
    snp_df[~is_complex]
    .astype({'pqtl_entrez_id': int, 'locus_entrez_id': int})
)
snp_df.head(2)

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp
0,rs651007,SELE,6401,102.728,23300549,ABO,28,3217
1,rs651007,SELE,6401,102.728,23300549,SURF6,6838,50343


## Compute hetnet features

In [4]:
# Create a HetMat object for the on-disk hetionet-v1.0 directory.
# The path is relative, so it depends on the notebook's working directory.
hetmat = hetmech.hetmat.HetMat('../hetionet-v1.0.hetmat/')

<hetmech.hetmat.HetMat at 0x7f9ef864e4a8>

In [6]:
# Metapath abbreviation; it selects which adjusted-DWPC file is read further down.
# NOTE(review): presumably Gene-participates-Biological Process-participates-Gene-interacts-Gene
# in Hetionet's abbreviation scheme -- confirm against the metagraph.
metapath = 'GpBPpGiG'

In [15]:
# Stream the DWPC table in chunks and keep only rows whose (source, target)
# pair is an observed (pQTL gene, locus gene) pair, so the full file never
# has to be held in memory at once.
gene_pairs = set(zip(snp_df.pqtl_entrez_id, snp_df.locus_entrez_id))
path = hetmat.directory.joinpath(
    'adjusted-path-counts',
    'dwpc-0.5',
    'adjusted-dwpcs',
    f'{metapath}-filtered.tsv.gz',
)
dwpc_df = pandas.concat(
    chunk.loc[
        [pair in gene_pairs for pair in zip(chunk.source_id, chunk.target_id)],
        :,
    ]
    for chunk in pandas.read_table(path, chunksize=100_000)
)
len(dwpc_df)

5580

In [14]:
# Preview the first DWPC rows that were kept by the gene-pair filter
dwpc_df.head()

Unnamed: 0,source_id,target_id,source_name,target_name,source_degree,target_degree,path_count,dwpc,n,nnz,n_perms,mean_nz,sd_nz,p_value
2804,12,5104,SERPINA3,SERPINA5,19,22,18,2.794127,7219200,7218939,200,1.384344,0.507804,0.011239
2816,12,5265,SERPINA3,SERPINA1,19,37,12,2.92852,3276800,3276600,200,1.671001,0.446058,0.008134
3306,12,57062,SERPINA3,DDX24,19,43,44,2.970499,1843200,1843200,200,1.755517,0.426618,0.007159
7788,16,6203,AARS,RPS9,58,262,1029,3.614633,44400,44400,200,3.361336,0.146249,0.044126
8203,16,10990,AARS,LILRB5,58,1,9,1.616004,36748400,31504598,200,0.401718,0.489,0.027999


In [21]:
# Attach DWPC features to each (pQTL gene, locus gene) candidate. A left join
# keeps every candidate row; pairs without a DWPC row get NaN features.
candidate_keys = ['pqtl_entrez_id', 'locus_entrez_id']
dwpc_keys = ['source_id', 'target_id']
snp_dwpc_df = pandas.merge(
    snp_df,
    dwpc_df,
    how='left',
    left_on=candidate_keys,
    right_on=dwpc_keys,
).drop(columns=['source_id', 'target_id', 'source_name', 'target_name'])
snp_dwpc_df.head(2)

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,source_degree,target_degree,path_count,dwpc,n,nnz,n_perms,mean_nz,sd_nz,p_value
0,rs651007,SELE,6401,102.728,23300549,ABO,28,3217,,,,,,,,,,
1,rs651007,SELE,6401,102.728,23300549,SURF6,6838,50343,,,,,,,,,,


In [22]:
# Name the output after the metapath variable so the filename always matches
# the DWPC file that was actually read (identical to the old name for 'GpBPpGiG').
snp_dwpc_df.to_csv(f'candidates-{metapath}.tsv', index=False, sep='\t')

## Potential candidates of interest

In [23]:
# One DataFrame per (study, SNP, pQTL gene) association, in groupby key order
association_keys = ['gwas_source', 'snp', 'pqtl_gene']
groups = [group_df for _, group_df in snp_dwpc_df.groupby(association_keys)]

In [24]:
# Example association (rs10127775 / MAP2K4 in the recorded run); the index depends on groupby order
groups[2]

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,source_degree,target_degree,path_count,dwpc,n,nnz,n_perms,mean_nz,sd_nz,p_value
118305,rs10127775,MAP2K4,6416,13.165,10.1101/086793,GALNT2,2590,102052,58.0,4.0,90.0,3.077606,14578000.0,14516202.0,200.0,0.992799,0.590786,0.005898
118306,rs10127775,MAP2K4,6416,13.165,10.1101/086793,LINC01736,101927532,152880,,,,,,,,,,
118307,rs10127775,MAP2K4,6416,13.165,10.1101/086793,PGBD5,79605,154440,,,,,,,,,,
118308,rs10127775,MAP2K4,6416,13.165,10.1101/086793,LINC01737,101927553,432618,,,,,,,,,,
118309,rs10127775,MAP2K4,6416,13.165,10.1101/086793,COG2,22796,482447,,,,,,,,,,
118310,rs10127775,MAP2K4,6416,13.165,10.1101/086793,URB2,9816,499843,,,,,,,,,,
118311,rs10127775,MAP2K4,6416,13.165,10.1101/086793,TAF5L,27097,533995,,,,,,,,,,
118312,rs10127775,MAP2K4,6416,13.165,10.1101/086793,AGT,183,542481,,,,,,,,,,
118313,rs10127775,MAP2K4,6416,13.165,10.1101/086793,CAPN9,10753,587342,,,,,,,,,,
118314,rs10127775,MAP2K4,6416,13.165,10.1101/086793,ABCB10,23456,601347,,,,,,,,,,


In [25]:
# Example association (rs10494745 / HPX in the recorded run); the index depends on groupby order
groups[7]

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,source_degree,target_degree,path_count,dwpc,n,nnz,n_perms,mean_nz,sd_nz,p_value
111938,rs10494745,HPX,3263,66.571,10.1101/086793,CFHR4,10877,645,79.0,3.0,30.0,3.611719,7830600.0,7770566.0,200.0,0.964608,0.603812,0.001982
111939,rs10494745,HPX,3263,66.571,10.1101/086793,CFHR2,3080,25445,,,,,,,,,,
111940,rs10494745,HPX,3263,66.571,10.1101/086793,CFHR5,81494,59229,79.0,1.0,23.0,4.141619,15394600.0,13574476.0,200.0,0.452063,0.511281,0.000365
111941,rs10494745,HPX,3263,66.571,10.1101/086793,CFHR1,3078,86138,79.0,4.0,28.0,2.768193,6107000.0,6095144.0,200.0,1.12136,0.591223,0.014682
111942,rs10494745,HPX,3263,66.571,10.1101/086793,F13B,2165,120864,,,,,,,,,,
111943,rs10494745,HPX,3263,66.571,10.1101/086793,CFHR3,10878,122921,79.0,2.0,24.0,3.803143,9938600.0,9638663.0,200.0,0.761329,0.597103,0.00128
111944,rs10494745,HPX,3263,66.571,10.1101/086793,ASPM,259266,165801,,,,,,,,,,
111945,rs10494745,HPX,3263,66.571,10.1101/086793,CFH,3075,170823,79.0,18.0,108.0,3.687287,1178000.0,1177800.0,200.0,2.007228,0.397697,0.000253
111946,rs10494745,HPX,3263,66.571,10.1101/086793,ZBTB41,360023,235353,,,,,,,,,,
111947,rs10494745,HPX,3263,66.571,10.1101/086793,KCNT2,343450,309102,,,,,,,,,,


In [26]:
# Example association (rs10737680 / CFH in the recorded run); the index depends on groupby order
groups[8]

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,source_degree,target_degree,path_count,dwpc,n,nnz,n_perms,mean_nz,sd_nz,p_value
114634,rs10737680,CFH,3075,35.89,10.1101/086793,CFH,3075,37179,,,,,,,,,,
114635,rs10737680,CFH,3075,35.89,10.1101/086793,CFHR3,10878,64470,21.0,2.0,21.0,4.32071,75020400.0,65062638.0,200.0,0.449195,0.527825,0.000367
114636,rs10737680,CFH,3075,35.89,10.1101/086793,KCNT2,343450,101100,,,,,,,,,,
114637,rs10737680,CFH,3075,35.89,10.1101/086793,CFHR1,3078,109432,21.0,4.0,16.0,1.846759,46098000.0,44387506.0,200.0,0.647093,0.568289,0.041769
114638,rs10737680,CFH,3075,35.89,10.1101/086793,MIR4735,100616363,127844,,,,,,,,,,
114639,rs10737680,CFH,3075,35.89,10.1101/086793,CFHR4,10877,139916,21.0,3.0,27.0,4.141113,59108400.0,55000247.0,200.0,0.55874,0.555346,0.000531
114640,rs10737680,CFH,3075,35.89,10.1101/086793,CFHR2,3080,233447,,,,,,,,,,
114641,rs10737680,CFH,3075,35.89,10.1101/086793,CFHR5,81494,267231,21.0,1.0,21.0,4.667195,116204400.0,86925262.0,200.0,0.273747,0.431237,0.000104
114642,rs10737680,CFH,3075,35.89,10.1101/086793,F13B,2165,328866,21.0,2.0,3.0,2.218504,75020400.0,65062638.0,200.0,0.449195,0.527825,0.012727
114643,rs10737680,CFH,3075,35.89,10.1101/086793,ASPM,259266,373803,,,,,,,,,,
