# Identify candidate causal genes for pQTLs

In [1]:
import collections
import pandas
import numpy
import scipy.sparse
import tqdm
import hetio.readwrite

import hetmech.degree_weight

In [2]:
# Read the pQTL table and the SNP-to-locus-gene table (exact duplicate rows removed)
pqtl_df = pandas.read_table('pqtl.tsv')
loci_df = pandas.read_table('snps-to-locus-genes.tsv').drop_duplicates()
# Join on whichever columns the two tables have in common
snp_df = pqtl_df.merge(loci_df)
# Both Entrez IDs are cast to int below, so rows missing either one are discarded
snp_df = snp_df.dropna(subset=['pqtl_entrez_id', 'locus_entrez_id'])
# Drop protein complexes (pqtl_entrez_id values that contain a comma)
is_complex = snp_df.pqtl_entrez_id.str.contains(',')
# .astype returns a new frame, so no columns are assigned on a filtered slice
# (which would raise pandas' SettingWithCopyWarning)
snp_df = snp_df.loc[~is_complex].astype({'pqtl_entrez_id': int, 'locus_entrez_id': int})
snp_df.head(2)

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp
0,rs651007,SELE,6401,102.728,23300549,ABO,28,3217
1,rs651007,SELE,6401,102.728,23300549,SURF6,6838,50343


## Compute hetnet features

In [3]:
# Pin the hetionet repository to a fixed commit so the downloaded graphs are reproducible
repo_url = 'https://github.com/dhimmel/hetionet'
commit = '6d26d15e9055b33b4fd97a180fa288e4f2060b96'

# The unpermuted network comes first, followed by five permuted networks
names = ['hetionet-v1.0']
names.extend(f'hetionet-v1.0-perm-{i + 1}' for i in range(5))
paths = ['hetnet/json/hetionet-v1.0.json.bz2']
paths.extend(f'hetnet/permuted/json/{name}.json.bz2' for name in names[1:])

# Read each graph straight from its raw GitHub URL, keeping insertion order
hetnets = collections.OrderedDict()
for name, path in zip(names, paths):
    hetnets[name] = hetio.readwrite.read_graph(f'{repo_url}/raw/{commit}/{path}')
list(hetnets)

['hetionet-v1.0',
 'hetionet-v1.0-perm-1',
 'hetionet-v1.0-perm-2',
 'hetionet-v1.0-perm-3',
 'hetionet-v1.0-perm-4',
 'hetionet-v1.0-perm-5']

In [4]:
# Compute one DWPC matrix per hetnet (unpermuted first, then the permuted ones)
DWPCs = collections.OrderedDict()
row_ids = col_ids = None
for name, graph in hetnets.items():
    metapath = graph.metagraph.metapath_from_abbrev('GpBPpG')
    graph_row_ids, graph_col_ids, dwpc_matrix, seconds = hetmech.degree_weight.dwpc(
        graph, metapath, damping=0.4, dense_threshold=0.6)
    if row_ids is None:
        # Keep the identifiers of the first graph as the shared reference
        row_ids, col_ids = graph_row_ids, graph_col_ids
    else:
        # Later cells index every matrix with positions derived from a single
        # row_ids/col_ids pair, so all graphs must order their nodes identically.
        assert numpy.array_equal(row_ids, graph_row_ids), f'row ids of {name} differ'
        assert numpy.array_equal(col_ids, graph_col_ids), f'column ids of {name} differ'
    DWPCs[name] = dwpc_matrix
    print(f'Computing DWPC matrix for the {metapath} metapath in {name} took {seconds:.1f} seconds')

Computing DWPC matrix for the GpBPpG metapath in hetionet-v1.0 took 5.5 seconds
Computing DWPC matrix for the GpBPpG metapath in hetionet-v1.0-perm-1 took 6.9 seconds
Computing DWPC matrix for the GpBPpG metapath in hetionet-v1.0-perm-2 took 7.0 seconds
Computing DWPC matrix for the GpBPpG metapath in hetionet-v1.0-perm-3 took 6.9 seconds
Computing DWPC matrix for the GpBPpG metapath in hetionet-v1.0-perm-4 took 6.9 seconds
Computing DWPC matrix for the GpBPpG metapath in hetionet-v1.0-perm-5 took 6.9 seconds


In [5]:
# `metapath` still holds the value assigned in the final iteration of the DWPC loop
metapath.get_unicode_str()

'Gene–participates–Biological Process–participates–Gene'

In [6]:
# Scaling as per https://think-lab.github.io/d/193/#4
# Every matrix is divided by the mean of the unpermuted DWPC matrix and then
# passed through arcsinh; the transformed matrices overwrite the raw ones in DWPCs.
dwpc_scaler = DWPCs['hetionet-v1.0'].mean()
for name, matrix in list(DWPCs.items()):
    if not scipy.sparse.issparse(matrix):
        # Dense case: plain numpy arithmetic
        DWPCs[name] = numpy.arcsinh(matrix / dwpc_scaler)
        continue
    # Sparse case: use the sparse matrix's own multiply/arcsinh methods
    DWPCs[name] = matrix.multiply(dwpc_scaler ** -1).arcsinh()

### Unused code

In [7]:
# pdwpc_matrices = list(DWPCs.values())[1:]
# pdwpc_mean = sum(pdwpc_matrices) / len(pdwpc_matrices)
# pdwpc_mean[:6, :6].toarray()

In [8]:
# https://stackoverflow.com/a/24010418/4651668
# numpy.std(pdwpc_matrices, axis=0, ddof=1)

In [9]:
# def permutation_adjust(dwpc_matrices):
#     dwpc, *pdwpc_mats = dwpc_matrices
#     pdwpc = sum(pdwpc_mats) / len(pdwpc_mats)
#     sddwpc = (1 / (len(pdwpc_mats) - 1) * sum((x - pdwpc) ** 2 for x in pdwpc_mats)) ** 0.5
#     rdwpc = dwpc - pdwpc
#     zdwpc = rdwpc / sddwpc
#     return rdwpc, zdwpc

# # This took forever
# # rdwpc, zdwpc = permutation_adjust(DWPCs.values())

## Combine DWPC values with gene-pairs of interest

In [10]:
# Map each identifier to its position along the DWPC matrix axes
row_to_position = {gene_id: position for position, gene_id in enumerate(row_ids)}
col_to_position = {gene_id: position for position, gene_id in enumerate(col_ids)}

# Unique (pQTL gene, locus gene) pairs, annotated with matrix positions.
# Pairs whose gene is absent from either axis map to NaN and are dropped
# before the positions are cast to int.
pair_df = (
    snp_df[['pqtl_entrez_id', 'locus_entrez_id']]
    .drop_duplicates()
    .assign(
        row_index=lambda df: df.pqtl_entrez_id.map(row_to_position),
        col_index=lambda df: df.locus_entrez_id.map(col_to_position),
    )
    .dropna(subset=['row_index', 'col_index'])
    .astype({'row_index': int, 'col_index': int})
)
pair_df.tail(2)

Unnamed: 0,pqtl_entrez_id,locus_entrez_id,row_index,col_index
131459,354,79290,250,12434
131460,354,26095,250,8756


In [11]:
# Look up the value at each (row, col) position in every DWPC matrix
indices = list(zip(pair_df.row_index, pair_df.col_index))
rows = list()
for i, j in tqdm.tqdm_notebook(indices):
    row = {'row_index': i, 'col_index': j}
    row.update((name, matrix[i, j]) for name, matrix in DWPCs.items())
    rows.append(row)

# Attach the looked-up values to the gene pairs (joined on the shared
# row_index/col_index columns), then index the result by the Entrez ID pair
lookup_df = pandas.DataFrame(rows)
dwpc_df = pair_df.merge(lookup_df)
dwpc_df = dwpc_df.drop(columns=['row_index', 'col_index'])
dwpc_df = dwpc_df.set_index(['pqtl_entrez_id', 'locus_entrez_id'])
dwpc_df.tail(2)




Unnamed: 0_level_0,Unnamed: 1_level_0,hetionet-v1.0,hetionet-v1.0-perm-1,hetionet-v1.0-perm-2,hetionet-v1.0-perm-3,hetionet-v1.0-perm-4,hetionet-v1.0-perm-5
pqtl_entrez_id,locus_entrez_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
354,79290,0.0,1.97311,0.0,0.0,2.183583,0.0
354,26095,0.0,2.491907,0.0,0.0,0.0,0.0


In [12]:
# Select columns by name rather than by position: this cell appends r-dwpc and
# z-dwpc to dwpc_df, so a positional slice would fold those columns into the
# permutation statistics if the cell were run a second time.
perm_df = dwpc_df.filter(like='-perm-', axis='columns')
# r-dwpc: unpermuted DWPC minus the mean DWPC across the permuted hetnets
r_dwpc = dwpc_df['hetionet-v1.0'] - perm_df.mean(axis='columns')
# z-dwpc: r-dwpc divided by the standard deviation across the permuted hetnets.
# NOTE: when all permuted values of a pair are identical the deviation is 0,
# which yields +/-inf (or NaN when r-dwpc is also 0).
z_dwpc = r_dwpc / perm_df.std(axis='columns')
dwpc_df['r-dwpc'] = r_dwpc
dwpc_df['z-dwpc'] = z_dwpc
dwpc_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,hetionet-v1.0,hetionet-v1.0-perm-1,hetionet-v1.0-perm-2,hetionet-v1.0-perm-3,hetionet-v1.0-perm-4,hetionet-v1.0-perm-5,r-dwpc,z-dwpc
pqtl_entrez_id,locus_entrez_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6401,28,0.0,0.0,3.381078,0.0,1.838338,0.0,-1.043883,-0.682309
6401,6838,0.0,0.0,0.0,0.0,2.637118,0.0,-0.527424,-0.447214


In [13]:
# Join the DWPC statistics back onto the SNP table (on the shared columns), then
# order rows so that within each source / SNP / pQTL gene the highest z-dwpc comes first
candidate_df = (
    snp_df
    .merge(dwpc_df.reset_index())
    .sort_values(
        by=['gwas_source', 'snp', 'pqtl_gene', 'z-dwpc'],
        ascending=[True, True, True, False],
    )
)
candidate_df.head(2)

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,hetionet-v1.0,hetionet-v1.0-perm-1,hetionet-v1.0-perm-2,hetionet-v1.0-perm-3,hetionet-v1.0-perm-4,hetionet-v1.0-perm-5,r-dwpc,z-dwpc
52332,rs1003645,CCL23,6368,19.64,10.1101/086793,TBC1D3B,414059,152437,3.310107,0.0,0.0,0.0,0.0,0.0,3.310107,inf
52336,rs1003645,CCL23,6368,19.64,10.1101/086793,TBC1D3B,414060,152437,3.310107,0.0,0.0,0.0,0.0,0.0,3.310107,inf


In [14]:
# Write the sorted candidate table as tab-separated text, without the index.
# NOTE(review): the filename spells "candidates" as 'canidates' — confirm that
# nothing depends on this path before renaming it.
candidate_df.to_csv('canidates-GpBPpG.tsv', sep='\t', index=False)

## Potential candidates of interest

In [15]:
# One DataFrame per (gwas_source, pqtl_gene) combination, in groupby key order
groups = [
    group_df
    for _, group_df in candidate_df.groupby(['gwas_source', 'pqtl_gene'])
]

In [16]:
# Display the group at position 2 (ADAMTS13 in the recorded output);
# the position depends on the groupby key order, so it may shift if the input data changes
groups[2]

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,hetionet-v1.0,hetionet-v1.0-perm-1,hetionet-v1.0-perm-2,hetionet-v1.0-perm-3,hetionet-v1.0-perm-4,hetionet-v1.0-perm-5,r-dwpc,z-dwpc
50893,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,GBGT1,26301,252688,4.026301,0.0,0.0,1.486983,1.495825,0.0,3.429739,4.198576
50885,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,ABO,28,141419,4.277619,1.846876,0.0,2.14327,0.0,0.0,3.479589,3.169718
50897,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,VAV2,7410,345261,3.313591,2.906062,2.999959,1.48432,1.5839,2.375942,1.043554,1.462464
50905,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,TSC1,7248,472000,2.807188,1.924093,1.903773,3.016568,1.476603,2.42013,0.658955,1.11823
50887,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,DBH,1621,219727,1.627891,2.347237,0.0,0.985311,0.0,0.0,0.961381,0.931671
50891,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,SARDH,1757,246927,1.78073,0.0,1.614069,0.0,0.0,2.279476,1.002021,0.917656
50875,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,RPL7A,6130,65207,1.976077,0.0,2.193117,1.864113,1.042933,1.913061,0.573432,0.641605
50865,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,SLC2A6,11182,54462,0.0,0.0,0.0,0.0,1.705636,0.0,-0.341127,-0.447214
50881,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,ADAMTSL2,9719,115531,0.0,0.0,0.0,2.082032,0.0,0.0,-0.416406,-0.447214
50913,rs3118663,ADAMTS13,11093,19.858,10.1101/086793,WDR5,11091,718732,0.0,0.0,1.537629,0.0,0.0,0.0,-0.307526,-0.447214


In [17]:
# Display the group at position 7 (AHSG in the recorded output);
# the position depends on the groupby key order, so it may shift if the input data changes
groups[7]

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,hetionet-v1.0,hetionet-v1.0-perm-1,hetionet-v1.0-perm-2,hetionet-v1.0-perm-3,hetionet-v1.0-perm-4,hetionet-v1.0-perm-5,r-dwpc,z-dwpc
1258,rs2518136,AHSG,197,38.778,10.1101/086793,KNG1,3827,97238,3.852442,2.927596,1.937343,2.096135,1.996018,1.610039,1.739016,3.547242
1267,rs2518136,AHSG,197,38.778,10.1101/086793,ADIPOQ,9370,222652,3.554057,2.537769,2.337484,3.011258,3.290958,2.72237,0.774089,2.046353
1243,rs2518136,AHSG,197,38.778,10.1101/086793,FETUB,26998,15931,4.200453,0.0,0.0,3.022197,2.523667,0.0,3.09128,2.021778
1285,rs2518136,AHSG,197,38.778,10.1101/086793,MASP1,5648,597247,3.593551,0.0,0.0,0.0,1.356417,3.065362,2.709195,2.001889
1249,rs2518136,AHSG,197,38.778,10.1101/086793,HRG,3273,40178,3.537533,3.256477,2.333693,1.718726,1.320958,2.885306,1.234501,1.544658
1261,rs2518136,AHSG,197,38.778,10.1101/086793,EIF4A2,1974,163167,1.274641,1.601668,0.0,0.0,0.0,0.0,0.954307,1.332296
1252,rs2518136,AHSG,197,38.778,10.1101/086793,TBCCD1,55171,49495,0.0,0.0,0.0,0.0,3.087021,0.0,-0.617404,-0.447214
1255,rs2518136,AHSG,197,38.778,10.1101/086793,CRYGS,1427,73336,0.0,0.0,3.319187,0.0,0.0,0.0,-0.663837,-0.447214
1276,rs2518136,AHSG,197,38.778,10.1101/086793,RPL39L,116832,500910,0.0,0.0,0.0,0.0,1.752841,0.0,-0.350568,-0.447214
1291,rs2518136,AHSG,197,38.778,10.1101/086793,RTP4,64108,748294,0.0,0.0,0.0,0.0,0.0,1.697639,-0.339528,-0.447214


In [18]:
# Display the group at position 8 (AKR1A1 in the recorded output);
# the position depends on the groupby key order, so it may shift if the input data changes
groups[8]

Unnamed: 0,snp,pqtl_gene,pqtl_entrez_id,neg_log10_p,gwas_source,locus_gene,locus_entrez_id,distance_to_snp,hetionet-v1.0,hetionet-v1.0-perm-1,hetionet-v1.0-perm-2,hetionet-v1.0-perm-3,hetionet-v1.0-perm-4,hetionet-v1.0-perm-5,r-dwpc,z-dwpc
38188,rs6662572,AKR1A1,10327,12.485,10.1101/086793,HPDL,84842,352607,2.563024,0.0,0.0,0.0,0.0,0.0,2.563024,inf
38202,rs6662572,AKR1A1,10327,12.485,10.1101/086793,POMGNT1,55624,507542,2.839443,0.0,0.0,0.0,0.0,0.0,2.839443,inf
38174,rs6662572,AKR1A1,10327,12.485,10.1101/086793,MMACHC,25974,167758,5.955812,1.562045,2.070967,0.0,0.0,0.0,5.229209,5.171902
38226,rs6662572,AKR1A1,10327,12.485,10.1101/086793,FAAH,2166,713125,4.241539,0.0,0.0,3.484074,1.61545,1.890655,2.843503,1.944857
81565,rs6662572,AKR1A1,10327,12.485,10.1101/086793,TEX38,374973,987715,1.497775,0.0,0.0,0.0,1.621972,0.0,1.17338,1.617635
38192,rs6662572,AKR1A1,10327,12.485,10.1101/086793,PIK3R3,8503,359000,2.069552,1.100919,0.0,0.0,2.547846,1.461883,1.047422,0.974937
38186,rs6662572,AKR1A1,10327,12.485,10.1101/086793,MUTYH,4595,340670,1.78225,0.0,2.445538,0.0,1.363455,2.357242,0.549003,0.456246
38194,rs6662572,AKR1A1,10327,12.485,10.1101/086793,UROD,7389,665565,1.119655,1.084041,0.0,2.023879,1.579135,0.0,0.182244,0.198514
38224,rs6662572,AKR1A1,10327,12.485,10.1101/086793,RPS8,6202,902361,0.937233,1.881014,0.0,0.0,0.0,2.187382,0.123554,0.110372
38210,rs6662572,AKR1A1,10327,12.485,10.1101/086793,UQCRH,7388,622491,1.700625,1.959989,0.0,2.002919,2.160267,2.437343,-0.011479,-0.011771
