In [1]:
# Set thread count for network propagation functions.
# Caps Intel MKL at 4 threads for this kernel; raise or lower the value to
# match the cores available on the machine running the notebook.
# NOTE(review): presumably this must run before the propagation code does any
# MKL-backed linear algebra -- confirm against network_GWAS_functions.py.
import mkl
mkl.set_num_threads(4)

# Import other required packages
import os
import pandas as pd

# Propagating GWAS Hits from WTCCC

In [2]:
# Load functions for performing network GWAS from python module file.
# %run executes the script and leaves the names it defines in the notebook
# namespace. NOTE(review): load_network and random_walk, used by later cells,
# are presumably defined in this script -- confirm.
%run './network_GWAS_functions.py'

In [3]:
# Load network for propagation.
# The network file is read as tab-delimited text; the resulting `network`
# object is what every random_walk call below propagates over.
# NOTE(review): the expected column layout of PCNet.txt is defined by
# load_network, which is not visible in this notebook -- confirm there.
network = load_network('./Data/PCNet.txt', delimiter='\t')

Network File Loaded: 18.7298469543 seconds
Number of network nodes: 19781
Number of network edges: 2724724


In [4]:
# Load example GWAS-based gene-disease associations from file as pandas Series.
# Every file in `wd` whose name ends in 'GWAS_Associations.csv' is read as a
# header-less CSV: the first column becomes the index and the second column
# (integer label 1) is kept as a Series named 'GWAS P'.
# Results:
#   diseases          -- list of disease names, in sorted file-name order
#   GWAS_Associations -- dict mapping disease name -> 'GWAS P' Series
wd = './Data/'
diseases, GWAS_Associations = [], {}
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `diseases` (and of every later per-disease loop) reproducible between runs.
for fn in sorted(os.listdir(wd)):
    if fn.endswith('GWAS_Associations.csv'):
        # The disease name is the part of the file name before the first '_'
        disease_name = fn.split('_')[0]
        diseases.append(disease_name)
        # header=None is the supported way to say "no header row"; negative
        # header values such as -1 are rejected by current pandas releases.
        GWAS_Association_table = pd.read_csv(os.path.join(wd, fn), index_col=0, header=None)
        GWAS_Associations[disease_name] = GWAS_Association_table[1]
        GWAS_Associations[disease_name].name = 'GWAS P'

In [11]:
# P-value threshold for this run. It is kept as a string (not a float) because
# it is concatenated directly into result file names and plot titles below.
p_thresh = '1e-5'

In [12]:
# Perform random walk propagation for each disease.
# For every disease, propagate its GWAS associations over `network` and save
# the resulting table as '<save_dir><disease>_p<p_thresh>_prop_results.csv'.
save_dir = './Results/'
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before the (slow) propagation starts.
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
for disease in diseases:
    # Derive the numeric threshold from the same `p_thresh` string that labels
    # the output file, so the file name always matches the threshold used.
    propagation_results_table = random_walk(network, GWAS_Associations[disease], p_thresh=float(p_thresh))
    propagation_results_table.to_csv(save_dir+disease+'_p'+p_thresh+'_prop_results.csv')
    # Blank line between the per-disease logs; print('') emits it under both
    # Python 2 and Python 3 (a bare `print` is a no-op expression in Python 3).
    print('')

Significant genes to be propagated: Index([u'CHD9', u'DUSP26', u'POU3F3', u'GPD1L', u'TDRD9', u'CDC25B',
       u'ATP6V1G1', u'RNPEPL1', u'LAMP3', u'C14orf37', u'DPP10', u'CAPN6',
       u'TBCC', u'LPIN1', u'NFIA'],
      dtype='object', name=0)
Calculated Alpha: 0.56
Adjacency matrix normalized
Network propagation seeds mapped
Network propagation complete: 23 iterations




Combined network propagation results constructed
Significant genes to be propagated: Index([u'IL2', u'RASA1', u'DEXI', u'IL2RA', u'ENAH', u'HLA-DQA1', u'PTPN2',
       u'SOCS1', u'ERBB3', u'PHTF1', u'HSPA4'],
      dtype='object', name=0)
Calculated Alpha: 0.56
Adjacency matrix normalized
Network propagation seeds mapped
Network propagation complete: 23 iterations
Combined network propagation results constructed
Significant genes to be propagated: Index([u'BCAT1', u'PCDH9', u'CHRM3', u'COL22A1', u'NR2F2', u'CHPT1'], dtype='object', name=0)
Calculated Alpha: 0.56
Adjacency matrix normalized
Network propagation seeds mapped
Network propagation complete: 24 iterations
Combined network propagation results constructed
Significant genes to be propagated: Index([u'IL2RB', u'ANAPC4', u'TNFAIP3', u'PODXL', u'IL2RA', u'HLA-DQA2',
       u'GJB6', u'PHTF1', u'BACE2'],
      dtype='object', name=0)
Calculated Alpha: 0.56
Adjacency matrix normalized
Network propagation seeds mapped
Network propagati

# Evaluation of Propagation Results vs Disease Gold Standard

In [13]:
# Load gold standard gene sets.
# Each non-blank line of the file is tab-separated: the first field becomes
# the dictionary key (looked up by disease name in the evaluation cells) and
# the remaining fields become that key's list of genes.
gold_std_disease_geneset = {}
# The context manager closes the file handle even if reading fails.
with open('./Data/WTCCC_DisGeNET_Gold_Standards.txt') as f:
    lines = f.read().splitlines()
for line in lines:
    # Skip blank lines (e.g. a trailing newline at end of file) so they do not
    # create a spurious '' key with an empty gene list.
    if not line.strip():
        continue
    line_split = line.split('\t')
    gold_std_disease_geneset[line_split[0]] = line_split[1:]

In [45]:
# Load functions for evaluating network GWAS results.
# %run executes the script and leaves the names it defines in the notebook
# namespace. NOTE(review): PRC_plots and prop_score_distributions, used by the
# cells below, are presumably defined in this script -- confirm.
%run './network_GWAS_results_analysis_functions.py'

In [42]:
# Score each disease's saved propagation results against its gold standard
# gene set and collect the three AUPRC values returned by PRC_plots per disease.
GWAS_AUPRCs = {}
prop_AUPRCs = {}
adjp_AUPRCs = {}

for disease in diseases:
    # Shared '<disease>_p<threshold>' tag used for both input and output names
    run_tag = disease+'_p'+p_thresh
    # Read back the table written by the propagation step
    results_table = pd.read_csv(save_dir+run_tag+'_prop_results.csv', index_col=0)
    # Precision/recall evaluation; outputs are filed under save_dir with run_tag as prefix
    GWAS_AUPRC, prop_AUPRC, adjp_AUPRC = PRC_plots(
        results_table,
        gold_std_disease_geneset[disease],
        p_thresh=p_thresh,
        outdir=save_dir,
        plot_title=disease+' (Propagation Threshold: p <= '+p_thresh+')',
        file_prefix=run_tag,
    )
    # Keep the per-disease scores for later comparison
    GWAS_AUPRCs[disease] = GWAS_AUPRC
    prop_AUPRCs[disease] = prop_AUPRC
    adjp_AUPRCs[disease] = adjp_AUPRC

In [47]:
# Second, looser p-value threshold, passed to prop_score_distributions as
# `weak_p_thresh`. Kept as a string, like p_thresh.
# NOTE(review): exactly how the helper uses this threshold is not visible in
# this notebook -- confirm in network_GWAS_results_analysis_functions.py.
weak_p_thresh='1e-4'

In [48]:
# Visualize propagation score distributions for each disease from its saved
# results table.
for disease in diseases:
    # Shared '<disease>_p<threshold>' tag used for both input and output names
    run_tag = disease+'_p'+p_thresh
    # Read back the table written by the propagation step
    results_table = pd.read_csv(save_dir+run_tag+'_prop_results.csv', index_col=0)
    # Hand the table, gold standard and both thresholds to the distribution helper
    # (presumably writes its figures under save_dir -- confirm in the helper module)
    prop_score_distributions(
        results_table,
        gold_std_disease_geneset[disease],
        p_thresh=p_thresh,
        weak_p_thresh=weak_p_thresh,
        outdir=save_dir,
        plot_title=disease+' (Propagation Threshold: p <= '+p_thresh+')',
        file_prefix=run_tag,
    )