In [1]:
import pandas as pd
import numpy as np
import json
import os
from tqdm.notebook import tqdm

In [2]:
# disease = "asthma"
# Rank cutoffs K at which the evaluation cells below compute each metric
# (they iterate over this list once per result file).
K_values = [5, 10, 20, 50, 100]
# NOTE(review): presumably the alpha values that appear in the "alpha<value>"
# part of the result file names; this list is not referenced by the cells
# visible here — confirm where it is used.
alpha_array = [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]

In [38]:
# for disease in ["asthma", "autism", "schizophrenia"]:
#     with open("processed_data/gwas_catalog_targets/{}_targets_gene2ncbi.json".format(disease), "r") as f:
#         catalog_targets_gene2ncbi = json.load(f)
#     ncbi_targets = list(catalog_targets_gene2ncbi.values())
#     print(disease, len(ncbi_targets))

In [3]:
def precision_at_k(targets, predictions, K=None):
    """Return the fraction of the considered predictions that are targets.

    Parameters
    ----------
    targets : iterable of hashable
        Identifiers counted as correct.
    predictions : sequence of hashable
        Predicted identifiers, best first. Must support slicing and ``len``.
    K : int, optional
        Cutoff. When given, only ``predictions[:K]`` is considered and the hit
        count is divided by ``K`` itself, even if fewer than ``K`` predictions
        are available. When ``None``, every prediction is considered and the
        hit count is divided by ``len(predictions)``.

    Returns
    -------
    float
        Number of distinct targets among the considered predictions divided by
        the denominator described above; ``0.0`` when that denominator is zero
        (``K == 0``, or ``K is None`` with no predictions).
    """
    if K is not None:
        predictions = predictions[:K]
        denom = K
    else:
        denom = len(predictions)
    if denom == 0:
        # Nothing was considered: report zero precision instead of raising
        # ZeroDivisionError.
        return 0.0
    # Set intersection, so a target repeated in the predictions counts once.
    hits = len(set(targets).intersection(predictions))
    return hits / denom
        
def recall_at_k(targets, predictions, K=None):
    """Return the fraction of distinct targets found among the predictions.

    Parameters
    ----------
    targets : iterable of hashable
        Identifiers counted as correct. Duplicates are collapsed, so the
        denominator is the number of *distinct* targets, matching the
        set-based hit count in the numerator.
    predictions : sequence of hashable
        Predicted identifiers, best first. Must support slicing.
    K : int, optional
        Cutoff. When given, only ``predictions[:K]`` is considered; when
        ``None``, every prediction is considered.

    Returns
    -------
    float
        Distinct targets retrieved divided by distinct targets overall;
        ``0.0`` when ``targets`` is empty.
    """
    relevant = set(targets)
    if not relevant:
        # No targets to recover: avoid ZeroDivisionError.
        return 0.0
    if K is not None:
        predictions = predictions[:K]
    return len(relevant.intersection(predictions)) / len(relevant)
        
def average_precision_at_k(targets, predictions, K=None):
    """Return the mean of precision@k over every cutoff k = 1, ..., K.

    Note that this averages precision at *every* rank up to ``K`` (not only at
    the ranks where a target is retrieved). Each term equals
    ``precision_at_k(targets, predictions, k)``: distinct targets among the
    first ``k`` predictions divided by ``k``. The terms are accumulated in a
    single pass instead of rebuilding the sets for every cutoff.

    Parameters
    ----------
    targets : iterable of hashable
        Identifiers counted as correct.
    predictions : sequence of hashable
        Predicted identifiers, best first. Must support slicing and ``len``.
    K : int, optional
        Largest cutoff. Defaults to ``len(predictions)`` when ``None``. Cutoffs
        beyond the number of available predictions still divide by ``k``.

    Returns
    -------
    float
        Mean of the ``K`` precision values; ``0.0`` when there is no cutoff to
        average over (``K < 1``).
    """
    if K is None:
        K = len(predictions)
    if K < 1:
        # No cutoff to average over; avoid np.mean([]) -> nan plus a warning.
        return 0.0

    target_set = set(targets)
    ranked = list(predictions[:K])
    counted = set()  # targets already credited, so duplicates count once
    hits = 0
    precisions = []
    for rank in range(1, K + 1):
        if rank <= len(ranked):
            item = ranked[rank - 1]
            if item in target_set and item not in counted:
                counted.add(item)
                hits += 1
        # Past the end of the ranking the hit count stays flat while the
        # denominator keeps growing.
        precisions.append(hits / rank)
    return np.mean(precisions)



# Gene Scores

In [4]:
# Evaluate every per-disease result file found in `folder`: the row order of
# each CSV is taken as the gene ranking, seed genes are removed from it, and
# Precision / Average Precision are computed at every cutoff in K_values
# against the GWAS-catalog targets. One row per (file, K, metric) is collected
# and written to `output_csv`. recall_at_k is not recorded here.
folder = "outputs/RWRs_scores/"
output_csv = "outputs/results_RWRs/RWRs_gene_scores_single_nets.csv"
records = []
for disease in ["asthma", "autism", "schizophrenia"]:
    # Load seeds
    with open("processed_data/gene_seeds/{}_seeds_gene2ncbi.json".format(disease), "r") as f:
        disease_seeds_gene2ncbi = json.load(f)
    gene_seeds = list(disease_seeds_gene2ncbi.values())
    # Set of stringified seed IDs: constant-time membership tests, compared in
    # the same string form as the ranked IDs built below.
    seed_ids = {str(seed) for seed in gene_seeds}

    # Load targets
    with open("processed_data/gwas_catalog_targets/{}_targets_gene2ncbi.json".format(disease), "r") as f:
        catalog_targets_gene2ncbi = json.load(f)
    ncbi_targets = list(catalog_targets_gene2ncbi.values())
    print(len(ncbi_targets))

    # Only this disease's files are listed, so no further name check is needed
    # inside the loop.
    flist = sorted(f for f in os.listdir(folder) if disease in f)
    for fname in tqdm(flist):
        # File names look like "<disease>_<network>_alpha<value>_<suffix>".
        # Splitting from the right keeps the unpacking valid even if the
        # network name itself contains "_".
        stem, a, _ = fname.rsplit("_", 2)
        d, network = stem.split("_", 1)
        alpha = float(a.replace("alpha", ""))

        data = pd.read_csv(os.path.join(folder, fname))
        if data.isna().any().any():
            # Report the offending file and skip only that file; the remaining
            # files of this disease are still evaluated.
            print(fname)
            continue

        genes = [gene_id for gene_id in map(str, data["Gene NCBI ID"]) if gene_id not in seed_ids]
        for k in K_values:
            pak = precision_at_k(ncbi_targets, genes, k)
            apk = average_precision_at_k(ncbi_targets, genes, k)
            records.append({"Network": network, "Alpha": alpha, "Metric": "Precision", "K": k, "Value": pak, "Method": "Score", "Disease": disease})
            records.append({"Network": network, "Alpha": alpha, "Metric": "Average Precision", "K": k, "Value": apk, "Method": "Score", "Disease": disease})

# Make sure the destination directory exists before writing.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
results = pd.DataFrame(records)
results.to_csv(output_csv, index=False)
results

896


  0%|          | 0/30 [00:00<?, ?it/s]

552


  0%|          | 0/30 [00:00<?, ?it/s]

439


  0%|          | 0/30 [00:00<?, ?it/s]

Unnamed: 0,Network,Alpha,Metric,K,Value,Method,Disease
0,BioPlex3,0.1,Precision,5,0.800000,Score,asthma
1,BioPlex3,0.1,Average Precision,5,0.910000,Score,asthma
2,BioPlex3,0.1,Precision,10,0.700000,Score,asthma
3,BioPlex3,0.1,Average Precision,10,0.821429,Score,asthma
4,BioPlex3,0.1,Precision,20,0.700000,Score,asthma
...,...,...,...,...,...,...,...
895,STRING,0.9,Average Precision,20,0.010851,Score,schizophrenia
896,STRING,0.9,Precision,50,0.040000,Score,schizophrenia
897,STRING,0.9,Average Precision,50,0.024020,Score,schizophrenia
898,STRING,0.9,Precision,100,0.030000,Score,schizophrenia


# Seed Genes

In [11]:
# Evaluate every per-disease result file found in `folder`: the row order of
# each CSV is taken as the gene ranking, seed genes are removed from it, and
# Precision / Average Precision are computed at every cutoff in K_values
# against the GWAS-catalog targets. One row per (file, K, metric) is collected
# and written to `output_csv`. recall_at_k is not recorded here.
folder = "outputs/RWRs_gene_seeds/"
output_csv = "outputs/results_RWRs/RWRs_seed_genes_single_nets.csv"
records = []
for disease in ["asthma", "autism", "schizophrenia"]:
    # Load seeds
    with open("processed_data/gene_seeds/{}_seeds_gene2ncbi.json".format(disease), "r") as f:
        disease_seeds_gene2ncbi = json.load(f)
    gene_seeds = list(disease_seeds_gene2ncbi.values())
    # Set of stringified seed IDs: constant-time membership tests, compared in
    # the same string form as the ranked IDs built below.
    seed_ids = {str(seed) for seed in gene_seeds}

    # Load targets
    with open("processed_data/gwas_catalog_targets/{}_targets_gene2ncbi.json".format(disease), "r") as f:
        catalog_targets_gene2ncbi = json.load(f)
    ncbi_targets = list(catalog_targets_gene2ncbi.values())
    print(len(ncbi_targets))

    # Only this disease's files are listed, so no further name check is needed
    # inside the loop.
    flist = sorted(f for f in os.listdir(folder) if disease in f)
    for fname in tqdm(flist):
        # File names look like "<disease>_<network>_alpha<value>_<suffix>".
        # Splitting from the right keeps the unpacking valid even if the
        # network name itself contains "_".
        stem, a, _ = fname.rsplit("_", 2)
        d, network = stem.split("_", 1)
        alpha = float(a.replace("alpha", ""))

        data = pd.read_csv(os.path.join(folder, fname))
        if data.isna().any().any():
            # A file containing missing values is reported and scored as an
            # empty prediction list, which gives 0 for both metrics at every K.
            print(fname)
            genes = []
        else:
            genes = [gene_id for gene_id in map(str, data["Gene NCBI ID"]) if gene_id not in seed_ids]

        for k in K_values:
            pak = precision_at_k(ncbi_targets, genes, k)
            apk = average_precision_at_k(ncbi_targets, genes, k)
            records.append({"Network": network, "Alpha": alpha, "Metric": "Precision", "K": k, "Value": pak, "Method": "Seed Genes", "Disease": disease})
            records.append({"Network": network, "Alpha": alpha, "Metric": "Average Precision", "K": k, "Value": apk, "Method": "Seed Genes", "Disease": disease})

# Make sure the destination directory exists before writing.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
results = pd.DataFrame(records)
results.to_csv(output_csv, index=False)
results

896


  0%|          | 0/30 [00:00<?, ?it/s]

552


  0%|          | 0/30 [00:00<?, ?it/s]

autism_ProteomeHD_alpha0.1_results.csv
autism_ProteomeHD_alpha0.2_results.csv
autism_ProteomeHD_alpha0.3_results.csv
autism_ProteomeHD_alpha0.5_results.csv
autism_ProteomeHD_alpha0.7_results.csv
autism_ProteomeHD_alpha0.9_results.csv
439


  0%|          | 0/30 [00:00<?, ?it/s]

Unnamed: 0,Network,Alpha,Metric,K,Value,Method,Disease
0,BioPlex3,0.1,Precision,5,0.800000,Seed Genes,asthma
1,BioPlex3,0.1,Average Precision,5,0.843333,Seed Genes,asthma
2,BioPlex3,0.1,Precision,10,0.700000,Seed Genes,asthma
3,BioPlex3,0.1,Average Precision,10,0.799206,Seed Genes,asthma
4,BioPlex3,0.1,Precision,20,0.450000,Seed Genes,asthma
...,...,...,...,...,...,...,...
895,STRING,0.9,Average Precision,20,0.385494,Seed Genes,schizophrenia
896,STRING,0.9,Precision,50,0.100000,Seed Genes,schizophrenia
897,STRING,0.9,Average Precision,50,0.229390,Seed Genes,schizophrenia
898,STRING,0.9,Precision,100,0.140000,Seed Genes,schizophrenia


In [7]:
# Sanity check: does the most recently loaded `data` frame hold any missing value?
data.isnull().to_numpy().any()

False

In [8]:
# results[(results["Network"]=="ProteomeHD") & (results["Disease"]=="autism")]