In [1]:
import pandas as pd
import numpy as np
import json
import os
from tqdm.notebook import tqdm

In [2]:
# disease = "asthma"
# Rank cut-offs K: every ranking metric below is evaluated on the top-K genes
# for each value in this list.
K_values = [5, 10, 20, 50, 100]
# NOTE(review): not referenced in the visible cells. Presumably the alpha values
# of the RWR runs whose score files are evaluated below — confirm against the
# cell that produces those files.
alpha_array = [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]

In [3]:
# for disease in ["asthma", "autism", "schizophrenia"]:
#     with open("processed_data/gwas_catalog_targets/{}_targets_gene2ncbi.json".format(disease), "r") as f:
#         catalog_targets_gene2ncbi = json.load(f)
#     ncbi_targets = list(catalog_targets_gene2ncbi.values())
#     print(disease, len(ncbi_targets))

In [4]:
def precision_at_k(targets, predictions, K=None):
    """Fraction of the (top-K) predictions that are targets.

    Parameters
    ----------
    targets : iterable of hashable
        Relevant items. Duplicates are ignored.
    predictions : sequence of hashable
        Ranked predictions, best first. A repeated prediction is counted once.
    K : int, optional
        Rank cut-off. When given, only ``predictions[:K]`` is considered and the
        denominator is ``K`` itself, even if fewer than ``K`` predictions are
        available. When ``None``, all predictions are used and the denominator
        is ``len(predictions)``.

    Returns
    -------
    float
        Number of distinct target hits divided by the denominator, or ``0.0``
        when the denominator is not positive (empty predictions, ``K <= 0``).
    """
    if K is not None:
        predictions = predictions[:K]
        denom = K
    else:
        denom = len(predictions)
    # Nothing was evaluated: report zero precision instead of dividing by zero
    # (or by a negative K, which would yield a negative "precision").
    if denom <= 0:
        return 0.0
    num = len(set(targets).intersection(predictions))
    return num / denom
        
def recall_at_k(targets, predictions, K=None):
    """Fraction of the distinct targets found among the (top-K) predictions.

    Parameters
    ----------
    targets : iterable of hashable
        Relevant items. Duplicates are collapsed, so numerator and denominator
        are both counted over distinct targets.
    predictions : sequence of hashable
        Ranked predictions, best first.
    K : int, optional
        Rank cut-off. When given, only ``predictions[:K]`` is considered;
        when ``None``, all predictions are used.

    Returns
    -------
    float
        Distinct target hits divided by the number of distinct targets, or
        ``0.0`` when there are no targets.
    """
    target_set = set(targets)
    # Without targets recall is undefined; return 0.0 rather than raising.
    if not target_set:
        return 0.0
    if K is not None:
        predictions = predictions[:K]
    # Divide by the number of *distinct* targets: hits are counted through a set,
    # so dividing by len(targets) would cap recall below 1.0 whenever the target
    # list contains repeated IDs.
    return len(target_set.intersection(predictions)) / len(target_set)
        
def average_precision_at_k(targets, predictions, K=None):
    """Mean of precision@k over every cut-off k = 1..K.

    Note that this averages precision at *every* rank from 1 to K, not only at
    the ranks where a target is retrieved.

    Parameters
    ----------
    targets : iterable of hashable
        Relevant items. Duplicates are ignored.
    predictions : sequence of hashable
        Ranked predictions, best first. A repeated prediction is counted as a
        hit only the first time it appears.
    K : int, optional
        Largest cut-off. Defaults to ``len(predictions)`` when ``None``. Ranks
        beyond the end of ``predictions`` add no hits but are still averaged in
        (precision@k keeps ``k`` as its denominator).

    Returns
    -------
    float
        The mean of the K precision values, or ``0.0`` when ``K <= 0``.
    """
    if K is None:
        K = len(predictions)
    if K <= 0:
        return 0.0

    target_set = set(targets)
    top = list(predictions[:K])
    counted = set()  # targets already credited, so repeats are not double counted
    hits = 0
    precisions = []
    # Single pass with a running hit count: gives the same values as computing
    # precision@k independently for each prefix, without rebuilding the sets.
    for rank in range(1, K + 1):
        if rank <= len(top):
            item = top[rank - 1]
            if item in target_set and item not in counted:
                counted.add(item)
                hits += 1
        precisions.append(hits / rank)
    return np.mean(precisions)


# Multiplex RWR - Gene scores (keep first)

In [5]:
# Score every Multiplex RWR gene ranking against the GWAS-catalog targets.
# For each disease and each score file in `folder`, the seed genes are removed
# from the ranking, then Precision@K and Average Precision@K are recorded for
# every K in K_values. All rows are written to a single CSV.
folder = "outputs/MultiplexRWRs_scores/"
output_path = "outputs/results_MultiplexRWRs/MultiplexRWR_gene_scores.csv"
records = []
for disease in ["asthma", "autism", "schizophrenia"]:
    # Load seeds
    with open("processed_data/gene_seeds/{}_seeds_gene2ncbi.json".format(disease), "r") as f:
        disease_seeds_gene2ncbi = json.load(f)
    gene_seeds = list(disease_seeds_gene2ncbi.values())
    # Set of stringified IDs: constant-time membership tests, and seeds are still
    # matched if the JSON happens to store the IDs as integers.
    seed_ids = {str(seed) for seed in gene_seeds}

    # Load targets
    with open("processed_data/gwas_catalog_targets/{}_targets_gene2ncbi.json".format(disease), "r") as f:
        catalog_targets_gene2ncbi = json.load(f)
    ncbi_targets = list(catalog_targets_gene2ncbi.values())
    print(len(ncbi_targets))

    flist = sorted(f for f in os.listdir(folder) if disease in f)
    for fname in tqdm(flist):
        # The third "_"-separated token of the file name carries alpha,
        # e.g. "asthma_multiplex_alpha0.1_results.csv" -> 0.1.
        alpha = float(fname.split("_")[2].replace("alpha", ""))

        data = pd.read_csv(os.path.join(folder, fname))
        if data.isna().any().any():
            # Skip only the offending file. Breaking out of the loop here would
            # silently drop every remaining file of this disease while the
            # summary CSV is still written.
            print("Skipping {} (contains NaN values)".format(fname))
            continue

        # Ranked gene IDs in file order, with the seed genes removed.
        genes = [gene for gene in data["Gene NCBI ID"].astype(str) if gene not in seed_ids]
        for k in K_values:
            pak = precision_at_k(ncbi_targets, genes, k)
            apk = average_precision_at_k(ncbi_targets, genes, k)
            records.append({"Alpha": alpha, "Metric": "Precision", "K": k, "Value": pak, "Method": "Multiplex RWR", "Disease": disease})
            records.append({"Alpha": alpha, "Metric": "Average Precision", "K": k, "Value": apk, "Method": "Multiplex RWR", "Disease": disease})

results = pd.DataFrame(records)
# Create the output directory on a fresh checkout so to_csv does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results.to_csv(output_path, index=False)
results

896


  0%|          | 0/6 [00:00<?, ?it/s]

552


  0%|          | 0/6 [00:00<?, ?it/s]

439


  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,Alpha,Metric,K,Value,Method,Disease
0,0.1,Precision,5,0.800000,Multiplex RWR,asthma
1,0.1,Average Precision,5,0.960000,Multiplex RWR,asthma
2,0.1,Precision,10,0.700000,Multiplex RWR,asthma
3,0.1,Average Precision,10,0.829762,Multiplex RWR,asthma
4,0.1,Precision,20,0.650000,Multiplex RWR,asthma
...,...,...,...,...,...,...
175,0.9,Average Precision,20,0.000000,Multiplex RWR,schizophrenia
176,0.9,Precision,50,0.000000,Multiplex RWR,schizophrenia
177,0.9,Average Precision,50,0.000000,Multiplex RWR,schizophrenia
178,0.9,Precision,100,0.000000,Multiplex RWR,schizophrenia


In [9]:
# NOTE(review): debugging leftover — displays whatever `fname` currently holds in
# the kernel. The only visible assignment is the loop variable of the scoring
# cell, so this cell depends on execution order; remove it or replace it with an
# explicit value before sharing the notebook.
fname

'asthma_multiplex_alpha0.1_results.csv'