In [1]:
import pandas as pd
import os.path
import numpy as np
import PIL
import glob
from PIL import Image

In [2]:
# Work from the CUTIE checkout so that the relative paths below resolve.
os.chdir('/Users/KevinBu/Desktop/clemente_lab/CUTIE/')

# Both lookup tables are tab-separated text files.
stat_df = pd.read_csv(
    'scripts/stat_dict_real.txt',
    sep='\t',
)
rev_stat_df = pd.read_csv(
    'scripts/rev_stat_dict_real.txt',
    sep='\t',
)

# Preview the first table.
stat_df.head()

Unnamed: 0,statistic,lungc,hdac,who
0,pearson,lungc_kpc,hdac_kpc,WHO_kpc
1,spearman,lungc_ksc,hdac_ksc,WHO_ksc
2,kendall,lungc_kkc,hdac_kkc,WHO_kkc
3,mine,lungc_mine,hdac_mine,WHO_mine


In [3]:

# Column names for the one-row summary frames built from each parsed log further down
# in this cell. The order here has to match the order in which the values are listed
# when those rows are constructed.
headers = [
    'analysis_id',
    'distribution',
    'statistic',
    'defaulted', # binary
    'initial_corr',
    'true_corr(TP_FN)',
    'false_corr(FP_TN)',
    'rs_true_corr_TP_FN',
    'rs_false_corr_FP_TN',
    'runtime'
]

    
def parse_log(f):
    """Pull the summary figures out of an open analysis log.

    Every line is stripped and tested against a fixed sequence of marker
    substrings; the first marker found on a line decides how the line is used.
    A line containing ``defaulted`` only sets the flag. For every other marker
    the last space-separated token of the line is converted to ``float``. When
    a marker shows up on several lines, the last occurrence is kept.

    Parameters
    ----------
    f : file object
        Open text file (any iterable of lines works) holding the log.

    Returns
    -------
    tuple
        ``(defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true, runtime)``
        with ``defaulted`` a bool and the remaining entries floats.

    Raises
    ------
    ValueError
        If one of the expected fields never appears in the log (previously this
        surfaced as an ``UnboundLocalError`` at the return statement), or if a
        matched line does not end in a number.
    """
    # (marker substring, result key), tested in this order; first match wins.
    markers = (
        ('initial_corr', 'initial_corr'),
        ('false correlations', 'false_corr'),
        ('true correlations', 'true_corr'),
        ('FP/TN1', 'rs_false'),
        ('TP/FN1', 'rs_true'),
        ('runtime', 'runtime'),
    )

    defaulted = False
    values = {}
    for raw_line in f:
        line = raw_line.strip()
        # check if FDR correction defaulted
        if 'defaulted' in line:
            defaulted = True
            continue
        for marker, key in markers:
            if marker in line:
                values[key] = float(line.split(' ')[-1])
                break

    missing = [key for _, key in markers if key not in values]
    if missing:
        raise ValueError('log is missing expected field(s): ' + ', '.join(missing))

    return (
        defaulted,
        values['initial_corr'],
        values['false_corr'],
        values['true_corr'],
        values['rs_false'],
        values['rs_true'],
        values['runtime'],
    )


# Translate the method code at the end of an analysis label (the text after the
# last '_') into the statistic name stored in the summary table.
m_to_stat = {
    'kpc': 'pearson',
    'ksc': 'spearman',
    'kkc': 'kendall',
    'mine': 'mine'
}

# Rows are gathered in a plain list and turned into a frame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
for_rows = []

# Columns 1-3 of stat_df are treated as dataset names.
for dataset in stat_df.columns.values[1:4]:
    # Analysis labels of this dataset, one per statistic.
    subset_df = stat_df.set_index('statistic').loc[:, dataset]

    for m in subset_df:
        # e.g. m = 'lungc_kpc'
        analysis_id = 'cutie_' + m + '1fdr0.05'
        path = '../Submissions/CUtIe/final_data_fixed/' + analysis_id + '/'

        # The last file in name order is used. NOTE(review): this is the most
        # recent log only if the file names sort chronologically -- confirm.
        files = sorted(glob.glob(path + '*.txt'))
        if not files:
            # Report and skip instead of failing the whole cell with IndexError,
            # in line with how unparsable logs are handled below.
            print(analysis_id + ': no *.txt log found in ' + path)
            continue
        rel_logfile = files[-1]

        with open(rel_logfile, 'r') as f:
            try:
                defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true, runtime = parse_log(f)

                # Value order must follow `headers`.
                for_rows.append([analysis_id, dataset, m_to_stat[m.split('_')[-1]],
                                 defaulted, initial_corr, true_corr, false_corr,
                                 rs_true, rs_false, runtime])
            except Exception as exc:
                # Best effort: name the analysis that could not be summarised and
                # why, then carry on. Unlike a bare `except:`, KeyboardInterrupt
                # is not swallowed.
                print(analysis_id, repr(exc))

# One row per successfully parsed analysis. The index now runs 0..n-1 rather than
# repeating 0 for every row as the append-based version did.
for_df = pd.DataFrame(for_rows, columns=headers)
                


In [4]:
# Preview the first rows of the collected summary (one row per parsed analysis log).
for_df.head()

Unnamed: 0,analysis_id,distribution,statistic,defaulted,initial_corr,true_corr(TP_FN),false_corr(FP_TN),rs_true_corr_TP_FN,rs_false_corr_FP_TN,runtime
0,cutie_lungc_kpc1fdr0.05,lungc,pearson,False,15458.0,1098.0,14360.0,0.0,11396.0,1079.91
0,cutie_lungc_ksc1fdr0.05,lungc,spearman,False,28304.0,9092.0,19212.0,0.0,13542.0,3723.45
0,cutie_lungc_kkc1fdr0.05,lungc,kendall,False,27636.0,8796.0,18840.0,0.0,13240.0,2245.4
0,cutie_lungc_mine1fdr0.05,lungc,mine,False,580.0,496.0,84.0,0.0,0.0,676.08
0,cutie_hdac_kpc1fdr0.05,hdac,pearson,False,4122.0,3470.0,652.0,0.0,0.0,599.0


In [5]:
# Write the per-analysis summary as a tab-separated file, leaving out the row index.
for_df.to_csv(
    '../Submissions/CUtIe/final_data_fixed/final_real_df.txt',
    sep='\t',
    index=False,
)

In [6]:
# Column labels of the summary table, positionally paired with `dists` below.
header = [
    'Microbiome',
    'Gene Expression',
    'WHO'
]

# Row labels: for each statistic, one row with true_corr / initial_corr followed by
# one row with initial_corr / (number of correlations in the dataset).
index = [
    'Pearson',
    '% initial sig',
    'Spearman',
    '% initial sig',
    'Kendall',
    '% initial sig',
    'MIC',
    '% initial sig',

]
dists = ['lungc', 'hdac', 'who']
method = ['cutie', 'jackknife', 'bootstrap', 'cookd']  # not used in this cell
# Number of correlations per dataset, used as the denominator of the second row.
# NOTE(review): 'hdac' was 100 * 100. The other entries use n * (n - 1), and the hdac
# pearson counts displayed in this notebook (4122 in the forward table + 5778 in the
# reverse table) add up to 9900 = 100 * 99, just as the lungc counts add up to
# 748 * 747. Please confirm against the hdac input data.
dist_to_corr = {
    'lungc': 748 * 747,
    'hdac': 100 * 99,
    'who': 354 * 353
}

df_array = []
for stat in stat_df['statistic']:
    true_fracs = []
    initial_sig_fracs = []
    for dist in dists:
        match = for_df[(for_df['distribution'] == dist) & (for_df['statistic'] == stat)]
        if len(match) != 1:
            # float() on the filtered Series used to fail here with an unhelpful
            # TypeError (and is deprecated for Series in pandas 2.1).
            raise ValueError('expected exactly one row for %s / %s, found %d'
                             % (dist, stat, len(match)))
        # Select the column first so the scalars keep their numeric dtype.
        initial_corr = match['initial_corr'].iloc[0]
        true_corr = match['true_corr(TP_FN)'].iloc[0]
        true_fracs.append(float(true_corr / initial_corr))
        initial_sig_fracs.append(float(initial_corr / dist_to_corr[dist]))
    # Two rows per statistic, in the order the labels in `index` expect.
    df_array.append(true_fracs)
    df_array.append(initial_sig_fracs)

new_df = pd.DataFrame(data=df_array, index=index, columns=header)
new_df = new_df.rename_axis('Statistic')
#new_df



In [7]:
# Save the summary table as CSV, keeping the row labels as the first column.
new_df.to_csv(
    '../Submissions/CUtIe/final_data_fixed/real_tpfp.csv',
    index=True,
)

In [8]:

# Column names for the one-row summary frames built from each parsed log further down
# in this cell. The order here has to match the order in which the values are listed
# when those rows are constructed.
headers = [
    'analysis_id',
    'distribution',
    'statistic',
    'defaulted', # binary
    'initial_corr',
    'true_corr(TP_FN)',
    'false_corr(FP_TN)',
    'rs_true_corr_TP_FN',
    'rs_false_corr_FP_TN',
    'runtime'
]

    
def parse_log(f):
    """Pull the summary figures out of an open analysis log.

    Every line is stripped and tested against a fixed sequence of marker
    substrings; the first marker found on a line decides how the line is used.
    A line containing ``defaulted`` only sets the flag. For every other marker
    the last space-separated token of the line is converted to ``float``. When
    a marker shows up on several lines, the last occurrence is kept.

    Parameters
    ----------
    f : file object
        Open text file (any iterable of lines works) holding the log.

    Returns
    -------
    tuple
        ``(defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true, runtime)``
        with ``defaulted`` a bool and the remaining entries floats.

    Raises
    ------
    ValueError
        If one of the expected fields never appears in the log (previously this
        surfaced as an ``UnboundLocalError`` at the return statement), or if a
        matched line does not end in a number.
    """
    # (marker substring, result key), tested in this order; first match wins.
    markers = (
        ('initial_corr', 'initial_corr'),
        ('false correlations', 'false_corr'),
        ('true correlations', 'true_corr'),
        ('FP/TN1', 'rs_false'),
        ('TP/FN1', 'rs_true'),
        ('runtime', 'runtime'),
    )

    defaulted = False
    values = {}
    for raw_line in f:
        line = raw_line.strip()
        # check if FDR correction defaulted
        if 'defaulted' in line:
            defaulted = True
            continue
        for marker, key in markers:
            if marker in line:
                values[key] = float(line.split(' ')[-1])
                break

    missing = [key for _, key in markers if key not in values]
    if missing:
        raise ValueError('log is missing expected field(s): ' + ', '.join(missing))

    return (
        defaulted,
        values['initial_corr'],
        values['false_corr'],
        values['true_corr'],
        values['rs_false'],
        values['rs_true'],
        values['runtime'],
    )


# Translate the method code at the end of an analysis label (the text after the
# last '_') into the statistic name stored in the summary table.
m_to_stat = {
    'rpc': 'pearson',
    'rsc': 'spearman',
    'rkc': 'kendall',
    'rmine': 'mine'
}

# Rows are gathered in a plain list and turned into a frame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
rev_rows = []

# Columns 1-3 of rev_stat_df are treated as dataset names.
for dataset in rev_stat_df.columns.values[1:4]:
    # Analysis labels of this dataset, one per statistic.
    rev_subset_df = rev_stat_df.set_index('statistic').loc[:, dataset]

    for m in rev_subset_df:
        # e.g. m = 'lungc_rpc'
        analysis_id = 'cutie_' + m + '1fdr0.05'
        path = '../Submissions/CUtIe/final_data_fixed/' + analysis_id + '/'

        # The last file in name order is used. NOTE(review): this is the most
        # recent log only if the file names sort chronologically -- confirm.
        files = sorted(glob.glob(path + '*.txt'))
        if not files:
            # Report and skip instead of failing the whole cell with IndexError,
            # in line with how unparsable logs are handled below.
            print(analysis_id + ': no *.txt log found in ' + path)
            continue
        rel_logfile = files[-1]

        with open(rel_logfile, 'r') as f:
            try:
                defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true, runtime = parse_log(f)

                # Value order must follow `headers`.
                rev_rows.append([analysis_id, dataset, m_to_stat[m.split('_')[-1]],
                                 defaulted, initial_corr, true_corr, false_corr,
                                 rs_true, rs_false, runtime])
            except Exception as exc:
                # Best effort: name the analysis that could not be summarised and
                # why, then carry on. Unlike a bare `except:`, KeyboardInterrupt
                # is not swallowed.
                print(analysis_id, repr(exc))

# One row per successfully parsed analysis. The index now runs 0..n-1 rather than
# repeating 0 for every row as the append-based version did.
rev_df = pd.DataFrame(rev_rows, columns=headers)
                


In [9]:
# Preview the first rows of the collected summary (one row per parsed analysis log).
rev_df.head()

Unnamed: 0,analysis_id,distribution,statistic,defaulted,initial_corr,true_corr(TP_FN),false_corr(FP_TN),rs_true_corr_TP_FN,rs_false_corr_FP_TN,runtime
0,cutie_lungc_rpc1fdr0.05,lungc,pearson,False,543298.0,13260.0,530038.0,297502.0,10142.0,31306.66
0,cutie_lungc_rsc1fdr0.05,lungc,spearman,False,530452.0,9524.0,520928.0,318706.0,4970.0,61934.97
0,cutie_lungc_rkc1fdr0.05,lungc,kendall,False,531120.0,9314.0,521806.0,319334.0,4866.0,38631.15
0,cutie_lungc_rmine1fdr0.05,lungc,mine,False,558176.0,338.0,557838.0,252450.0,0.0,51888.13
0,cutie_hdac_rpc1fdr0.05,hdac,pearson,False,5778.0,768.0,5010.0,1306.0,6.0,853.05


In [24]:
# Write the per-analysis summary as a tab-separated file, leaving out the row index.
rev_df.to_csv(
    '../Submissions/CUtIe/final_data_fixed/final_real_df_rev.txt',
    sep='\t',
    index=False,
)

In [25]:
# Column labels of the summary table, positionally paired with `dists` below.
header = [
    'Microbiome',
    'Gene Expression',
    'WHO'
]

# Row labels: for each statistic, one row with true_corr / initial_corr followed by
# one row with initial_corr / (number of correlations in the dataset).
index = [
    'Pearson',
    '% initial insig',
    'Spearman',
    '% initial insig',
    'Kendall',
    '% initial insig',
    'MIC',
    '% initial insig',

]
dists = ['lungc', 'hdac', 'who']
method = ['cutie', 'jackknife', 'bootstrap', 'cookd']  # not used in this cell
# Number of correlations per dataset, used as the denominator of the second row.
# NOTE(review): 'hdac' was 100 * 100. The other entries use n * (n - 1), and the hdac
# pearson counts displayed in this notebook (4122 in the forward table + 5778 in the
# reverse table) add up to 9900 = 100 * 99, just as the lungc counts add up to
# 748 * 747. Please confirm against the hdac input data.
dist_to_corr = {
    'lungc': 748 * 747,
    'hdac': 100 * 99,
    'who': 354 * 353
}

df_array = []
for stat in rev_stat_df['statistic']:
    true_fracs = []
    initial_sig_fracs = []
    for dist in dists:
        match = rev_df[(rev_df['distribution'] == dist) & (rev_df['statistic'] == stat)]
        if len(match) != 1:
            # float() on the filtered Series used to fail here with an unhelpful
            # TypeError (and is deprecated for Series in pandas 2.1).
            raise ValueError('expected exactly one row for %s / %s, found %d'
                             % (dist, stat, len(match)))
        # Select the column first so the scalars keep their numeric dtype.
        initial_corr = match['initial_corr'].iloc[0]
        true_corr = match['true_corr(TP_FN)'].iloc[0]
        true_fracs.append(float(true_corr / initial_corr))
        initial_sig_fracs.append(float(initial_corr / dist_to_corr[dist]))
    # Two rows per statistic, in the order the labels in `index` expect.
    df_array.append(true_fracs)
    df_array.append(initial_sig_fracs)

new_df = pd.DataFrame(data=df_array, index=index, columns=header)
new_df = new_df.rename_axis('Statistic')
#new_df



In [26]:
# Save the summary table as CSV, keeping the row labels as the first column.
new_df.to_csv(
    '../Submissions/CUtIe/final_data_fixed/real_tnfn.csv',
    index=True,
)