In [1]:
import pandas as pd
import os.path
import numpy as np
import PIL
import glob
from PIL import Image

In [2]:
os.chdir('/Users/KevinBu/Desktop/clemente_lab/CUTIE/')
stat_df = pd.read_csv('scripts/stat_dict.txt', sep = '\t')
rev_stat_df = pd.read_csv('scripts/rev_stat_dict.txt', sep = '\t')
stat_df.head()

Unnamed: 0,statistic,cutie,jackknifing,bootstrapping,cooks_d
0,pearson,kpc,jkp,bsp,pointwise_kpc
1,spearman,ksc,jks,bss,pointwise_ksc
2,kendall,kkc,jkk,bsk,pointwise_kkc
3,mine,mine,jkm,bsm,pointwise_mine


In [3]:
distributions = ['sim_copula_n50_norm_0_1', 
                 'sim_copula_n50_gamma_1_0_100',
                 'sim_copula_n50_lognorm_3_0',
                 'sim_zero_infl_otu_copula_n50_lognorm_3_0']

headers = [
    'analysis_id',
    'distribution',
    'statistic',
    'method',
    'cookd', # binary
    'defaulted', # binary
    'initial_corr',
    'true_corr(TP_FN)',
    'false_corr(FP_TN)',
    'rs_true_corr_TP_FN',
    'rs_false_corr_FP_TN'
]

    
    
def parse_log(f, cookd):
    lines = [l.strip() for l in f.readlines()]
    defaulted = False
    if cookd:
        for l in lines:
            if "defaulted" in l: 
                defaulted = True
            elif "initial_corr" in l:
                initial_corr = float(l.split(' ')[-1])
            elif "false correlations according to cookd" in l:
                false_corr = float(l.split(' ')[-1])
            elif "true correlations according to cookd" in l:
                true_corr = float(l.split(' ')[-1])
        rs_false = np.nan
        rs_true = np.nan
        
    else:
        # check if FDR correction defaulted
        for l in lines:
            if "defaulted" in l: 
                defaulted = True
            elif "initial_corr" in l:
                initial_corr = float(l.split(' ')[-1])
            elif "false correlations" in l:
                false_corr = float(l.split(' ')[-1])
            elif "true correlations" in l:
                true_corr = float(l.split(' ')[-1])
            elif "FP/TN1" in l:
                rs_false = float(l.split(' ')[-1])
            elif "TP/FN1" in l:
                rs_true = float(l.split(' ')[-1])
    
    return defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true



dist_to_short = {
  'sim_copula_n50_norm_0_1': 'norm', 
                 'sim_copula_n50_gamma_1_0_100': 'gamma',
                 'sim_copula_n50_lognorm_3_0': 'lognorm',
                 'sim_zero_infl_otu_copula_n50_lognorm_3_0': 'zi-lognorm'
}


for_df = pd.DataFrame()
# for each statistic
for statistic in stat_df['statistic'].values:
    # statistic = 'pearson'
    subset_df = stat_df.set_index('statistic').loc[statistic,:]
    rev_subset_df = rev_stat_df.set_index('statistic').loc[statistic,:]

    # for each distribution
    for dist in distributions:
        # dist = 'sim_copula_n50_norm_0_1'
        
        for m in subset_df:
            # m = 'kpc'
            path = '../Submissions/CUtIe/final_data_fixed/' + dist + '_' + m + '1fdr0.05/'
            cookd = False
            analysis_id = str(dist + '_' + m + '1fdr0.05')
            if 'pointwise' not in analysis_id:
                files = sorted(glob.glob(path + '*.txt'))
                # grab most recent log file
                rel_logfile = files[-1]
                if 'jk' in m:
                    method = 'jackknife'
                elif 'bs' in m:
                    method = 'bootstrap'
                else:
                    method = 'cutie'
                with open(rel_logfile, 'r') as f:
                    try: 
                        defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true = parse_log(f, cookd)

                        new_row = pd.DataFrame([[analysis_id, dist_to_short[dist], statistic, method, cookd, 
                                                defaulted, initial_corr, true_corr, false_corr, rs_true, rs_false]], columns=headers)

                        for_df = for_df.append(new_row)
                    except: 
                        print(analysis_id)
                
            if m in ['kpc','ksc','kkc','mine']:
                path = '../Submissions/CUtIe/final_data_fixed/' + dist + '_pointwise_' + m + '1fdr0.05/'
                cookd = True
                analysis_id = str(dist + '_pointwise_' + m + '1fdr0.05')
                files = glob.glob(path + '/*.txt')
                rel_logfile = files[-1]
                with open(rel_logfile, 'r') as f:
                    try:
                        defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true = parse_log(f, cookd)

                        new_row = pd.DataFrame([[analysis_id, dist_to_short[dist], statistic, 'cookd', cookd, 
                                                defaulted, initial_corr, true_corr, false_corr, rs_true, rs_false]], columns=headers)

                        for_df = for_df.append(new_row)
                    except:
                        print(analysis_id, m)
                    

In [4]:
for_df.head()

Unnamed: 0,analysis_id,distribution,statistic,method,cookd,defaulted,initial_corr,true_corr(TP_FN),false_corr(FP_TN),rs_true_corr_TP_FN,rs_false_corr_FP_TN
0,sim_copula_n50_norm_0_1_kpc1fdr0.05,norm,pearson,cutie,False,False,180624.0,168834.0,11790.0,0.0,0.0
0,sim_copula_n50_norm_0_1_pointwise_kpc1fdr0.05,norm,pearson,cookd,True,False,180624.0,180609.0,15.0,,
0,sim_copula_n50_norm_0_1_jkp1fdr0.05,norm,pearson,jackknife,False,False,180624.0,178126.0,2498.0,0.0,0.0
0,sim_copula_n50_norm_0_1_bsp1fdr0.05,norm,pearson,bootstrap,False,False,180624.0,149417.0,31207.0,3017.0,26055.0
0,sim_copula_n50_gamma_1_0_100_kpc1fdr0.05,gamma,pearson,cutie,False,False,167938.0,153294.0,14644.0,0.0,0.0


In [5]:
for_df.to_csv('../Submissions/CUtIe/final_data_fixed/final_sim_df.txt', sep = '\t', index = False)

In [7]:
header = [
    'Norm / CUTIE',
    'Norm / Jackknife',
    'Norm / Bootstrap',
    'Norm / Cook D',
    'Gamma / CUTIE',
    'Gamma / Jackknife',
    'Gamma / Bootstrap',
    'Gamma / Cook D',
    'LogN / CUTIE',
    'LogN / Jackknife',
    'LogN / Bootstrap',
    'LogN / Cook D',
    'ZI-LogN / CUTIE',
    'ZI-LogN / Jackknife',
    'ZI-LogN / Bootstrap',
    'ZI-LogN / Cook D',
]

index = [
    'Pearson',
    '% initial sig',
    'Spearman',
    '% initial sig',
    'Kendall',
    '% initial sig',
    'MIC',
    '% initial sig',

]
dists = ['norm', 'gamma', 'lognorm', 'zi-lognorm']
method = ['cutie', 'jackknife', 'bootstrap', 'cookd']
df_array = []
for stat in stat_df['statistic']:
    # stat = 'Pearson'
    row_fracs = []
    for dist in dists:
        # dist = 'norm'
        for m in range(len(method)):
            # method[m] = 'cutie'
            # (df["B"] > 50) & (df["C"] == 900)
            row = for_df[(for_df['distribution'] == dist) & (for_df['statistic'] == stat) & (for_df['method'] == method[m])] 
            row_fracs.append(float(row['true_corr(TP_FN)'] /row['initial_corr'].values)) # correctly id tp
    df_array.append(row_fracs)
    
    initial_sig_fracs = []
    for dist in dists:
         for m in range(len(method)):
            row = for_df[(for_df['distribution'] == dist) & (for_df['statistic'] == stat) & (for_df['method'] == method[m])] 
            # change number 249500 to n_corr depending on dataset
            initial_sig_fracs.append(float(row['initial_corr'] / 249500))
    df_array.append(initial_sig_fracs)
    
new_df = pd.DataFrame(data = df_array, index = index, columns = header) 
new_df = new_df.rename_axis('Statistic')
#new_df



In [8]:
new_df.to_csv('../Submissions/CUtIe/final_data_fixed/tpfp.csv', index = True)

In [9]:

rev_df = pd.DataFrame()
# for each statistic
for statistic in stat_df['statistic'].values:
    # statistic = 'pearson'
    rev_subset_df = rev_stat_df.set_index('statistic').loc[statistic,:]

    # for each distribution
    for dist in distributions:
        # dist = 'sim_copula_n50_norm_0_1'
        
        for m in rev_subset_df:
            # m = 'kpc'
            path = '../Submissions/CUtIe/final_data_fixed/' + dist + '_' + m + '1fdr0.05/'
            cookd = False
            analysis_id = str(dist + '_' + m + '1fdr0.05')
            if 'pointwise' not in analysis_id:
                files = sorted(glob.glob(path + '*.txt'))
                # grab most recent log file
                rel_logfile = files[-1]
                if 'jk' in m:
                    method = 'jackknife'
                elif 'bs' in m:
                    method = 'bootstrap'
                else:
                    method = 'cutie'
                with open(rel_logfile, 'r') as f:
                    try: 
                        defaulted, initial_corr, false_corr, true_corr, rs_false, rs_true = parse_log(f, cookd)

                        new_row = pd.DataFrame([[analysis_id, dist_to_short[dist], statistic, method, cookd, 
                                                defaulted, initial_corr, true_corr, false_corr, rs_true, rs_false]], columns=headers)

                        rev_df = rev_df.append(new_row)
                    except: 
                        print(analysis_id)

                
        

sim_copula_n50_norm_0_1_rbsm1fdr0.05
sim_copula_n50_gamma_1_0_100_rbsm1fdr0.05
sim_copula_n50_lognorm_3_0_rbsm1fdr0.05
sim_zero_infl_otu_copula_n50_lognorm_3_0_rbsm1fdr0.05


In [10]:
rev_df.head()

Unnamed: 0,analysis_id,distribution,statistic,method,cookd,defaulted,initial_corr,true_corr(TP_FN),false_corr(FP_TN),rs_true_corr_TP_FN,rs_false_corr_FP_TN
0,sim_copula_n50_norm_0_1_rpc1fdr0.05,norm,pearson,cutie,False,False,68876.0,13694.0,55182.0,14652.0,0.0
0,sim_copula_n50_norm_0_1_rjkp1fdr0.05,norm,pearson,jackknife,False,False,68876.0,30.0,68846.0,14652.0,0.0
0,sim_copula_n50_norm_0_1_rbsp1fdr0.05,norm,pearson,bootstrap,False,False,68876.0,0.0,68876.0,68876.0,0.0
0,sim_copula_n50_gamma_1_0_100_rpc1fdr0.05,gamma,pearson,cutie,False,False,81562.0,21794.0,59768.0,21500.0,14.0
0,sim_copula_n50_gamma_1_0_100_rjkp1fdr0.05,gamma,pearson,jackknife,False,False,81562.0,350.0,81212.0,21514.0,0.0


In [11]:
rev_df.to_csv('../Submissions/CUtIe/final_data_fixed/final_sim_df_rev.txt', sep = '\t', index = False)

In [12]:
header = [
    'Norm / CUTIE',
    'Norm / Jackknife',
    'Norm / Bootstrap',
    'Norm / Cook D',
    'Gamma / CUTIE',
    'Gamma / Jackknife',
    'Gamma / Bootstrap',
    'Gamma / Cook D',
    'LogN / CUTIE',
    'LogN / Jackknife',
    'LogN / Bootstrap',
    'LogN / Cook D',
    'ZI-LogN / CUTIE',
    'ZI-LogN / Jackknife',
    'ZI-LogN / Bootstrap',
    'ZI-LogN / Cook D',
]

index = [
    'Pearson',
    '% initial insig',
    'Spearman',
    '% initial insig',
    'Kendall',
    '% initial insig',
    'MIC',
    '% initial insig',

]
dists = ['norm', 'gamma', 'lognorm', 'zi-lognorm']
method = ['cutie', 'jackknife', 'bootstrap', 'cookd']
df_array = []
for stat in stat_df['statistic']:
    # stat = 'Pearson'
    row_fracs = []
    for dist in dists:
        # dist = 'norm'
        for m in range(len(method)):
            # method[m] = 'cutie'
            # (df["B"] > 50) & (df["C"] == 900)
            row = rev_df[(rev_df['distribution'] == dist) & (rev_df['statistic'] == stat) & (rev_df['method'] == method[m])] 
            row_fracs.append(float(row['true_corr(TP_FN)'] /row['initial_corr'].values)) # correctly id fn
    df_array.append(row_fracs)
    
    initial_sig_fracs = []
    for dist in dists:
         for m in range(len(method)):
            row = rev_df[(rev_df['distribution'] == dist) & (rev_df['statistic'] == stat) & (rev_df['method'] == method[m])] 
            # change number 249500 to n_corr depending on dataset
            initial_sig_fracs.append(float(row['initial_corr'] / 249500))
    df_array.append(initial_sig_fracs)
    
new_df = pd.DataFrame(data = df_array, index = index, columns = header) 
new_df = new_df.rename_axis('Statistic')
#new_df



TypeError: cannot convert the series to <class 'float'>

In [None]:
rev_df.head()

In [None]:
rev_df.to_csv('../Submissions/CUtIe/final_data_fixed/final_sim_df_rev.txt', sep = '\t', index = False)