In [55]:
import pandas as pd
from scipy.stats import kendalltau, spearmanr, rankdata
from scipy.spatial.distance import canberra

import os
from tqdm import tqdm

In [69]:
# Label of each ranker -> name of the CSV file that holds its ranking.
# The keys become the row/column labels of every similarity matrix built in
# this notebook; the values are appended to a directory path and read with
# pandas, using the first CSV column as index and the 'rank' column as data.
ranks_files = {
    "ReliefF": "rf.csv",
    "GeoDE": "gd.csv",
    "GR": "gr.csv",
    "SU": "su.csv",
    "Wx": "wx.csv"
}


def get_similarity_corr(path, correlation_function):
    """Build pairwise statistic and p-value matrices for one directory of rankings.

    Every file listed in ``ranks_files`` is read from ``path``, its rows are
    re-ordered to follow the index of ``gd.csv`` and its ``rank`` column is
    re-ranked with ``rankdata``.  ``correlation_function`` is then evaluated on
    every ordered pair of rankings, each ranking paired with itself included.

    Parameters
    ----------
    path : str
        Directory containing the ranking CSV files.  File names are appended
        by plain string concatenation, so it must end with a path separator.
    correlation_function : callable
        Called as ``correlation_function(ranks_a, ranks_b)``; its result must
        unpack into exactly two values (statistic, p-value).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cdf, pdf)``: statistics and p-values.  Both are labelled with the
        keys of ``ranks_files`` on rows and columns; column ``X``, row ``Y``
        holds the result of ``correlation_function(ranks[X], ranks[Y])``.
    """
    # gd.csv provides the row order every other ranking is aligned to.
    reference_index = pd.read_csv(path+"gd.csv", index_col=0).index

    method_ranks = {}
    for method, filename in ranks_files.items():
        table = pd.read_csv(path+filename, index_col=0)
        method_ranks[method] = rankdata(table.loc[reference_index, 'rank'])

    methods = list(method_ranks.keys())
    statistics = {}
    p_values = {}
    for column in methods:
        column_statistics = []
        column_p_values = []
        for row in methods:
            statistic, p_value = correlation_function(method_ranks[column], method_ranks[row])
            column_statistics.append(statistic)
            column_p_values.append(p_value)
        statistics[column] = column_statistics
        p_values[column] = column_p_values

    cdf = pd.DataFrame(statistics, index=methods)
    pdf = pd.DataFrame(p_values, index=methods)
    return cdf, pdf


def get_cd_similarity_corr(path, correlation_function):
    """Build one pairwise similarity matrix for a directory of rankings.

    Every file listed in ``ranks_files`` is read from ``path``, aligned to the
    index of ``gd.csv`` and its ``rank`` column is re-ranked with ``rankdata``.
    ``correlation_function`` is then evaluated once on every ordered pair of
    rankings (each ranking paired with itself included).

    Parameters
    ----------
    path : str
        Directory containing the ranking CSV files.  File names are appended
        by plain string concatenation, so it must end with a path separator.
    correlation_function : callable
        Called as ``correlation_function(ranks_a, ranks_b)``.  It may return
        either a two-element result (e.g. statistic and p-value) or a single
        value (e.g. a distance such as ``canberra``).

    Returns
    -------
    pandas.DataFrame
        Rows and columns are labelled with the keys of ``ranks_files``.  A
        cell holds a plain ``(c, p)`` tuple when the metric's result unpacks
        into two values, otherwise the metric's result unchanged.

    Raises
    ------
    Exception
        Whatever ``correlation_function`` or the CSV reading raises; errors
        are no longer swallowed and retried.
    """
    ranks = {}
    idx_ref = pd.read_csv(path+"gd.csv", index_col=0).index

    for key, value in ranks_files.items():
        ranks[key] = rankdata(pd.read_csv(path+value, index_col=0).loc[idx_ref, 'rank'])

    df = {}
    for key, rank in ranks.items():
        column = []
        for rank_pair in ranks.values():
            # Evaluate the metric exactly once; only the unpacking is guarded.
            result = correlation_function(rank, rank_pair)
            try:
                c, p = result
            except (TypeError, ValueError):
                # Not a two-element result (scalar metric): store it as is.
                column.append(result)
            else:
                column.append((c, p))
        df[key] = column

    dataframe = pd.DataFrame(df)
    dataframe.index = list(ranks.keys())
    return dataframe

    
def get_matrixes(read_from, correlation_function, n_bootstraps=50):
    """Compute statistic and p-value matrices for every bootstrap directory.

    Parameters
    ----------
    read_from : str
        Directory that contains the ``bootstrap_<i>/`` sub-directories.  Names
        are appended by plain string concatenation, so it must end with a
        path separator.
    correlation_function : callable
        Forwarded to ``get_similarity_corr``.
    n_bootstraps : int, optional
        Number of bootstrap directories to process, numbered from 1 to
        ``n_bootstraps`` inclusive.  Defaults to 50.

    Returns
    -------
    tuple of list
        ``(matrixes_c, matrixes_p)``: the first and second value returned by
        ``get_similarity_corr`` for each bootstrap, in bootstrap order.
    """
    bs_paths = [read_from+"bootstrap_"+str(i)+"/" for i in range(1, n_bootstraps+1)]
    matrixes_c = []
    matrixes_p = []
    for path in bs_paths:
        cdf, pdf = get_similarity_corr(path, correlation_function)
        matrixes_c.append(cdf)
        matrixes_p.append(pdf)

    return matrixes_c, matrixes_p


def get_dp_matrixes(read_from, correlation_function, n_bootstraps=50):
    """Compute one similarity matrix per bootstrap directory.

    Parameters
    ----------
    read_from : str
        Directory that contains the ``bootstrap_<i>/`` sub-directories.  Names
        are appended by plain string concatenation, so it must end with a
        path separator.
    correlation_function : callable
        Forwarded to ``get_cd_similarity_corr``.
    n_bootstraps : int, optional
        Number of bootstrap directories to process, numbered from 1 to
        ``n_bootstraps`` inclusive.  Defaults to 50.

    Returns
    -------
    list
        The result of ``get_cd_similarity_corr`` for each bootstrap, in
        bootstrap order.
    """
    bs_paths = [read_from+"bootstrap_"+str(i)+"/" for i in range(1, n_bootstraps+1)]
    return [get_cd_similarity_corr(path, correlation_function) for path in bs_paths]


def get_canberra_matrixes(read_from, n_bootstraps=50):
    """Compute one Canberra-distance matrix per bootstrap directory.

    Parameters
    ----------
    read_from : str
        Directory that contains the ``bootstrap_<i>/`` sub-directories.  Names
        are appended by plain string concatenation, so it must end with a
        path separator.
    n_bootstraps : int, optional
        Number of bootstrap directories to process, numbered from 1 to
        ``n_bootstraps`` inclusive.  Defaults to 50.

    Returns
    -------
    list
        The result of ``get_cd_similarity_corr(path, canberra)`` for each
        bootstrap, in bootstrap order.
    """
    bs_paths = [read_from+"bootstrap_"+str(i)+"/" for i in range(1, n_bootstraps+1)]
    return [get_cd_similarity_corr(path, canberra) for path in bs_paths]
    

def save_matrixes(saving_path, matrixes1, matrixes2=None, sufix="f1_bs"):
    """Write per-bootstrap similarity matrices to CSV files under ``saving_path``.

    Two layouts are produced, depending on whether ``matrixes2`` is given:

    * ``matrixes2`` is ``None`` ("tuple matrixes"): each matrix of
      ``matrixes1`` is written to
      ``<saving_path>tuple_matrixes/<sufix>bs_<i>.csv``.  No mean is written.
    * otherwise ("double matrixes"): ``matrixes1[i]`` and ``matrixes2[i]`` are
      written to ``<saving_path>double_matrixes/corr_<sufix><i>.csv`` and
      ``.../pval_<sufix><i>.csv``.  Their element-wise means over all
      bootstraps go to ``.../mean/mean_corr_<tag>.csv`` and
      ``.../mean/mean_pval_<tag>.csv``, where ``<tag>`` is ``sufix`` without
      its last three characters.

    ``<i>`` counts from 1.  Output directories are created when missing and
    reused when present, so the function can be called again for another fold
    (or re-run) with the same ``saving_path``; files with the same name are
    overwritten.

    Parameters
    ----------
    saving_path : str
        Output root; names are appended by plain string concatenation, so it
        must end with a path separator.
    matrixes1 : sequence of pandas.DataFrame
        Matrices to save (statistics in the "double" layout).
    matrixes2 : sequence of pandas.DataFrame, optional
        P-value matrices, indexed in parallel with ``matrixes1``.
    sufix : str, optional
        Tag embedded in the file names, e.g. ``"f1_bs"``.

    Returns
    -------
    None
    """
    # NOTE(review): "tuple" file names contain sufix followed by "bs_", which
    # yields e.g. "f1_bsbs_1.csv" for the default sufix.  Kept unchanged because
    # downstream readers may rely on these names - confirm before renaming.
    if matrixes2 is None:
        path = saving_path+"tuple_matrixes/"
        os.makedirs(saving_path+"tuple_matrixes", exist_ok=True)
        for i, m in enumerate(tqdm(matrixes1)):
            m.to_csv(path+sufix+"bs_"+str(i+1)+".csv")
        return

    path = saving_path+"double_matrixes/"
    os.makedirs(saving_path+"double_matrixes", exist_ok=True)
    for i, m in enumerate(tqdm(matrixes1)):
        m.to_csv(path+"corr_"+sufix+str(i+1)+".csv")
        matrixes2[i].to_csv(path+"pval_"+sufix+str(i+1)+".csv")

    path+="mean/"
    os.makedirs(path, exist_ok=True)
    # groupby sorts the row labels; .loc restores the order of ranks_files.
    labels = list(ranks_files.keys())
    # sufix[:-3] drops the last three characters ("_bs" for the default-style sufix).
    mean_tag = sufix[:-3]
    mean_matrix1 = pd.concat(matrixes1).groupby(level=0).mean().loc[labels, :]
    mean_matrix1.to_csv(path+"mean_corr_"+mean_tag+".csv")
    mean_matrix2 = pd.concat(matrixes2).groupby(level=0).mean().loc[labels, :]
    mean_matrix2.to_csv(path+"mean_pval_"+mean_tag+".csv")
    return


def save_matrixes_canberra(path, matrixes, sufix="f1_bs"):
    """Write per-bootstrap matrices and their mean to CSV files under ``path``.

    Matrix ``i`` (counting from 1) is written to ``<path><sufix>bs_<i>.csv``
    and the element-wise mean over all matrices to ``<path>mean/mean.csv``,
    with rows ordered like the keys of ``ranks_files``.

    ``path`` and its ``mean/`` sub-directory are created when missing and
    reused when present, so the function can be re-run.  Files with the same
    name are overwritten; in particular ``mean.csv`` does not carry ``sufix``
    in its name, so a later call with another ``sufix`` replaces it.

    Parameters
    ----------
    path : str
        Output directory; names are appended by plain string concatenation, so
        it must end with a path separator.
    matrixes : sequence of pandas.DataFrame
        Matrices to save.
    sufix : str, optional
        Tag embedded in the per-bootstrap file names, e.g. ``"f1_bs"``.

    Returns
    -------
    None
    """
    mean_dir = path+"mean/"
    # Created before any file is written so that `path` itself exists too.
    os.makedirs(mean_dir, exist_ok=True)

    for i, m in enumerate(tqdm(matrixes)):
        m.to_csv(path+sufix+"bs_"+str(i+1)+".csv")

    # groupby sorts the row labels; .loc restores the order of ranks_files.
    mean_matrix = pd.concat(matrixes).groupby(level=0).mean().loc[list(ranks_files.keys()), :]
    mean_matrix.to_csv(mean_dir+"mean.csv")
    return
        

def execute_all(read_from, save_to, correlation_function, fold=1):
    """Compute and save both matrix layouts for one fold.

    The fold's data are read from ``<read_from>fold_<fold>/``.  First the
    separate statistic / p-value matrices are obtained with ``get_matrixes``
    and saved, then the single-matrix variant is obtained with
    ``get_dp_matrixes`` and saved.  Both saves go through ``save_matrixes``
    with the suffix ``f<fold>_bs``.

    Parameters
    ----------
    read_from : str
        Directory holding the ``fold_<n>/`` sub-directories; must end with a
        path separator.
    save_to : str
        Output root handed to ``save_matrixes``.
    correlation_function : callable
        Metric forwarded to ``get_matrixes`` and ``get_dp_matrixes``.
    fold : int, optional
        Fold number used in the input path and in the file-name suffix.
    """
    fold_label = str(fold)
    fold_dir = read_from+"fold_"+fold_label+"/"
    file_tag = "f"+fold_label+"_bs"

    print("Executing for double matrixes...")
    statistics, p_values = get_matrixes(fold_dir, correlation_function)
    save_matrixes(save_to, statistics, p_values, sufix=file_tag)

    print("\nExecuting for tuple matrixes...")
    combined = get_dp_matrixes(fold_dir, correlation_function)
    save_matrixes(save_to, combined, sufix=file_tag)

def execute_all_canberr(read_from, save_to, fold=1):
    """Compute and save the Canberra-distance matrices for one fold.

    Matrices are obtained with ``get_canberra_matrixes`` from
    ``<read_from>fold_<fold>/`` and written by ``save_matrixes_canberra`` to
    ``save_to`` with the suffix ``f<fold>_bs``.

    Parameters
    ----------
    read_from : str
        Directory holding the ``fold_<n>/`` sub-directories; must end with a
        path separator.
    save_to : str
        Output directory handed to ``save_matrixes_canberra``.
    fold : int, optional
        Fold number used in the input path and in the file-name suffix.
    """
    fold_label = str(fold)
    fold_dir = read_from+"fold_"+fold_label+"/"
    print("Executing for canberra matrixes...")
    distances = get_canberra_matrixes(fold_dir)
    save_matrixes_canberra(save_to, distances, sufix="f"+fold_label+"_bs")
    

In [70]:
# Driver cell: computes the similarity matrices of fold 1 with three metrics.
# NOTE(review): the input and output locations are absolute, machine-specific
# paths; consider deriving them from a configurable base directory.

# Root of the ranking results; execute_all / execute_all_canberr append
# "fold_<n>/" to it, so the trailing slash is required.
read_from = "/home/colombelli/Documents/pmlb_efs_experiments/results/hyb_sonar_E3/"

# Spearman rank correlation: statistic and p-value matrices.
print("Calculating similarities with Spearman Rank Correlation...")
save_to = "/home/colombelli/Documents/pmlb_efs_experiments/similarities/sonar_spearman/"
execute_all(read_from, save_to, spearmanr, fold=1)

# Kendall tau rank correlation: same procedure, separate output directory.
print("\n\nCalculating similarities with Kendall Tau Rank Correlation...")
save_to = "/home/colombelli/Documents/pmlb_efs_experiments/similarities/sonar_kendalltau/"
execute_all(read_from, save_to, kendalltau, fold=1)


# Canberra distance: computed and saved through the dedicated canberra helpers.
print("\n\nCalculating similarities with Canberra distance...")
save_to = "/home/colombelli/Documents/pmlb_efs_experiments/similarities/sonar_canberra/"
execute_all_canberr(read_from, save_to, fold=1)

Calculating similarities with Spearman Rank Correlation...
Executing for double matrixes...


100%|██████████| 50/50 [00:00<00:00, 527.40it/s]



Executing for tuple matrixes...


100%|██████████| 50/50 [00:00<00:00, 959.96it/s]




Calculating similarities with Kendall Tau Rank Correlation...
Executing for double matrixes...


100%|██████████| 50/50 [00:00<00:00, 316.35it/s]



Executing for tuple matrixes...


100%|██████████| 50/50 [00:00<00:00, 757.58it/s]




Calculating similarities with Canberra distance...
Executing for canberra matrixes...


100%|██████████| 50/50 [00:00<00:00, 1309.10it/s]
