In [1]:
import pandas as pd

In [2]:
# Path prefix of the bootstrap result folders; the loading cell appends the
# 1-based bootstrap number and "/" to it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
base_path = "/home/colombelli/Documents/arrayexpress/breast/hyb_borda_borda/fold_1/bootstrap_"
# File stems of the individual rankings; each one is read as "<stem>.csv" from
# every bootstrap folder.
individual_rankings = ["gd", "gr", "rf", "su", "wx"]

In [18]:
# Load every individual ranking of every bootstrap.
# bs_rankings maps a 0-based bootstrap index to a list of DataFrames, one per
# entry of individual_rankings and in the same order.
bs_rankings = {}
# 50 bootstraps are read; folders are numbered from 1, hence the i+1 below.
for i in range(0,50):
    path = base_path + str(i+1) + "/"
    fs_rankings = []
    
    for fs in individual_rankings:
        # The first CSV column becomes the DataFrame index.
        fs_rankings.append(pd.read_csv(path+fs+".csv", index_col=0))
    bs_rankings[i] = fs_rankings

In [19]:
"""
    References:
    Ludmila I. Kuncheva. A stability index for feature selection. 
    In Artificial Intelligence and Applications (AIAP’07), pages 390–395, 2007.

"""

def get_consistency_index(A_subset: list, B_subset: list, n: int):
    """Consistency index between two equally sized subsets.

    Computes (r * n - k^2) / (k * (n - k)), where k is the common size of the
    two subsets, r is the number of distinct elements they share and n is the
    size of the original set the subsets were drawn from.

    Raises:
        Exception: if the two subsets differ in length.
        ZeroDivisionError: if k is 0 or k equals n (the denominator vanishes).
    """
    subset_size = len(A_subset)
    if len(B_subset) != subset_size:
        raise Exception('The given A and B subsets have different cardinalities.')

    # Cardinality of the intersection of A and B.
    shared_elements = set(A_subset).intersection(B_subset)
    overlap = len(shared_elements)

    numerator = (overlap * n) - (subset_size * subset_size)
    denominator = subset_size * (n - subset_size)
    return numerator / denominator


# Call with either:
#   * subsets that are already thresholded, together with n; or
#   * the full rankings together with a threshold value.
def get_kuncheva_index(subsets: list, n=None, threshold=None):
    """Average pairwise consistency index over a collection of subsets.

    Args:
        subsets: sequence of sliceable rankings/subsets.
        n: size of the original set. When omitted, the length of the first
            element of ``subsets`` is used, and ``threshold`` becomes mandatory.
        threshold: number of leading elements of each subset to compare
            (each subset is sliced as ``subset[:threshold]``). When omitted,
            the length of the first subset is used.

    Returns:
        1 when the number of compared elements equals n; otherwise
        2 / (k * (k - 1)) times the sum of the consistency indices of all
        unordered pairs of subsets, k being the number of subsets.

    Raises:
        Exception: if neither ``n`` nor ``threshold`` is given, or if the
            number of compared elements exceeds n.
        ZeroDivisionError: if fewer than two subsets reach the pairwise step.
    """
    if n is None:
        if threshold is None:
            raise Exception('Provide a threshold value or the size of the original set.')
        n = len(subsets[0])

    # Resolve the number of compared elements before validating it, so that it
    # is defined on every path (previously it was unset whenever n was given,
    # which made that calling mode fail with UnboundLocalError).
    th = threshold if threshold is not None else len(subsets[0])

    if th > n:
        raise Exception('Provided threshold greater than ranking length (' + str(n) + ')')
    if th == n:
        # Every subset spans the whole original set.
        return 1

    truncated = [subset[:th] for subset in subsets]

    pairwise_ci = 0
    for i in range(len(truncated) - 1):
        for j in range(i + 1, len(truncated)):
            pairwise_ci += get_consistency_index(truncated[i], truncated[j], n)

    k = len(subsets)
    kuncheva_index = ((2) / (k * (k - 1))) * pairwise_ci

    return kuncheva_index

In [25]:
def get_normalize_stability(stability):
    """Rescale a stability value from the [-1, 1] range onto [0, 2].

    The value is first min-max normalized with bounds -1 and 1 and the result
    is then doubled.
    """
    lower_bound, upper_bound = -1, 1
    span = upper_bound - lower_bound
    unit_scaled = (stability - lower_bound) / span
    return unit_scaled * 2


def get_fs_stabilities(threshold, bs_rankings):
    """Normalized stability of each ranking method across all bootstraps.

    ``bs_rankings`` maps a bootstrap key to a list of ranking DataFrames; the
    entry under key 0 determines how many methods there are. For every method
    position, the index labels of its ranking in each bootstrap are collected
    and passed to get_kuncheva_index with the given threshold; the result is
    rescaled with get_normalize_stability.

    Returns a list with one value per method, in list order.
    """
    method_count = len(bs_rankings[0])

    stabilities = []
    for method_pos in range(method_count):
        # One list of index labels per bootstrap for this method.
        labels_per_bootstrap = [
            list(bs_rankings[bs_key][method_pos].index.values)
            for bs_key in bs_rankings
        ]
        raw_stability = get_kuncheva_index(labels_per_bootstrap, threshold=threshold)
        stabilities.append(get_normalize_stability(raw_stability))
    return stabilities

In [26]:
# Stability of each ranking method across the bootstraps, comparing the first
# 42 entries of every ranking.
get_fs_stabilities(42, bs_rankings)

[1.239650676638992,
 1.2837447050224275,
 1.0841724838011095,
 1.5011570896709658,
 1.421733305330132]

In [19]:
import pandas as pd


def build_df_correct_order(aggregated_ranking):
    """Turn a {label: score} mapping into a one-column 'rank' DataFrame.

    Rows are ordered by descending score. The column values are then replaced
    by the same values in reverse order, so the label with the highest score
    ends up first and carries the smallest value.
    """
    ranking_df = pd.DataFrame.from_dict(aggregated_ranking, orient='index')
    ranking_df.columns = ['rank']
    ranking_df = ranking_df.sort_values(by='rank', ascending=False)

    # Keep the row order, but flip the values top-to-bottom.
    flipped_values = ranking_df.iloc[::-1].values
    ranking_df.iloc[:] = flipped_values
    return ranking_df

In [20]:
# Threshold values; aggregate() uses the floor of their mean as the threshold
# for the stability computation.
ths = [1, 5, 10, 15, 25, 50, 75, 100, 150, 200]
# Floor of the mean threshold.
sum(ths) // len(ths)

63

In [22]:
# Stabilities at the mean threshold (63).
# NOTE(review): the recorded output lists two values although
# individual_rankings names five methods - presumably produced from a
# different kernel state; re-run to confirm.
get_fs_stabilities(63, bs_rankings)

[0.4114025608278481, 0.3445164881946492]

In [113]:

def aggregate():
    """Stability-weighted aggregation of all rankings of all bootstraps.

    Reads the notebook-level ``ths`` and ``bs_rankings``. The stability of each
    ranking method is computed at the floor of the mean threshold. Every
    ranking then gives each of its index labels a number of points equal to
    its 1-based position counted from the bottom of the ranking (last entry
    gets 1, first entry gets the ranking length), multiplied by the stability
    of the method that produced the ranking. Points are summed per label.

    Returns:
        The DataFrame built by build_df_correct_order from the summed points.
    """
    mean_th = sum(ths) // len(ths)
    fs_stabilities = get_fs_stabilities(mean_th, bs_rankings)

    aggregated_ranking = {}
    for bs in bs_rankings:
        for fs, ranking in enumerate(bs_rankings[bs]):
            weight = fs_stabilities[fs]
            # Walk the ranking bottom-up; enumerate yields the same 1-based
            # position that a lookup in the reversed index would give.
            for points, gene in enumerate(ranking.index[::-1], start=1):
                # Start from 0 the first time a label is seen (a bare "+=" on
                # the empty dict raised KeyError).
                aggregated_ranking[gene] = aggregated_ranking.get(gene, 0) + points * weight

    return build_df_correct_order(aggregated_ranking)