# Implementing ARI

## ACTION 1: Importing libraries and defining static variables

In [None]:
import os
import numpy as np
from math import *
from random import randint
# FOR PLOT
from matplotlib import pyplot

# FOR CHI-SQUARE - MUTUAL INFORMATION - RELIEF
import sklearn_relief as relief

# FOR CROSS FOLD VALIDATION
from sklearn.model_selection import StratifiedKFold, KFold

# FOR TESTING ON LOGISTIC REGRESSION
from sklearn.metrics import accuracy_score

from statistics import mean

# where all utilities are defined
import utils
from utils import *

DATASET_GENERAL = "datasets-tested/dataset_general.csv"

## ACTION 2: Defining binary functions in functions.py
## ACTION 3: Dataset generation, pair generation and sampling in data_generation.py
## ACTION 4: Defining utilities for chi2 and mutual information in ch2_mi.py
## ACTION 5: Defining baseline logistic regression in baseline.py
## ACTION 6: Defining ARI in ari.py
## ACTION 7: Function to validate ARI stability wrt sample size - order relevance

In [None]:
def display_score_latex(dimension,row_name,ari_scores_means,ari_scores_std):
    """Print one LaTeX table row of ``mean-std`` cells and return True.

    Args:
        dimension: number of cells to emit (indices ``0 .. dimension-1``).
        row_name: label appended after the fixed ``"sample "`` prefix.
        ari_scores_means: indexable of mean scores, one per cell.
        ari_scores_std: indexable of standard deviations, one per cell.

    Returns:
        Always ``True``.
    """
    # Each cell is " & <mean>-<std>", both values rounded to 2 decimals.
    cells = [
        " & " + str(round(ari_scores_means[col], 2)) + "-" + str(round(ari_scores_std[col], 2))
        for col in range(dimension)
    ]
    # The row is terminated by the LaTeX line break "\\".
    print("sample " + row_name + "".join(cells) + "\\\\")
    return True

def get_ari(dataset,dimension,sample_size):
    """Return the list of ARI scores, one per feature, for ``dataset``.

    Args:
        dataset: data passed unchanged to ``all_pairs`` and
            ``select_features_ars``.
        dimension: number of features; attributes are indexed
            ``0 .. dimension-1``.
        sample_size: accepted for call compatibility; not used here.

    Returns:
        Whatever ``select_features_ars`` returns (a score per feature).
    """
    attribute_indices = list(range(dimension))
    pairs = all_pairs(dataset)
    return select_features_ars(dimension, attribute_indices, dataset, pairs)

## ACTION 8: Validate ARI stability

In [None]:
# Disabled code: the triple-quoted string below is a bare string literal, so
# nothing in it is executed. It holds an inline version of the loop that
# validate_stability() implements as a function.
'''
for sample_size in sample_size_list:
    mean_ari_scores = [0]*dimension
    feature_score_list=[]
    for i in range(dimension): #create a list of list of scores per feature
        feature_score_list.append([])
    for u in range(number_of_test):
        #uncomment the line below when running ari with test_stability_ari
        create_categorical_dataset(filename,f,dimension,sample_size,categorical_range)
        dataset, X, y, dimension = load_dataset(filename)
        ari_scores = get_ari(dataset,dimension,sample_size)
        for j in range(dimension): #accumulate the scores
            mean_ari_scores[j]+=ari_scores[j]
            feature_score_list[j].append(ari_scores[j])
    #print(len(feature_score_list[0]))       
  #  mean_ari_scores = [a*(1/number_of_test) for a in mean_ari_scores]
    ari_scores_means = [mean(feature_score_list[j]) for j in range(dimension)]
    ari_scores_std  = [np.array(feature_score_list[j]).std() for j in range(dimension)]
    #print(ari_scores_std)
    display_score_latex(dimension,str(sample_size),ari_scores_means,ari_scores_std) 
'''    
    
def validate_stability(dimension,categorical_range,f,number_of_test,sample_size_list):
    """Print the mean and std of each feature's ARI score for every sample size.

    For each entry of ``sample_size_list`` a dataset is regenerated
    ``number_of_test`` times, reloaded, and scored with ``get_ari``. The
    per-feature mean and standard deviation over those runs are printed as
    one LaTeX row via ``display_score_latex``.

    NOTE: the dataset path is read from the module-level variable
    ``filename``; it must be defined before this function is called.

    Args:
        dimension: number of features to generate and to report.
        categorical_range: forwarded to ``create_categorical_dataset``.
        f: forwarded to ``create_categorical_dataset`` (presumably the
            function that labels each generated row -- confirm in utils).
        number_of_test: number of regenerate/score repetitions per sample
            size. ``mean`` raises on an empty list, so it must be >= 1.
        sample_size_list: iterable of sample sizes to evaluate.

    Returns:
        None.
    """
    for sample_size in sample_size_list:
        # One list of observed scores per feature.
        per_feature_scores = [[] for _ in range(dimension)]
        for _ in range(number_of_test):
            create_categorical_dataset(filename,f,dimension,sample_size,categorical_range)
            # Keep the requested ``dimension`` intact: the value reported by
            # the loader gets its own name instead of rebinding the parameter.
            dataset, _X, _y, loaded_dimension = load_dataset(filename)
            ari_scores = get_ari(dataset,loaded_dimension,sample_size)
            for j in range(dimension):
                per_feature_scores[j].append(ari_scores[j])
        ari_scores_means = [mean(scores) for scores in per_feature_scores]
        ari_scores_std = [np.array(scores).std() for scores in per_feature_scores]
        display_score_latex(dimension,str(sample_size),ari_scores_means,ari_scores_std)

# Parameters of the ARI stability experiment (read as globals below).
dimension = 10
categorical_range = 1 #1 means binary - range of values for feature
#f = g1_array

# (categorical_range+1) ** dimension, truncated to int; presumably the number
# of distinct feature vectors -- TODO confirm against create_categorical_dataset.
size_of_X=int(pow((categorical_range+1),dimension))

number_of_test = 1
# NOTE: the first list is overwritten by the next line; only [50] is used.
sample_size_list = [300,400,500]
sample_size_list = [50]

# validate_stability() reads this module-level name for the dataset path.
filename=DATASET_GENERAL
print("dimension:",dimension, "- size of X:",size_of_X, " - number of test:",number_of_test) 

# NOTE: the full list is overwritten by the next line; only g1_array is run.
list_of_functions=[g1_array,g2_array,g3_array,g4_array,g5_array,g6_array,g7_array,g8_array]
list_of_functions=[g1_array]
for f in list_of_functions:
    print("function:",str(f))
    validate_stability(dimension,categorical_range,f,number_of_test,sample_size_list)    
    

## ACTION 9-old: Function to compare ARS - chi-square - mutual information - relief

In [None]:
#old
# Constants used by _old_compare_score_on_synthetic_dataset below.
NO_OF_FS = 4 # number of scoring methods compared
# Indices of each scoring method along axis 0 of feature_score_list.
ARI=0
CHI=1
MI=2
RELIEF=3
Z_EPSILON=0.001 # values added to denominator to avoid division by 0

def _old_compare_score_on_synthetic_dataset(filename,number_of_test):
    """Legacy comparison of ARI, chi-square, mutual information and Relief scores.

    Loads the dataset at ``filename``, repeats the scoring ``number_of_test``
    times, normalises every score vector by its sum (plus a small epsilon),
    shows bar charts of the mean normalised scores and prints LaTeX rows.
    The means are computed twice: with hand-rolled accumulators (``mean_*``
    lists) and with NumPy over ``feature_score_list`` (rows labelled "np...").

    Args:
        filename: path handed to ``load_dataset``.
        number_of_test: number of repetitions; also used as a divisor, so it
            must be non-zero.

    Returns:
        Always ``True``.
    """
    
    # Row label for the LaTeX output and title for the chart, per method,
    # in the order given by the ARI / CHI / MI / RELIEF indices.
    fs =[
     {"latex": "   ari", "chart": "ARI"},
     {"latex": "  chi2", "chart": "CHI SQUARE"},
     {"latex": "    mi", "chart": "MUTUAL INFORMATION"},
     {"latex": "relief", "chart": "RELIEF"}]

       
    dataset, X, y, dimension = load_dataset(filename)
    #INFO
    dataset_size=dataset.shape[0]
    print("****INFORMATION ON INITIAL DATA *******")
    print("dataset:",filename,"size:",dataset_size,"dimension:",dimension, "number of test:", number_of_test)
    
    feature_score_list_ari = []
    for i in range(dimension): #create a list of list of scores per feature
        feature_score_list_ari.append([])
                
    #list of attribute as indices: 0, 1, ... and display names a1, a2, ...
    list_of_attributes=[]
    attribute_names=[]
    for i in range(dimension):
        list_of_attributes.append(i)
        attribute_names.append("a"+str(i+1))
    #print(attribute_names)
    
    # The sample drawn in each test has the size of the whole dataset.
    sample_size = dataset_size
    print(sample_size)

    #create a 3D array to store the score for each feature, each dimension of the data, and each test
    # axes: (scoring method, feature, test); every cell is assigned in the loop below
    feature_score_list = np.empty((NO_OF_FS, dimension, number_of_test))    
    
    # Disabled code kept as a bare string literal (never executed).
    '''
    feature_score_list = [[foo for i in range(10)] for j in range(10)]
    for i in range(NO_OF_FS): #create a list to store scores for each type
        feature_score_list.append([])
        for j in range(dimension): #create a list to store scores per feature
    '''    

    # Hand-rolled accumulators of the normalised scores, one slot per feature.
    mean_ari_scores    = [0]*dimension
    mean_chi_scores    = [0]*dimension
    mean_mut_scores    = [0]*dimension
    mean_relief_scores = [0]*dimension
    
    
#FOR EACH TEST
    for u in range(number_of_test):
        sample_set = generate_sample_set(dataset,sample_size)
        list_of_pairs = all_pairs(sample_set)
        #print(len(list_of_pairs))
        ari_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)
        #print(ars_scores)

        # NOT SURE WE TEST ON THE SAME SET BECAUSE OF TEST_SIZE PARAM
        #SLIM - 21/06 - removed TEST_SIZE PARAM
        # NOTE(review): chi2, MI and Relief are scored on the full X, y,
        # while ARI is scored on sample_set -- confirm this is intended.
        
        fs_chi = calculate_chi2_scores(X, y)
        fs_mut = calculate_mi_scores(X, y)
    
    # RELIEF
        relief_scores = relief.Relief(n_features=dimension) # we check all attributes
        # Only the fitted weights (w_) are used below; the transformed matrix is not.
        my_transformed_matrix = relief_scores.fit_transform(X,y)

# NORMALIZATION FACTORS - All scores are normalized +0.001 to avoid division by 0
        Z_ari    = sum(ari_scores) + 0.001
        Z_chi    = sum(fs_chi.scores_) + 0.001
        Z_mi     = sum(fs_mut.scores_) + 0.001
        Z_relief = sum(relief_scores.w_) + 0.001

#STORE EACH SCORE 
        #store the normalised scores of every method for all features in test number u
        feature_score_list[ARI,:,u:u+1] = np.reshape(ari_scores, [dimension,1])/ (sum(ari_scores) + Z_EPSILON)
        feature_score_list[CHI,:,u:u+1] = np.reshape(fs_chi.scores_, [dimension,1])/ (sum(fs_chi.scores_) + Z_EPSILON)
        feature_score_list[MI,:,u:u+1]  = np.reshape(fs_mut.scores_, [dimension,1])/ (sum(fs_mut.scores_) + Z_EPSILON)      
        feature_score_list[RELIEF,:,u:u+1]  = np.reshape(relief_scores.w_, [dimension,1])/ (sum(relief_scores.w_) + Z_EPSILON)      
        
#UPDATE MEAN SCORES BY ADDING NORMALIZED SCORES IN [0,1]  
        for j in range(dimension):
            mean_ari_scores[j]   += ari_scores[j]/(Z_ari)
            mean_chi_scores[j]   += fs_chi.scores_[j]/(Z_chi)            
            mean_mut_scores[j]   += fs_mut.scores_[j]/(Z_mi)
            mean_relief_scores[j]+= relief_scores.w_[j]/(Z_relief)
            feature_score_list_ari[j].append(ari_scores[j]/Z_ari)            

    # Turn the accumulated sums into means over the tests.
    mean_ari_scores    = [a*(1/number_of_test) for a in mean_ari_scores]
    mean_chi_scores    = [a*(1/number_of_test) for a in mean_chi_scores]
    mean_mut_scores    = [a*(1/number_of_test) for a in mean_mut_scores]
    mean_relief_scores = [a*(1/number_of_test) for a in mean_relief_scores]
    
    ari_scores_std  = [np.array(feature_score_list_ari[j]).std() for j in range(dimension)] 
    

    # Charts based on the hand-rolled means.
    pyplot.title("ARI")
    pyplot.bar(attribute_names, mean_ari_scores)
    pyplot.show()

    pyplot.title("CHI-SQUARE")
    pyplot.bar(attribute_names, mean_chi_scores)
    pyplot.show()

    pyplot.title("MUTUAL INFORMATION")
    pyplot.bar(attribute_names, mean_mut_scores)
    pyplot.show()

    pyplot.title("RELIEF")
    pyplot.bar(attribute_names, mean_relief_scores)
    pyplot.show()
    
    # NumPy mean/std over the test axis. Assigning to ``mean`` makes it a
    # local name throughout this function (it hides any module-level ``mean``).
    mean = np.mean(feature_score_list, axis=2)
    std  = np.std(feature_score_list, axis=2)
    
    # The same charts again, this time from the NumPy means.
    for i in range(len(fs)):
        pyplot.title(fs[i]['chart'])
        pyplot.bar(attribute_names, mean[i])
        pyplot.show()
    
    
# PREPARE FOR LATEX

    # Rows come in pairs: hand-rolled mean first, NumPy mean ("np...") second.
    #display_score_latex(dimension,str(sample_size),ari_scores_means,ari_scores_std)
    display_score_latex(dimension,"     ari",mean_ari_scores,ari_scores_std)
    display_score_latex(dimension,"   npari",mean[ARI],std[ARI])
    
    display_score_latex(dimension,"    chi2",mean_chi_scores,std[CHI])
    display_score_latex(dimension,"  npchi2",mean[CHI],std[CHI])
    
    display_score_latex(dimension,"      mi",mean_mut_scores,std[MI])
    display_score_latex(dimension,"    npmi",mean[MI],std[MI])
    
    display_score_latex(dimension,"  relief",mean_relief_scores, std[RELIEF])
    display_score_latex(dimension,"nprelief",mean[RELIEF],std[RELIEF])
    
    print("\n")
    
    for i in range(len(fs)):
        display_score_latex(dimension, fs[i]['latex'] ,mean[i],std[i])
        
    
    return True

## ACTION 9: Function to compare ARS - chi-square - mutual information - relief

In [None]:
NO_OF_FS = 4 # number of feature-selection (scoring) methods to be tested
ARI=0    # indices of the four scoring methods along axis 0 of feature_score_list
CHI=1
MI=2
RELIEF=3
Z_EPSILON=0.001 # values added to denominator to avoid division by 0

def compare_score_on_synthetic_dataset(filename,number_of_test):
    """Compare ARI, chi-square, mutual-information and Relief feature scores.

    The dataset at ``filename`` is loaded once. For each of ``number_of_test``
    runs, a sample is drawn and scored with ARI, while chi-square, mutual
    information and Relief are scored on the full ``X``, ``y``. Every score
    vector is divided by its sum plus ``Z_EPSILON``. The mean normalised
    scores are shown as one bar chart per method, and mean/std are printed as
    LaTeX rows.

    Args:
        filename: path handed to ``load_dataset``.
        number_of_test: number of scoring runs.

    Returns:
        Always ``True``.
    """
    # LaTeX row label and chart title per method, ordered by ARI/CHI/MI/RELIEF.
    method_labels = [
        {"latex": "   ari", "chart": "ARI"},
        {"latex": "  chi2", "chart": "CHI SQUARE"},
        {"latex": "    mi", "chart": "MUTUAL INFORMATION"},
        {"latex": "relief", "chart": "RELIEF"},
    ]

    dataset, X, y, dimension = load_dataset(filename)

    # Information on the loaded data.
    dataset_size = dataset.shape[0]
    print("****INFORMATION ON INITIAL DATA *******")
    print("dataset:",filename,"size:",dataset_size,"dimension:",dimension, "number of test:", number_of_test)

    # Attributes as indices 0, 1, ... and as display names a1, a2, ...
    attribute_indices = list(range(dimension))
    attribute_names = ["a" + str(idx + 1) for idx in attribute_indices]

    # Each run samples as many rows as the dataset holds.
    sample_size = dataset_size
    print(sample_size)

    # Axes: (scoring method, feature, run). Every cell is written below.
    scores_by_method = np.empty((NO_OF_FS, dimension, number_of_test))

    for run in range(number_of_test):
        sample_set = generate_sample_set(dataset, sample_size)
        pairs = all_pairs(sample_set)
        ari_scores = select_features_ars(dimension, attribute_indices, sample_set, pairs)

        chi_selector = calculate_chi2_scores(X, y)
        mi_selector = calculate_mi_scores(X, y)

        # Relief over all attributes; only the fitted weights ``w_`` are used.
        relief_model = relief.Relief(n_features=dimension)
        relief_model.fit_transform(X, y)

        raw_scores = (
            (ARI, ari_scores),
            (CHI, chi_selector.scores_),
            (MI, mi_selector.scores_),
            (RELIEF, relief_model.w_),
        )
        # Normalise by the sum; Z_EPSILON avoids a division by zero.
        for method, raw in raw_scores:
            column = np.reshape(raw, [dimension, 1])
            scores_by_method[method, :, run:run+1] = column / (sum(raw) + Z_EPSILON)

    mean_scores = np.mean(scores_by_method, axis=2)
    std_scores = np.std(scores_by_method, axis=2)

    # One bar chart per scoring method.
    for method, labels in enumerate(method_labels):
        pyplot.title(labels['chart'])
        pyplot.bar(attribute_names, mean_scores[method])
        pyplot.show()

    # One LaTeX row per scoring method.
    for method, labels in enumerate(method_labels):
        display_score_latex(dimension, labels['latex'], mean_scores[method], std_scores[method])

    print("\n")

    return True

## ACTION 10: Comparing score methods - synthetic data

In [None]:
# Alternative datasets (uncomment one to use it instead of the synthetic file):
#filename="datasets-tested/primary-tumor.data-no_missing.csv"
#filename="datasets-tested/1-monks-1.csv"
filename="datasets-tested/dataset_general.csv"
sample_size = 50
dimension = 10
categorical_range = 3

number_of_test = 2
#sample_ratio   = 0.8  #THERE IS AN ISSUE HERE AS I DO NOT UNDERSTAND TRAIN/TEST SAMPLE_RATIO
# NOTE: the full list is overwritten by the next line; only g1_array is run.
list_of_functions=[g1_array,g2_array,g3_array,g4_array,g5_array,g6_array,g7_array]
list_of_functions=[g1_array]
for f in list_of_functions:
    print("comparing score - synthetic data - function:",str(f))
    # Generate the dataset with the function being reported. The loop used to
    # pass g1_array regardless of f, so every iteration scored the same data.
    create_categorical_dataset(filename,f,dimension,sample_size,categorical_range)
    compare_score_on_synthetic_dataset(filename,number_of_test)
    

## ACTION 11: Comparing feature relevance score effectiveness on logistic regression

In [None]:
def test_binary_dataset(filename,k):
    """Compare classifier accuracy on the k best features of each scoring method.

    Runs a stratified ``FOLDS``-fold cross-validation on the dataset at
    ``filename``. In every fold the k best features are selected on the
    training part with chi-square, mutual information, Relief and ARS, and
    ``accuracy`` is evaluated on the held-out part.

    Args:
        filename: path handed to ``load_dataset``.
        k: number of best features each method keeps.

    Returns:
        Tuple ``(baseline_all_features, ars, chi2, mi, relief)`` of
        accuracies; the four per-method values are means over the folds.
    """
    dataset, X, y, dimension = load_dataset(filename)
    print("data shape:",X.shape, "dimension", dimension,"nb of best features:",k)
    FOLDS=2
    # Attributes as indices 0, 1, ...; identical for every fold.
    list_of_attributes = list(range(dimension))
    acc_baseline_all_features=baseline_for_binary_with_all(X, y,FOLDS) # FOLDS-fold cross validation
    list_of_accuracy_ars=[]
    list_of_accuracy_chi2=[]
    list_of_accuracy_mi=[]
    list_of_accuracy_relief=[]
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True)
    for train, test in skf.split(X,y):
        X_train_chi2, X_test_chi2, _ = select_k_features_chi2(X[train], y[train], X[test], k)
        X_train_mi, X_test_mi, _ = select_k_features_mi(X[train], y[train], X[test], k)

        # RELIEF: fit on the training fold only, then apply the same feature
        # choice to the test fold. Re-fitting on the test fold would use the
        # test labels and could select different columns than in training.
        r = relief.Relief(n_features=k)
        X_train_relief = r.fit_transform(X[train], y[train])
        X_test_relief = r.transform(X[test])

        # ARS feature selection, scored on the training rows of the dataset.
        sample_set = dataset[train]
        list_of_pairs = all_pairs(sample_set)
        ars_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)
        # Rank features by decreasing score and keep the k best columns.
        # NOTE(review): assumes a higher ARS score means a more relevant
        # feature, as in test_categorical_dataset -- confirm.
        sort_index = np.flipud(np.argsort(np.array(ars_scores)))
        top_k = sort_index[0:k]
        X_train_ars = X[train][:, top_k]
        X_test_ars = X[test][:, top_k]

        a=accuracy(X_train_ars,y[train],X_test_ars,y[test])
        list_of_accuracy_ars.append(a)
        # CHI2
        a=accuracy(X_train_chi2,y[train],X_test_chi2,y[test])
        list_of_accuracy_chi2.append(a)
        # MI
        a=accuracy(X_train_mi,y[train],X_test_mi,y[test])
        list_of_accuracy_mi.append(a)
        # RELIEF
        a=accuracy(X_train_relief,y[train],X_test_relief,y[test])
        list_of_accuracy_relief.append(a)

    acc_ars   = mean(list_of_accuracy_ars)
    acc_chi2  = mean(list_of_accuracy_chi2)
    acc_mi    = mean(list_of_accuracy_mi)
    acc_relief= mean(list_of_accuracy_relief)
    return(acc_baseline_all_features,acc_ars,acc_chi2,acc_mi,acc_relief)
    
def test_categorical_dataset(filename,k):
    """Compare classifier accuracy on the k best features of a categorical dataset.

    Same protocol as ``test_binary_dataset`` with ``FOLDS`` = 10, but the
    inputs and target are first encoded with ``prepare_input`` /
    ``prepare_target`` and all selections and accuracies use the encoded data.

    Args:
        filename: path handed to ``load_dataset``.
        k: number of best features each method keeps.

    Returns:
        Tuple ``(baseline_all_features, ars, chi2, mi, relief)`` of
        accuracies; the four per-method values are means over the folds.
    """
    dataset, X, y, dimension = load_dataset(filename)
    print("data shape:",X.shape, "dimension", dimension,"nb of used best features:",k)
    FOLDS=10
    # Attributes as indices 0, 1, ...; identical for every fold.
    list_of_attributes = list(range(dimension))
    acc_baseline_all_features=baseline_for_categorical_with_all(X,y,FOLDS) #10 fold cross valid
    list_of_accuracy_ars=[]
    list_of_accuracy_chi2=[]
    list_of_accuracy_mi=[]
    list_of_accuracy_relief=[]
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True)
    X_enc=prepare_input(X)
    y_enc=prepare_target(y)
    for train, test in skf.split(X,y):
        X_train_chi2_enc, X_test_chi2_enc, _ = select_k_features_chi2(X_enc[train], y_enc[train], X_enc[test], k)
        X_train_mi_enc, X_test_mi_enc, _ = select_k_features_mi(X_enc[train], y_enc[train], X_enc[test], k)

        # RELIEF: fit on the training fold only, then apply the same feature
        # choice to the test fold. Re-fitting on the test fold would use the
        # test labels and could select different columns than in training.
        r = relief.Relief(n_features=k)
        X_train_relief_enc = r.fit_transform(X_enc[train], y_enc[train])
        X_test_relief_enc = r.transform(X_enc[test])

        # ARS feature selection, scored on the training rows of the dataset.
        sample_set = dataset[train]
        list_of_pairs = all_pairs(sample_set)
        ars_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)
        # Feature indices by decreasing ARS score.
        sort_index = np.flipud(np.argsort(np.array(ars_scores)))
        # Keep only the k best-ranked features. The previous np.delete call
        # removed exactly those k columns and trained on the remaining ones.
        # NOTE(review): assumes the columns of X_enc line up one-to-one with
        # the attribute indices scored by ARS, and that a higher score means
        # a more relevant feature -- confirm.
        top_k = sort_index[0:k]
        X_train_ars_enc = X_enc[train][:, top_k]
        X_test_ars_enc  = X_enc[test][:, top_k]
        # Encoded target, consistent with the other three methods.
        a=accuracy(X_train_ars_enc,y_enc[train],X_test_ars_enc,y_enc[test])
        list_of_accuracy_ars.append(a)
        # CHI2 feature selection
        a=accuracy(X_train_chi2_enc,y_enc[train],X_test_chi2_enc,y_enc[test])
        list_of_accuracy_chi2.append(a)
        # MI feature selection
        a=accuracy(X_train_mi_enc,y_enc[train],X_test_mi_enc,y_enc[test])
        list_of_accuracy_mi.append(a)
        # RELIEF
        a=accuracy(X_train_relief_enc,y_enc[train],X_test_relief_enc,y_enc[test])
        list_of_accuracy_relief.append(a)

    acc_ars   = mean(list_of_accuracy_ars)
    acc_chi2  = mean(list_of_accuracy_chi2)
    acc_mi    = mean(list_of_accuracy_mi)
    acc_relief= mean(list_of_accuracy_relief)
    return(acc_baseline_all_features,acc_ars,acc_chi2,acc_mi,acc_relief)