# Implementing ARI

## ACTION 1: Importing libraries and defining static variables

In [1]:
import os
import numpy as np
from math import *
from random import randint
# FOR PLOT
from matplotlib import pyplot

# FOR CHI-SQUARE - MUTUAL INFORMATION - RELIEF
import sklearn_relief as relief

# FOR CROSS FOLD VALIDATION
from sklearn.model_selection import StratifiedKFold, KFold

# FOR TESTING ON LOGISTIC REGRESSION
from sklearn.metrics import accuracy_score

from statistics import mean

# where all utilities are defined
import utils
from utils import *

## ACTION 2: Defining binary functions in functions.py
## ACTION 3: Dataset generation, pair generation and sampling in data_generation.py
## ACTION 4: Defining utilities for chi2 and mutual information in ch2_mi.py
## ACTION 5: Defining baseline logistic regression in baseline.py
## ACTION 6: Defining ARI in ari.py
## ACTION 7: Function to validate ARI stability with respect to sample size - order relevance

In [2]:
def display_score_latex(dimension,row_name,scores_means,scores_std):
    """Print one LaTeX table row of "mean-std" cells and return True.

    For each index in ``range(dimension)`` the mean and the standard
    deviation are rounded to two decimals and rendered as
    ``" & <mean>-<std>"``. The cells are appended to ``row_name`` and the row
    is closed with a LaTeX line break (two backslashes).

    Args:
        dimension: number of cells to emit.
        row_name: text placed at the start of the row.
        scores_means: indexable of means, at least ``dimension`` long.
        scores_std: indexable of standard deviations, at least ``dimension`` long.

    Returns:
        Always ``True``.
    """
    cells = [
        " & " + str(round(scores_means[idx], 2)) + "-" + str(round(scores_std[idx], 2))
        for idx in range(dimension)
    ]
    print(row_name + "".join(cells) + "\\\\")
    return True

def get_ari(dataset,dimension,sample_size):
    """Return the result of ``select_features_ars`` over every pair of ``dataset``.

    The attribute indices ``0 .. dimension-1`` and ``all_pairs(dataset)`` are
    handed to ``select_features_ars``; its return value is passed through
    unchanged (presumably one score per feature -- confirm against utils).

    Args:
        dataset: the data to score; forwarded as-is to the helpers.
        dimension: number of attributes to score.
        sample_size: accepted for interface compatibility but not used here.
    """
    attribute_indices = list(range(dimension))
    pairs = all_pairs(dataset)
    return select_features_ars(dimension, attribute_indices, dataset, pairs)

## ACTION 9: Function to compare ARI - chi-square - mutual information - relief

In [3]:
def _normalized_scores(raw_scores, dimension):
    """Return the first ``dimension`` scores divided by (sum of all scores + 0.001)."""
    values = np.asarray(raw_scores, dtype=float)
    if values.shape[0] < dimension:
        raise IndexError("expected at least %d scores, got %d" % (dimension, values.shape[0]))
    # +0.001 keeps the division defined when every score is 0.
    return values[:dimension] / (values.sum() + 0.001)


def compare_score_on_dataset(filename,number_of_test,sample_ratio):
    """Compare ARI, chi-square, mutual information and Relief scores on one dataset.

    The dataset is loaded with ``load_dataset``. For each of the
    ``number_of_test`` repetitions a sample of ``int(size * sample_ratio)``
    rows is drawn with ``generate_sample_set`` and scored with
    ``select_features_ars``; chi-square, mutual-information and Relief scores
    are computed on the output of ``prepare_all``. Every score vector is
    normalised by its own sum (+0.001). The per-feature mean and population
    standard deviation across repetitions are then plotted (one bar chart per
    method) and printed as LaTeX rows via ``display_score_latex``.

    Args:
        filename: path handed to ``load_dataset``.
        number_of_test: number of repetitions; must be >= 1.
        sample_ratio: fraction of the dataset used for the ARI sample; also
            forwarded to ``prepare_all`` as ``test_size``.

    Returns:
        Always ``True``.

    Raises:
        ValueError: if ``number_of_test`` is smaller than 1.
    """
    if number_of_test < 1:
        raise ValueError("number_of_test must be at least 1")

    dataset, X, y, dimension = load_dataset(filename)
    #INFO
    dataset_size = dataset.shape[0]
    print("****INFORMATION ON INITIAL DATA *******")
    print("dataset:",filename,"size:",dataset_size,"dimension:",dimension)

    # Attributes are addressed by index (0, 1, ...) and labelled a1, a2, ...
    list_of_attributes = list(range(dimension))
    attribute_names = ["a"+str(i+1) for i in range(dimension)]

    sample_size = int(dataset_size*sample_ratio)
    print(sample_size)

    # (key, plot title, LaTeX row name) for every compared method.
    methods = (
        ("ari",    "ARI",                "   ari"),
        ("chi2",   "CHI-SQUARE",         "  chi2"),
        ("mi",     "MUTUAL INFORMATION", "    mi"),
        ("relief", "RELIEF",             "relief"),
    )
    # Normalised score vectors, one entry per repetition, so that both the
    # mean and the spread across repetitions can be computed afterwards.
    runs = {key: [] for key, _, _ in methods}

    for _ in range(number_of_test):
        sample_set = generate_sample_set(dataset,sample_size)
        list_of_pairs = all_pairs(sample_set)
        ari_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)

        # NOTE(review): prepare_all is called with test_size=sample_ratio and a
        # fixed random_state=1. If it forwards these to a train/test split,
        # every repetition reuses the same split and the training part is the
        # complement of sample_ratio, so chi2/MI/Relief may not be scored on
        # the same rows as ARI -- TODO confirm against utils.prepare_all.
        X_train_enc, y_train_enc, X_test_enc = prepare_all(X, y, test_size=sample_ratio, random_state=1)
        _, _, fs_chi = select_all_features_chi2(X_train_enc, y_train_enc, X_test_enc)
        _, _, fs_mut = select_all_features_mutual(X_train_enc, y_train_enc, X_test_enc)

        # RELIEF: n_features=dimension so every attribute gets a weight.
        relief_scores = relief.Relief(n_features=dimension)
        relief_scores.fit_transform(X_train_enc,y_train_enc)

        runs["ari"].append(_normalized_scores(ari_scores, dimension))
        runs["chi2"].append(_normalized_scores(fs_chi.scores_, dimension))
        runs["mi"].append(_normalized_scores(fs_mut.scores_, dimension))
        runs["relief"].append(_normalized_scores(relief_scores.w_, dimension))

    # Mean and population standard deviation per feature across repetitions.
    summary = {}
    for key, per_run in runs.items():
        stacked = np.vstack(per_run)   # shape: (number_of_test, dimension)
        summary[key] = (stacked.mean(axis=0).tolist(), stacked.std(axis=0).tolist())

    for key, title, _ in methods:
        pyplot.title(title)
        pyplot.bar(attribute_names, summary[key][0])
        pyplot.show()

    # PREPARE FOR LATEX
    for key, _, row_name in methods:
        means, stds = summary[key]
        display_score_latex(dimension,row_name,means,stds)

    return True

## ACTION 10: Comparing feature score methods values

In [4]:
# Experiment configuration shared by the comparison runs.
number_of_test  = 10
sample_ratio    = 0.8   # TODO: clarify how this ratio maps onto the train/test split (open question from the author)
number_of_folds = 10
tested_folder   = "datasets-tested/"

# Echo every entry of the folder. The print happens before the .DS_Store
# check, so that entry is listed as well.
for filename in os.listdir(tested_folder):
    print(filename)
    if filename != ".DS_Store":
        # Experiment steps, currently disabled:
        #compare_score_on_dataset(tested_folder+filename,number_of_test,sample_ratio)
        #dataset, X, y, dimension = load_dataset(tested_folder+filename)
        #accuracy=baseline_for_categorical_with_all(X, y,number_of_folds)
        #print(filename,"- accuracy baseline:",round(accuracy,2))
        pass

6-primary-tumor.data-no_missing.csv
1-monks-1.csv
.DS_Store
2-monks-2.csv
4-breast-cancer.csv
7-mushroom.csv
5-hiv.csv
3-monks-3.csv


## ACTION 11: Comparing feature relevance score effectiveness on logistic regression

In [5]:
def test_categorical_dataset(filename,k):  
    dataset, X, y, dimension = load_dataset(filename)
    print("data shape:",X.shape, "dimension", dimension,"nb of used best features:",k)
    FOLDS=10
    list_of_attributes=[]
    for i in range(dimension):
        list_of_attributes.append(i)
    acc_baseline_all_features=baseline_for_categorical_with_all(X,y,FOLDS) #10 fold cross valid
    list_of_accuracy_ars=[]
    list_of_accuracy_chi2=[]
    list_of_accuracy_mi=[]
    list_of_accuracy_relief=[]
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True)
    X_enc=prepare_input(X)
    y_enc=prepare_target(y)
    for train, test in skf.split(X,y):
        X_train_chi2_enc, X_test_chi2_enc, _ = select_k_features_chi2(X_enc[train], y_enc[train], X_enc[test], k)
        X_train_mi_enc, X_test_mi_enc, _ = select_k_features_mi(X_enc[train], y_enc[train], X_enc[test], k)
        r = relief.Relief(n_features=k)
        X_train_relief_enc = r.fit_transform(X_enc[train], y_enc[train])
        X_test_relief_enc  = r.fit_transform(X_enc[test], y_enc[test])
    # ARS feature selection
        #create A : list of attribute as index 0, 1, ...
        list_of_attributes=[]
        for i in range(dimension):
            list_of_attributes.append(i)
        sample_set = dataset[train]
        list_of_pairs = all_pairs(sample_set)
        ars_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)
        s = np.array(ars_scores)
        sort_index = np.argsort(s)
        sort_index=np.flipud(sort_index)
        #print("ars index",sort_index)
        #print("ars scores:",ars_scores)
        # transform the dataset to keep only the k relevant features if there is 0 relevance
        # k should be redefined for ARI
        #print(sort_index[0:k],s[sort_index[0:k][k-1]])
        if s[sort_index[0:k][k-1]]==0:
            k=k-1   #removing the last guy if relevance 0 like in monks1
        X_train_ars_enc = np.delete(X_enc[train], sort_index[0:k],axis=1)
        X_test_ars_enc  = np.delete(X_enc[test],sort_index[0:k],axis=1)
        a=accuracy(X_train_ars_enc,y[train],X_test_ars_enc,y[test])
        list_of_accuracy_ars.append(a)  
    # CHI2 feature selection
        a=accuracy(X_train_chi2_enc,y_enc[train],X_test_chi2_enc,y_enc[test])
        list_of_accuracy_chi2.append(a)
    # MI feature selection
        a=accuracy(X_train_mi_enc,y_enc[train],X_test_mi_enc,y_enc[test])
        list_of_accuracy_mi.append(a)
    # RELIEF
        a=accuracy(X_train_relief_enc,y_enc[train],X_test_relief_enc,y_enc[test])
        list_of_accuracy_relief.append(a)
        
    acc_ars   = mean(list_of_accuracy_ars)
    acc_chi2  = mean(list_of_accuracy_chi2)
    acc_mi    = mean(list_of_accuracy_mi)
    acc_relief= mean(list_of_accuracy_relief)
    
    return(acc_baseline_all_features,acc_ars,acc_chi2,acc_mi,acc_relief)

In [6]:
# Baseline accuracy on each reduced Monks2 dataset (one CSV per scoring method).
tested_folder = "datasets-refactor/Monks2/"
FOLDS = 10
for filename in sorted(os.listdir(tested_folder)):
    if filename != ".DS_Store":   # skip the macOS folder metadata entry
        dataset, X, y, dimension = load_dataset(tested_folder + filename)
        print(filename,"- data shape:",X.shape, "dimension", dimension)
        # Accuracy of the baseline model using all features of this file.
        print(baseline_for_categorical_with_all(X, y, FOLDS))
    

monks2-ari.csv - data shape: (601, 4) dimension 4
0.6572404371584699
monks2-chi2.csv - data shape: (601, 4) dimension 4
0.6539071038251366
monks2-mi.csv - data shape: (601, 4) dimension 4
0.6572404371584699
monks2-relief.csv - data shape: (601, 4) dimension 4
0.6572404371584699
