# Implementing ARS

## ACTION 1: Importing libraries

In [None]:
import os
import numpy as np
from math import *
from random import randint
# FOR PLOT
from matplotlib import pyplot

# FOR CHI-SQUARE - MUTUAL INFORMATION - RELIEF
import sklearn_relief as relief

# FOR CROSS FOLD VALIDATION
from sklearn.model_selection import StratifiedKFold, KFold

# FOR TESTING ON LOGISTIC REGRESSION
from sklearn.metrics import accuracy_score

from statistics import mean

# where all utilities are defined
import utils
from utils import *

## ACTION 2: Defining binary functions in functions.py
## ACTION 3: Dataset generation, pair generation and sampling in data_generation.py
## ACTION 4: Defining utilities for chi2 and mutual information in ch2_mi.py
## ACTION 5: Defining baseline logistic regression in baseline.py
## ACTION 6: Defining ARS in ars.py
## ACTION 7: Function to validate ARS stability wrt sample size - order relevance

In [None]:
def display_score_latex(dimension,row_name,list_of_score):
    """Print one LaTeX table row made of a label and rounded scores.

    The first `dimension` entries of `list_of_score` are rounded to two
    decimals, each preceded by the " & " column separator, and the row is
    closed with the LaTeX line break (two backslashes).

    Always returns True.
    """
    cells = [str(round(list_of_score[idx], 2)) for idx in range(dimension)]
    row = row_name + "".join(" & " + cell for cell in cells) + "\\\\"
    print(row)
    return True

def test_stability_ars(filename,number_of_test,sample_size_list):
    """Print averaged ARS scores for several sample sizes drawn from one file.

    The dataset is loaded once with load_dataset(filename). For every entry
    of `sample_size_list`, `number_of_test` sample sets are drawn with
    generate_sample_set, scored with select_features_ars on all pairs of the
    sample, and the per-attribute scores are averaged. Each averaged vector
    is printed as a LaTeX row labelled with the sample size.

    Always returns True.
    """
    dataset, X, y, dimension = load_dataset(filename)
    n_rows = dataset.shape[0]
    # Column-wise sum read at index `dimension`; reported below as the number
    # of class-1 rows (presumably the 0/1 label column - confirm in utils).
    n_positive = np.sum(dataset, axis=0)[dimension]
    print("****INFORMATION ON INITIAL DATA *******")
    print("dataset:",filename,"size",n_rows,"dimension:",dimension," - with",n_positive,"elements in class 1.")

    # Attributes are identified by their index 0 .. dimension-1.
    list_of_attributes = list(range(dimension))

    for sample_size in sample_size_list:
        score_totals = [0] * dimension
        for _ in range(number_of_test):
            sample_set = generate_sample_set(dataset, sample_size)
            list_of_pairs = all_pairs(sample_set)
            ars_scores = select_features_ars(dimension, list_of_attributes, sample_set, list_of_pairs)
            score_totals = [
                running + ars_scores[j] for j, running in enumerate(score_totals)
            ]
        averaged_scores = [total * (1 / number_of_test) for total in score_totals]
        display_score_latex(dimension, str(sample_size), averaged_scores)
    return True

def test_stability_ars_new(f,dimension,number_of_test,sample_size_list):
    """Print averaged ARS scores for freshly generated datasets of several sizes.

    For every entry of `sample_size_list`, the dataset is regenerated
    `number_of_test` times with create_general_dataset(f, dimension,
    sample_size), reloaded from "datasets-tested/dataset_general.csv"
    (presumably the file that helper writes - confirm in data_generation.py),
    scored with select_features_ars on all of its pairs, and the
    per-attribute scores are averaged. Each averaged vector is printed as a
    LaTeX row labelled with the sample size.

    Parameters:
        f: passed through to create_general_dataset.
        dimension: number of attributes; sizes the score accumulators and
            the attribute index list.
        number_of_test: repetitions averaged per sample size.
        sample_size_list: iterable of dataset sizes to try.

    Always returns True.
    """
    # Loop-invariant: every repetition reloads the same file.
    filename = "datasets-tested/dataset_general.csv"
    # Attributes are identified by their index 0 .. dimension-1.
    list_of_attributes = list(range(dimension))
    for sample_size in sample_size_list:
        mean_ars_scores = [0]*dimension
        for _ in range(number_of_test):
            create_general_dataset(f,dimension,sample_size)
            # The dimension reported by load_dataset is deliberately NOT
            # assigned to `dimension`: overwriting the parameter inside the
            # loop would let it drift away from the accumulators and the
            # attribute list that were sized from the caller's value.
            dataset, _X, _y, _loaded_dimension = load_dataset(filename)
            list_of_pairs = all_pairs(dataset)
            ars_scores = select_features_ars(dimension,list_of_attributes,dataset,list_of_pairs)
            for j in range(dimension):
                mean_ars_scores[j]+=ars_scores[j]
        mean_ars_scores = [a*(1/number_of_test) for a in mean_ars_scores]
        display_score_latex(dimension,str(sample_size),mean_ars_scores)
    return True

## ACTION 8: Validate ars stability

In [None]:
# --- Stability experiment configuration ---
# Function handed to the dataset generator (presumably one of the binary
# functions from functions.py) and the number of attributes to generate.
f = g1_array
dimension = 15

# Repetitions averaged per sample size, and the sample sizes to try.
number_of_test = 10
sample_size_list = [50, 100, 200, 300, 400]

# Only used by the file-based variant that is disabled below.
filename = "datasets-tested/dataset_binary.csv"

# Optional one-off dataset generation (disabled):
#size=64
#create_general_dataset(f,dimension,size)
#create_csv_binary_dataset(f)

# File-based variant (disabled):
#test_stability_ars(filename,number_of_test,sample_size_list)
test_stability_ars_new(f, dimension, number_of_test, sample_size_list)

## ACTION 9: Function to compare ARS - chi-square - mutual information - relief

In [None]:
def compare_score_on_dataset(filename,number_of_test,sample_ratio):
    """Plot and print averaged, normalized relevance scores of four methods.

    The methods are ARS, chi-square, mutual information and Relief. For each
    of `number_of_test` repetitions every method yields one score per
    attribute; each score vector is divided by (its sum + 0.01) and added to
    a running total. The totals are then averaged over the repetitions,
    shown as four bar charts and printed as four LaTeX rows.

    Parameters:
        filename: dataset passed to load_dataset.
        number_of_test: number of repetitions to average.
        sample_ratio: fraction of the dataset size used as ARS sample size;
            also forwarded to prepare_all as `test_size`.

    Always returns True.
    """
    dataset, X, y, dimension = load_dataset(filename)
    dataset_size = dataset.shape[0]
    print("****INFORMATION ON INITIAL DATA *******")
    print("dataset:",filename,"size:",dataset_size,"dimension:",dimension)

    # Attributes are identified by index for ARS and named a1, a2, ... on plots.
    list_of_attributes = list(range(dimension))
    attribute_names = ["a" + str(idx + 1) for idx in range(dimension)]

    sample_size = int(dataset_size * sample_ratio)
    # One accumulator per method, in the order: ARS, chi2, MI, Relief.
    score_totals = [[0] * dimension for _ in range(4)]
    print(sample_size)

    for _ in range(number_of_test):
        # ARS is scored on a sample drawn from the full dataset.
        sample_set = generate_sample_set(dataset, sample_size)
        list_of_pairs = all_pairs(sample_set)
        ars_scores = select_features_ars(dimension, list_of_attributes, sample_set, list_of_pairs)

        # NOTE(review): the three other methods are scored on the training
        # part returned by prepare_all, which is not guaranteed to contain
        # the same rows as sample_set (test_size is set to sample_ratio and
        # random_state is fixed to 1) - confirm whether this is intended.
        X_train_enc, y_train_enc, X_test_enc = prepare_all(X, y, test_size=sample_ratio, random_state=1)
        _, _, fs_chi = select_all_features_chi2(X_train_enc, y_train_enc, X_test_enc)
        _, _, fs_mut = select_all_features_mutual(X_train_enc, y_train_enc, X_test_enc)

        # Relief: keep all attributes so that a weight exists for each one.
        relief_scores = relief.Relief(n_features=dimension)
        relief_scores.fit_transform(X_train_enc, y_train_enc)

        raw_scores = (ars_scores, fs_chi.scores_, fs_mut.scores_, relief_scores.w_)
        for running, scores in zip(score_totals, raw_scores):
            # Normalization factor: sum of the first `dimension` scores.
            normalizer = sum(scores[i] for i in range(dimension))
            # +0.01 avoids a division by zero when all scores are 0.
            for j in range(dimension):
                running[j] += scores[j] / (normalizer + 0.01)

    averaged = [
        [total * (1 / number_of_test) for total in running]
        for running in score_totals
    ]

    plot_titles = ("ARS", "CHI-SQUARE", "MUTUAL INFORMATION", "RELIEF")
    for title, scores in zip(plot_titles, averaged):
        pyplot.title(title)
        pyplot.bar(attribute_names, scores)
        pyplot.show()

    # LaTeX rows; labels are left-padded to a common width.
    row_labels = ("   ars", "  chi2", "    mi", "relief")
    for label, scores in zip(row_labels, averaged):
        display_score_latex(dimension, label, scores)

    return True

## ACTION 10: Comparing score methods

In [None]:
# Dataset to score. Only one assignment is active; the alternatives are kept
# commented out for quick switching (the first one used to be a live
# assignment that was immediately overwritten by the next line).
#filename="datasets-tested/primary-tumor.data-no_missing.csv"
#filename="datasets-tested/dataset_general.csv"
filename="datasets-tested/1-monks-1.csv"
# Number of repetitions averaged, and fraction of the dataset used per sample.
number_of_test = 2
sample_ratio   = 0.33
compare_score_on_dataset(filename,number_of_test,sample_ratio)

## ACTION 11: Comparing feature relevance score effectiveness on logistic regression

In [None]:
def test_binary_dataset(filename,k): #comparing accuracies by running logistic regression on k best features
    """Compare accuracies obtained with the k best features of four methods.

    The dataset is split with a shuffled StratifiedKFold (FOLDS splits). On
    every split the k best features are selected on the training fold with
    chi-square, mutual information, Relief and ARS, and `accuracy` is
    computed on the test fold restricted to those features.

    Parameters:
        filename: dataset passed to load_dataset.
        k: number of features kept by every selection method.

    Returns:
        Tuple (baseline accuracy with all features, mean ARS accuracy,
        mean chi2 accuracy, mean MI accuracy, mean Relief accuracy).
    """
    dataset, X, y, dimension = load_dataset(filename)
    print("data shape:",X.shape, "dimension", dimension,"nb of best features:",k)
    FOLDS=2
    # Attributes are identified by their index 0 .. dimension-1.
    list_of_attributes = list(range(dimension))
    acc_baseline_all_features=baseline_for_binary_with_all(X, y,FOLDS) # FOLDS-fold cross validation
    list_of_accuracy_ars=[]
    list_of_accuracy_chi2=[]
    list_of_accuracy_mi=[]
    list_of_accuracy_relief=[]
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True)
    for train, test in skf.split(X,y):
        X_train_chi2, X_test_chi2, _ = select_k_features_chi2(X[train], y[train], X[test], k)
        X_train_mi, X_test_mi, _ = select_k_features_mi(X[train], y[train], X[test], k)
    # RELIEF feature selection
        r = relief.Relief(n_features=k)
        X_train_relief = r.fit_transform(X[train], y[train])
        # Reuse the weights learned on the training fold: re-fitting on the
        # test fold would use the test labels and could keep different columns.
        X_test_relief = r.transform(X[test])
    # ARS feature selection
        sample_set = dataset[train]
        list_of_pairs = all_pairs(sample_set)
        ars_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)
        # Indices of the k highest ARS scores, best first.
        best_k = np.argsort(np.array(ars_scores))[::-1][:k]
        #print("ars index",best_k)
        #print("ars scores:",ars_scores)
        # Transform the dataset to keep only the k relevant features.
        X_train_ars = X[train][:, best_k]
        X_test_ars = X[test][:, best_k]
        a=accuracy(X_train_ars,y[train],X_test_ars,y[test])
        list_of_accuracy_ars.append(a)
    # CHI2
        a=accuracy(X_train_chi2,y[train],X_test_chi2,y[test])
        list_of_accuracy_chi2.append(a)
    # MI
        a=accuracy(X_train_mi,y[train],X_test_mi,y[test])
        list_of_accuracy_mi.append(a)
    # RELIEF
        a=accuracy(X_train_relief,y[train],X_test_relief,y[test])
        list_of_accuracy_relief.append(a)

    acc_ars   = mean(list_of_accuracy_ars)
    acc_chi2  = mean(list_of_accuracy_chi2)
    acc_mi    = mean(list_of_accuracy_mi)
    acc_relief= mean(list_of_accuracy_relief)
    return(acc_baseline_all_features,acc_ars,acc_chi2,acc_mi,acc_relief)
    
def test_categorical_dataset(filename,k):
    """Compare accuracies obtained with the k best features of four methods.

    Inputs and targets are encoded with prepare_input / prepare_target, then
    split with a shuffled StratifiedKFold (FOLDS splits). On every split the
    k best features are selected on the training fold with chi-square,
    mutual information, Relief and ARS, and `accuracy` is computed on the
    test fold restricted to those features.

    Parameters:
        filename: dataset passed to load_dataset.
        k: number of features kept by every selection method.

    Returns:
        Tuple (baseline accuracy with all features, mean ARS accuracy,
        mean chi2 accuracy, mean MI accuracy, mean Relief accuracy).
    """
    dataset, X, y, dimension = load_dataset(filename)
    print("data shape:",X.shape, "dimension", dimension,"nb of used best features:",k)
    FOLDS=10
    # Attributes are identified by their index 0 .. dimension-1.
    list_of_attributes = list(range(dimension))
    acc_baseline_all_features=baseline_for_categorical_with_all(X,y,FOLDS) # FOLDS-fold cross validation
    list_of_accuracy_ars=[]
    list_of_accuracy_chi2=[]
    list_of_accuracy_mi=[]
    list_of_accuracy_relief=[]
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True)
    X_enc=prepare_input(X)
    y_enc=prepare_target(y)
    for train, test in skf.split(X,y):
        X_train_chi2_enc, X_test_chi2_enc, _ = select_k_features_chi2(X_enc[train], y_enc[train], X_enc[test], k)
        X_train_mi_enc, X_test_mi_enc, _ = select_k_features_mi(X_enc[train], y_enc[train], X_enc[test], k)
    # RELIEF feature selection
        r = relief.Relief(n_features=k)
        X_train_relief_enc = r.fit_transform(X_enc[train], y_enc[train])
        # Reuse the weights learned on the training fold: re-fitting on the
        # test fold would use the test labels and could keep different columns.
        X_test_relief_enc = r.transform(X_enc[test])
    # ARS feature selection (scored on the raw, non-encoded training rows)
        sample_set = dataset[train]
        list_of_pairs = all_pairs(sample_set)
        ars_scores = select_features_ars(dimension,list_of_attributes,sample_set,list_of_pairs)
        # Indices of the k highest ARS scores, best first.
        best_k = np.flipud(np.argsort(np.array(ars_scores)))[0:k]
        #print("ars index",best_k)
        #print("ars scores:",ars_scores)
        # Keep only the k relevant features (the previous np.delete call
        # removed them instead). Assumes prepare_input keeps one column per
        # attribute, in the original order - TODO confirm.
        X_train_ars_enc = X_enc[train][:, best_k]
        X_test_ars_enc  = X_enc[test][:, best_k]
        # Encoded targets, consistently with the three other methods.
        a=accuracy(X_train_ars_enc,y_enc[train],X_test_ars_enc,y_enc[test])
        list_of_accuracy_ars.append(a)
    # CHI2 feature selection
        a=accuracy(X_train_chi2_enc,y_enc[train],X_test_chi2_enc,y_enc[test])
        list_of_accuracy_chi2.append(a)
    # MI feature selection
        a=accuracy(X_train_mi_enc,y_enc[train],X_test_mi_enc,y_enc[test])
        list_of_accuracy_mi.append(a)
    # RELIEF
        a=accuracy(X_train_relief_enc,y_enc[train],X_test_relief_enc,y_enc[test])
        list_of_accuracy_relief.append(a)

    acc_ars   = mean(list_of_accuracy_ars)
    acc_chi2  = mean(list_of_accuracy_chi2)
    acc_mi    = mean(list_of_accuracy_mi)
    acc_relief= mean(list_of_accuracy_relief)
    return(acc_baseline_all_features,acc_ars,acc_chi2,acc_mi,acc_relief)

In [None]:
# Evaluate the feature-selection methods on each listed dataset, for several
# numbers k of kept features derived from the dataset dimension.
data_path="datasets-tested/"
# To run on every file of the folder instead, use:
#file_to_be_tested=os.listdir(data_path)
file_to_be_tested=["arcene_train.data.csv"]
for filename in file_to_be_tested:
    dataset = read_csv(data_path+filename, header=None)
    data = dataset.values
    # All columns but one are counted as attributes (presumably the last
    # column is the class label - confirm against load_dataset).
    dimension=data.shape[1] - 1

    # Candidate values of k, de-duplicated and sorted so that the runs (and
    # the printed list) come out in a reproducible, increasing order.
    k_list = sorted({dimension//5, dimension//4, dimension//3, dimension//2, dimension-1})
    print("Tested dataset:",filename,"- dimension:",dimension," - k best dimension tested:",k_list)
    for num in k_list:
        # Small dimensions can produce k = 0; selecting 0 features is skipped.
        if num < 1:
            continue
        acc_baseline_all_features,acc_ars,acc_chi2,acc_mi,acc_relief=test_categorical_dataset(data_path+filename,num)
        print("all:",acc_baseline_all_features," - ars:",acc_ars," - chi2:",acc_chi2," - mi:",acc_mi," - relief:",acc_relief)