# Implementing FRI
Author: Gilles Richard
Date  : 2023
The program for the candidate paper "FEATURE RELEVANCE INDEX"


## ACTION 1: Importing libraries

## This code is an implementation of FRI, which is intended to overcome the curse of dimensionality faced by ARI.

In [None]:
# pip3 install numpy matplotlib pandas scikit-learn sklearn_relief skrebate
# (the "statistics" module ships with Python and needs no installation)
import numpy as np
from math import *
from random import randint
import matplotlib.pyplot as plt
from pandas import read_csv

# FOR CHI-SQUARE - MUTUAL INFORMATION - RELIEF
from sklearn.feature_selection import chi2,mutual_info_classif
import sklearn_relief as relief
from skrebate import ReliefF
from sklearn.preprocessing import OneHotEncoder

# FOR CROSS FOLD VALIDATION
from sklearn.model_selection import StratifiedKFold

# FOR TESTING ON LOGISTIC REGRESSION
from sklearn.metrics import accuracy_score

from statistics import mean

# where all utilities are defined
from utils import *

## ACTION 2: Dataset generation, pair generation and sampling in data_generation.py
## ACTION 3: Baseline logistic regression, decision trees, random forests in baseline.py
## ACTION 4: FRI utilities in fri.py
## ACTION 5: Display functions for latex

In [None]:
def display_k_best_feature(dimension,row_name,feature_list,k):
    """Print one LaTeX table row listing the first ``k`` entries of a ranking.

    The printed row starts with ``row_name``, is followed by one
    `` & $entry$`` cell per entry, and ends with a LaTeX row break
    (two backslashes).

    Args:
        dimension: Unused; kept so existing callers keep working.
        row_name: Text placed in the first cell of the row.
        feature_list: Indexable ranking of features. Entries are converted
            with ``str`` so integer feature indices are accepted as well as
            strings.
        k: Number of leading entries to print.

    Returns:
        Always ``True``.

    Raises:
        IndexError: If ``k`` is larger than ``len(feature_list)``.
    """
    # str() keeps string entries unchanged and avoids a TypeError when the
    # ranking holds integers, matching what display_accuracy already does.
    cells = [" & $" + str(feature_list[i]) + "$" for i in range(k)]
    print(row_name + "".join(cells) + "\\\\")
    return True

def display_accuracy(accuracy_list):
    """Print the accuracy header and the accuracy values as LaTeX table text.

    Everything is emitted with a single ``print`` call: the fixed header
    text, then one `` & $value$`` cell per element of ``accuracy_list``
    (converted with ``str``), then a LaTeX row break (two backslashes).

    Args:
        accuracy_list: Sequence of accuracy values, one per method.

    Returns:
        Always ``True``.
    """
    # NOTE(review): the header contains a lone "$" and still lists "relief";
    # it is reproduced unchanged here -- confirm the intended LaTeX.
    header = "accuracy & $ fri_1 & fri_2 & fri_3 & fri_4 & chi & mi & relief \\\\"
    cells = "".join(" & $" + str(value) + "$" for value in accuracy_list)
    print(header + cells + "\\\\")
    return True

## ACTION 6: Features rank for FRI - chi2 - mutual information

In [None]:
def all_features_rank(filename,number_of_random_sample):
    """Rank the features of a dataset with four FRI settings, chi2 and mutual information.

    The CSV file is read without a header as integers. ``dimension`` is the
    number of columns minus one (presumably the remaining column is the class
    label -- confirm against ``load_dataset_string``).
    ``number_of_random_sample`` random subsamples are written to
    ``tests/sample_<n>.csv`` (5% of the rows when the dataset has more than
    1000 rows, 30% otherwise). Each subsample is scored by every method, the
    scores are averaged over the subsamples, and each averaged score list is
    passed to ``feature_index_rank``.

    Args:
        filename: Path of the headerless integer CSV dataset.
        number_of_random_sample: Number of random subsamples to draw and
            average over.

    Returns:
        A 6-tuple of the values returned by ``feature_index_rank``, in the
        order: FRI with ``k = int(dimension / 2)``, FRI with ``k + 3``, FRI
        with ``k + 10``, FRI with ``dimension - 2``, chi2, mutual information.
    """
    dataframe = read_csv(filename, header=None, dtype="int")
    n_rows, n_columns = dataframe.shape
    dimension = n_columns - 1
    k_fri = int(dimension / 2)
    ratio = 0.05 if n_rows > 1000 else 0.3

    # Draw every subsample first and store each one in its own CSV file.
    sample_files = []
    for sample_number in range(1, number_of_random_sample + 1):
        sample_file = "tests/sample_" + str(sample_number) + ".csv"
        dataframe.sample(frac=ratio).to_csv(sample_file, index=False, header=False)
        sample_files.append(sample_file)

    # The four FRI settings; the last one was added for small samples.
    fri_k_values = (k_fri, k_fri + 3, k_fri + 10, dimension - 2)

    # One running total per method: four FRI variants, then chi2, then
    # mutual information. Relief scoring is currently disabled.
    score_totals = [[0] * dimension for _ in range(len(fri_k_values) + 2)]

    for sample_file in sample_files:
        sample_set, X, y, _ = load_dataset_string(sample_file)
        set_of_pairs = all_pairs(sample_set)

        sample_scores = []
        for k in fri_k_values:
            fri_scores, _ = get_fri_scores(dimension, set_of_pairs, k)
            sample_scores.append(fri_scores)

        X = X.astype(int)
        chi_scores, _ = chi2(X, y)
        sample_scores.append(chi_scores)
        sample_scores.append(
            mutual_info_classif(
                X,
                y,
                discrete_features='auto',
                n_neighbors=3,
                copy=True,
                random_state=None,
            )
        )

        # Accumulate the raw scores; they are not normalised.
        for totals, scores in zip(score_totals, sample_scores):
            for j in range(dimension):
                totals[j] += scores[j]

    # Turn the totals into averages over the subsamples.
    mean_scores = [
        [total * (1 / number_of_random_sample) for total in totals]
        for totals in score_totals
    ]

    # Convert every averaged score list into a feature ranking.
    return tuple([feature_index_rank(scores) for scores in mean_scores])

## ACTION 7: Comparing feature score effectiveness on baseline

In [None]:
# Datasets to be tested (in Overleaf table order).
_datasets_dir = "real-datasets/"
dataset_1 = _datasets_dir + "covid_ready.csv"
dataset_2 = _datasets_dir + "bootcamp/bootcamp2016_ready.csv"
dataset_3 = _datasets_dir + "audio_ready.csv"
dataset_4 = _datasets_dir + "splice_ready.csv"
dataset_5 = _datasets_dir + "student_ready.csv"
dataset_6 = _datasets_dir + "bike_ready.csv"

# Datasets processed by this run. To process all of them, use
# [dataset_1, dataset_2, dataset_3, dataset_4, dataset_5, dataset_6] instead.
list_of_datasets = [dataset_1]

# One label per feature-ranking method, in the order the rankings are
# produced ("relief" is currently disabled, leaving 6 methods).
list_of_methods = [
    "fri_1",
    "fri_2",
    "fri_3",
    "fri_4",
    "chi",
    "mi",
]

# Number of random subsamples used to compute the average score of each method.
number_of_random_sample = 10
# For every dataset: rank the features with each method, then measure the
# random-forest baseline accuracy after removing the last-ranked features.
for dataset in list_of_datasets:
    clean_folder("tests/")
    #INFO ON DATASET
    size,dimension,number_of_class=get_info(dataset)
    # Be careful because of stratified ... if we have more than 5 classes
    folds = 10 if size > 500 else 5
    print("dataset_name:",dataset,"dimension:", dimension,"-- size:",size,"--number_of_class:",number_of_class,"--cross_valid:",folds)
    bf_fri_1,bf_fri_2,bf_fri_3,bf_fri_4, bf_chi,bf_mi=all_features_rank(dataset,number_of_random_sample)
    list_of_ranked_features=[bf_fri_1,bf_fri_2,bf_fri_3,bf_fri_4, bf_chi,bf_mi]
    for i in list_of_ranked_features:
        print(i)
    clean_folder("tests/")
    #MAIN LOOP
    removal_counts = [int(dimension/3),int(dimension/2),dimension-10,dimension-4,dimension-2,dimension-1]
    for number_to_remove in removal_counts:
        if number_to_remove < 0:
            # The dataset has too few features for this setting. A negative
            # count would turn ranked[-number_to_remove:] into ranked[k:],
            # removing the wrong columns and reporting more kept features
            # than exist, so the setting is skipped.
            continue
        print("********************* dataset"+dataset+"*********************")
        print("** dimension:",dimension," - n0 features removed:",number_to_remove," - n0 features kept:",dimension-number_to_remove,"**")
        accuracy_list_lr,accuracy_list_dt, accuracy_list_rf=[],[],[]
        for method_name, ranked_features in zip(list_of_methods, list_of_ranked_features):
            new_file_name="tests/removedFeatures_"+str(number_to_remove)+"-"+method_name+".csv"
            if number_to_remove==0:
                columns_to_remove=[]
            else:
                # The last entries of the ranking are the ones removed.
                columns_to_remove=ranked_features[-number_to_remove:]
            remove_column(dataset,new_file_name,columns_to_remove)
            data, X, y, _=load_dataset(new_file_name)
            # Logistic-regression and decision-tree baselines are disabled:
            #acc=baseline_lr_accuracy(X, y,folds)
            #accuracy_list_lr.append(round(acc,2))
            #acc=baseline_dt_accuracy(X, y,folds)
            #accuracy_list_dt.append(round(acc,2))
            acc=baseline_rf_accuracy(X, y,folds)
            accuracy_list_rf.append(round(acc,2))
        #display_accuracy(accuracy_list_lr)
        #display_accuracy(accuracy_list_dt)
        display_accuracy(accuracy_list_rf)