<a href="https://colab.research.google.com/github/ferdmartin/LearningLeap/blob/main/Feature_Selection_Filter_%26_Wrapper_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Selection - Filter & Wrapper Methods

# Filter Method

In [None]:
import numpy as np
import pandas as pd
from scipy.io import arff

In [None]:
# Load the ARFF file; loadarff returns a (records, metadata) pair and we only need the records.
raw_data = arff.loadarff('veh-prime.arff')
data = pd.DataFrame(raw_data[0])
# The class labels arrive as byte strings: decode them, then encode as 0/1.
# Assigning the mapped column back (instead of calling replace(..., inplace=True) on a
# column selection) keeps this working under pandas Copy-on-Write, where an in-place
# call on `data["CLASS"]` would only modify a temporary object.
data["CLASS"] = data["CLASS"].str.decode("utf-8").map({'noncar': 0, 'car': 1})
# map() turns any label outside the dictionary into NaN, so fail loudly if that happens.
assert data["CLASS"].notna().all(), "unexpected class label in veh-prime.arff"
X = data.drop('CLASS', axis=1)
y = data["CLASS"]

In [None]:
def pearson_correlation(x, y):
    """Return the Pearson correlation coefficient r between two samples.

    Parameters
    ----------
    x, y : array-like
        One-dimensional numeric sequences of equal length (lists, NumPy arrays
        or pandas Series). Values are read by position, so a pandas Series
        with a non-default index is handled correctly.

    Returns
    -------
    float
        The correlation coefficient, or NaN when either input has zero
        variance (the coefficient is undefined in that case).

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same shape.
    """
    # np.asarray drops any pandas index, making every access positional.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)

    if x_values.shape != y_values.shape:
        raise ValueError("x and y must have the same length")
    if x_values.size == 0:
        raise ValueError("x and y must not be empty")

    # Centre first and then accumulate: this avoids the cancellation error of
    # the one-pass E[x^2] - mean^2 formula, which can even go slightly negative.
    x_centered = x_values - x_values.mean()
    y_centered = y_values - y_values.mean()

    denominator = np.sqrt(np.sum(x_centered ** 2) * np.sum(y_centered ** 2))
    if denominator == 0:
        # A constant input has no defined correlation.
        return np.nan
    return np.sum(x_centered * y_centered) / denominator

In [None]:
def Filter_Method(X, y):
    """Rank the columns of X by the absolute Pearson correlation with y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are accessed by position through ``.iloc``.
    y : sequence
        Target values, one per row of X.

    Returns
    -------
    pandas.Series
        |r| for every column, sorted from highest to lowest. The index holds
        each column's position in X (0, 1, 2, ...), not its name.
    """
    feature_count = np.shape(X)[1]
    abs_correlations = [
        abs(pearson_correlation(X.iloc[:, position], y))
        for position in range(feature_count)
    ]
    ranking = pd.Series(abs_correlations)
    return ranking.sort_values(ascending=False)

# Score every feature, then relabel the positional index (0, 1, ...) as 'f0', 'f1', ...
# These labels are used later to select columns of X.
r_s = Filter_Method(X, y)
r_s.index = ['f' + str(position) for position in r_s.index]

## (1) List the features from highest |r| (the absolute value of r) to lowest, along with their |r| values. Why would one be interested in the absolute value of r rather than the raw value?

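We use the absolute value because |r| measures the strength of the linear relationship between a feature and the class label. For ranking features, only that strength matters: a strongly negative correlation is just as informative as a strongly positive one. The sign of the raw value only tells us the direction of the relationship, which we do not need here.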

In [None]:
# Show the ranked |r| scores as a one-column table, strongest feature first.
r_s.to_frame(name='Correlation').sort_values(by='Correlation', ascending=False)

Unnamed: 0,Correlation
f4,0.436922
f13,0.368269
f14,0.368224
f16,0.366025
f7,0.352141
f22,0.35135
f26,0.341043
f1,0.308811
f20,0.299049
f31,0.290783


## (2) Select the features that have the highest m values of |r|, and run LOOCV on the dataset restricted to only those m features. Which value of m gives the highest LOOCV classification accuracy, and what is the value of this optimal accuracy?

m = 20 gives the highest LOOCV classification accuracy, which is 0.950355.

In [None]:
# Defining KNN function
def KNN(X_train, y_train, X_test,k=3):
    """Predict the class of one query point by k-nearest-neighbour voting.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one row per sample.
    y_train : pandas.Series
        Training labels sharing X_train's index. The Series must be named,
        because its name is used as the merge and group-by key.
    X_test : array-like
        The query point, broadcastable against X_train's rows.
    k : int, default 3
        Number of nearest neighbours that vote.

    Returns
    -------
    The label with the most votes among the k nearest neighbours. When several
    labels share the top vote count, the one with the smallest summed distance wins.
    """
    distance_column = "Euclidean_distance"

    # Euclidean distance from the query point to every training row
    squared_differences = (X_test - X_train) ** 2
    distances = (np.sum(squared_differences, axis=1) ** (1/2)).rename(distance_column)

    # Keep the k closest training samples; each contributes one vote
    nearest = (
        pd.merge(y_train, distances, left_index=True, right_index=True)
        .sort_values(distance_column)
        .iloc[:k]
        .assign(Count=1)
    )

    # Per class: total distance and number of votes, most-voted class first
    votes = nearest.groupby(y_train.name).sum().sort_values("Count", ascending=False)
    leaders = votes[votes["Count"] == max(votes.Count)]

    # Tied classes are separated by the lowest total distance
    if len(leaders) > 1:
        return leaders.sort_values(distance_column).index[0]
    return votes.index[0]

In [None]:
def FilterMethod(X,y, r_s, k_fold):
    """Cross-validated KNN accuracy for the top-m ranked features, for every m.

    Features are added one at a time in the order given by ``r_s.index``. For
    each prefix of m features the data is split into consecutive folds of
    ``len(X) // k_fold`` rows, and every held-out row is classified with KNN
    (k=7) trained on the remaining rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix whose column names include the labels in ``r_s.index``.
    y : pandas.Series
        Class labels aligned with X. Must be named, because KNN uses the name.
    r_s : pandas.Series
        Feature ranking; only its index (feature names, best first) is used.
    k_fold : int
        Number of folds. ``len(X)`` gives leave-one-out cross-validation.
        If ``len(X)`` is not a multiple of the fold size, the trailing rows
        that do not fill a complete fold are never held out.

    Returns
    -------
    pandas.Series
        Accuracy indexed by m (1 = best feature only, 2 = best two, ...).

    Raises
    ------
    ValueError
        If ``k_fold`` is not between 1 and ``len(X)``. A fold size of zero
        would otherwise make the fold loop run forever.
    """
    n = len(X)
    if not 1 <= k_fold <= n:
        raise ValueError(f"k_fold must be between 1 and len(X)={n}, got {k_fold}")
    fold_size = n // k_fold

    Ms = {}
    columns = []
    for M, column in enumerate(r_s.index, start=1):
        columns.append(column)
        # Restrict to the current top-M features once, not once per fold.
        X_subset = X[columns]
        results_for_accuracy = []

        for start in range(0, n - fold_size + 1, fold_size):  # Cross-validation
            stop = start + fold_size
            indexes_to_be_removed = X.index[start:stop]
            X_train_CV = X_subset.drop(indexes_to_be_removed, axis=0)
            y_train_CV = y.drop(indexes_to_be_removed, axis=0)
            X_test_CV = X_subset.iloc[start:stop]
            y_test_CV = y.iloc[start:stop]

            # KNN classifies a single query point, so score each held-out row
            # on its own; this keeps folds larger than one row correct.
            for test_row, true_label in zip(X_test_CV.values, y_test_CV.values):
                prediction = KNN(X_train_CV, y_train_CV, test_row.reshape(1, -1), k=7)
                results_for_accuracy.append(int(prediction == true_label))

        Ms[M] = np.average(results_for_accuracy)
    return pd.Series(Ms)

In [None]:
# Leave-one-out CV: use as many folds as there are samples.
cv_results = FilterMethod(X, y, r_s, k_fold=len(X))
# Keep every m that reaches the best accuracy.
cv_results[cv_results.eq(cv_results.max())]

20    0.950355
dtype: float64

# Q4: Wrapper Method

## (1) Show the set of selected features at each step, as it grows from size zero to its final size (increasing in size by exactly one feature at each step).

In [None]:
def WrapperMethod(X, y, k_fold):
    """Greedy forward feature selection scored by cross-validated KNN accuracy.

    Starting from an empty feature set, each round evaluates every column not
    yet selected together with the columns already selected, and adds the one
    with the highest accuracy (the first such column on ties). The search
    stops as soon as the best candidate does not strictly improve on the
    current accuracy. Every accepted step is printed as
    ``Columns: [...]<TAB>accuracy``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Class labels aligned with X. Must be named, because KNN uses the name.
    k_fold : int
        Number of folds. ``len(X)`` gives leave-one-out cross-validation.
        Folds are consecutive blocks of ``len(X) // k_fold`` rows; trailing
        rows that do not fill a complete fold are never held out.

    Returns
    -------
    None
        Results are reported through ``print`` only.

    Raises
    ------
    ValueError
        If ``k_fold`` is not between 1 and ``len(X)``. A fold size of zero
        would otherwise make the fold loop run forever.
    """
    n = len(X)
    if not 1 <= k_fold <= n:
        raise ValueError(f"k_fold must be between 1 and len(X)={n}, got {k_fold}")
    fold_size = n // k_fold

    model_acc = 0
    model_columns = []
    for _ in X.columns:
        # Candidates are the columns not selected so far, in X's column order.
        candidates = [column for column in X.columns if column not in model_columns]

        Ms = {}
        for column in candidates:
            # Candidate first, then the already selected features; built once
            # per candidate instead of concatenating frames inside every fold.
            X_trial = X[[column] + model_columns]
            results_for_accuracy = []

            for start in range(0, n - fold_size + 1, fold_size):  # Cross-validation
                stop = start + fold_size
                indexes_to_be_removed = X.index[start:stop]
                X_train_CV = X_trial.drop(indexes_to_be_removed, axis=0)
                y_train_CV = y.drop(indexes_to_be_removed, axis=0)
                X_test_CV = X_trial.iloc[start:stop]
                y_test_CV = y.iloc[start:stop]

                # KNN classifies a single query point, so score each held-out
                # row on its own; this keeps folds larger than one row correct.
                for test_row, true_label in zip(X_test_CV.values, y_test_CV.values):
                    prediction = KNN(X_train_CV, y_train_CV, test_row.reshape(1, -1), k=7)
                    results_for_accuracy.append(int(prediction == true_label))

            Ms[column] = np.average(results_for_accuracy)
        Ms = pd.Series(Ms)

        # idxmax returns the first label that reaches the maximum.
        new_column = Ms.idxmax()
        best_acc = Ms.max()

        if best_acc > model_acc:
            model_acc = best_acc
            model_columns.append(new_column)
            print(f'Columns: {model_columns}\t{round(model_acc,6)}')
        else:
            # No candidate improves the current accuracy: stop growing the set.
            break

In [None]:
# Forward selection scored with leave-one-out CV (one fold per sample).
WrapperMethod(X, y, k_fold=len(X))

Columns: ['f20']	0.754137
Columns: ['f20', 'f10']	0.84279
Columns: ['f20', 'f10', 'f19']	0.888889
Columns: ['f20', 'f10', 'f19', 'f8']	0.916076
Columns: ['f20', 'f10', 'f19', 'f8', 'f7']	0.93617
Columns: ['f20', 'f10', 'f19', 'f8', 'f7', 'f14']	0.951537
Columns: ['f20', 'f10', 'f19', 'f8', 'f7', 'f14', 'f2']	0.952719
Columns: ['f20', 'f10', 'f19', 'f8', 'f7', 'f14', 'f2', 'f1']	0.959811
Columns: ['f20', 'f10', 'f19', 'f8', 'f7', 'f14', 'f2', 'f1', 'f16']	0.965721


## (2) What is the LOOCV accuracy over the final set of selected features?

With the final feature set ['f20', 'f10', 'f19', 'f8', 'f7', 'f14', 'f2', 'f1', 'f16'], the LOOCV accuracy is 0.965721.