_Supervised learning for classification_

Feature selection is the process of selecting a subset of relevant features from the original set of features to improve the performance of a supervised learning classification model.

### 2.1 Load the previous results

In [None]:
import os
import pickle
import pandas as pd
import plotly.express as px
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Restore the (data, labels) pair pickled by the previous notebook.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open("outputs/01_Variables.pkl", 'rb') as file:
    data, labels = pickle.load(file)
# Flip the matrix orientation; the scikit-learn selectors below are fitted on
# `data` together with `y`.
data = data.T
y = labels["Group"]
# Quick sanity check of the three objects used throughout the notebook.
for caption, obj in (("Data shape:", data), ("Labels shape:", labels), ("y shape:", y)):
    print(caption, obj.shape)

In [None]:
def PCA_3D(input_data, title, labels):
    """Show an interactive 3-D scatter plot of the first three principal components.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Matrix handed to PCA unchanged; each row becomes one point and the
        row index values are matched against ``labels["label"]``.
    title : str
        Title of the plotly figure.
    labels : pandas.DataFrame
        Metadata table. It must contain a ``"label"`` column (join key, also
        used as hover text) and a ``"Group"`` column (point colour).

    Returns
    -------
    None
        The figure is rendered through ``fig.show()``.
    """
    pca = PCA(n_components=3)
    components = pca.fit_transform(input_data)
    pca_data = pd.DataFrame(components, columns=["PC 1", "PC 2", "PC 3"])
    # Rows of `components` follow the row order of `input_data`, so the index
    # can be attached positionally.
    pca_data["sample"] = input_data.index
    # Inner join: rows without a matching entry in `labels` are dropped.
    pca_data = pd.merge(pca_data, labels, left_on='sample', right_on="label")
    fig = px.scatter_3d(pca_data, x='PC 1', y='PC 2', z='PC 3', color='Group',
                        hover_name="label", title=title)
    fig.show()

### 2.2 Removing features with low variance

VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn't meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.

In [None]:
# Variance cut-off. p * (1 - p) is the variance of a 0/1 feature that takes one
# value in a fraction p of the samples (the boolean-feature recipe from the
# scikit-learn user guide); for continuous data it is simply a numeric cut-off.
percentage_cf = 0.8
selector = VarianceThreshold(threshold=(percentage_cf * (1 - percentage_cf)))
# The selector is unsupervised, so only the feature matrix is needed.
selector.fit(data)
cols_idxs = selector.get_support(indices=True)
# Slice the original DataFrame so index and column names are preserved.
filtered_data = data.iloc[:, cols_idxs]
# Report both shapes in the same orientation so the feature counts are comparable.
print("Original shape: \t"+str(data.shape))
print("After filtering:\t"+str(filtered_data.shape))
print("Removed features:\t"+str(data.shape[1]-filtered_data.shape[1]))

In [None]:
# Visual check of the sample structure after the variance filter.
PCA_3D(
    input_data=filtered_data,
    title="Removing features with low variance",
    labels=labels,
)

In [None]:
# Optional step: run this cell only if you want to accept the selection above.
# It rebinds `data` to `filtered_data`, so the following cells start from the
# reduced feature set.
data = filtered_data

### 2.3 Univariate feature selection

Univariate feature selection is a type of feature selection method used in machine learning and statistics. It aims to select the most relevant features from a dataset based on their individual relationship with the target variable, without considering the interactions or dependencies between features.

In univariate feature selection, each feature is evaluated independently and assigned a score or ranking based on its relationship with the target variable. The scores are then used to select the top-k features that exhibit the strongest relationship with the target.

In [None]:
# Number of top features to keep: 10 % of the current features (use a fixed
# value such as k = 2000 instead if preferred). max(1, ...) prevents k == 0,
# which would yield an empty matrix when fewer than 10 features remain.
k = max(1, int(data.shape[1] * 0.1))
# f_classif scores each feature independently with a one-way ANOVA F-test.
univariate_selector = SelectKBest(score_func=f_classif, k=k)
filtered_data_array = univariate_selector.fit_transform(data, y)
# Get the selected feature indices
selected_features_indices = univariate_selector.get_support(indices=True)
# Convert the filtered data back to a DataFrame, restoring sample and feature names
filtered_data = pd.DataFrame(filtered_data_array, index=data.index, columns=data.columns[selected_features_indices])
print(f"Shape after univariate feature selection: {filtered_data.shape}")

In [None]:
# Visual check of the sample structure after univariate selection.
PCA_3D(
    input_data=filtered_data,
    title="Univariate feature selection",
    labels=labels,
)

In [None]:
# Optional step: run this cell only if you want to accept the selection above.
# It rebinds `data` to `filtered_data`, so the following cells start from the
# reduced feature set.
data = filtered_data

### 2.4 Feature Selection using Mutual Information

Mutual Information measures the dependency between variables. It can capture non-linear relationships between features and the target variable.


In [None]:
# Number of top features to keep: 50 % of the current features (use a fixed
# value such as k = 1000 instead if preferred). max(1, ...) prevents k == 0.
k = max(1, int(data.shape[1] * 0.5))


def _seeded_mutual_info(X, y):
    """Score features with mutual information using a fixed seed.

    mutual_info_classif is stochastic; fixing random_state keeps the selected
    feature set identical across reruns (same seed as the random forest used
    in the RFE section).
    """
    return mutual_info_classif(X, y, random_state=42)


mi_selector = SelectKBest(score_func=_seeded_mutual_info, k=k)
filtered_data_array = mi_selector.fit_transform(data, y)
# Get the selected feature indices
selected_features_indices = mi_selector.get_support(indices=True)
# Convert the filtered data back to a DataFrame, restoring sample and feature names
filtered_data = pd.DataFrame(filtered_data_array, index=data.index, columns=data.columns[selected_features_indices])
print(f"Shape after mutual information selection: {filtered_data.shape}")

In [None]:
# Visual check of the sample structure after mutual-information selection.
PCA_3D(
    input_data=filtered_data,
    title="Mutual information selection",
    labels=labels,
)

In [None]:
# Optional step: run this cell only if you want to accept the selection above.
# It rebinds `data` to `filtered_data`, so the following cells start from the
# reduced feature set.
data = filtered_data

### 2.5 Recursive Feature Elimination (RFE)

In [None]:
# Number of features RFE should keep.
k = 50
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
model = RandomForestClassifier(n_estimators=100, random_state=42)
# step=50: drop the 50 least important features at each elimination round.
rfe_selector = RFE(estimator=model, n_features_to_select=k, step=50)
# The target is the class vector `y` (labels["Group"]), not the whole `labels`
# table, which also carries the sample-name column.
filtered_data_array = rfe_selector.fit_transform(data_scaled, y)
# Get the selected feature indices
selected_features_indices = rfe_selector.get_support(indices=True)
# Convert the filtered data back to a DataFrame.
# Note: the values come from `data_scaled`, i.e. they are standardized.
filtered_data = pd.DataFrame(filtered_data_array, index=data.index, columns=data.columns[selected_features_indices])
print(f"Shape after RFE selection: {filtered_data.shape}")

In [None]:
# Visual check of the sample structure after recursive feature elimination.
PCA_3D(
    input_data=filtered_data,
    title="RFE selection",
    labels=labels,
)

In [None]:
# Optional step: run this cell only if you want to accept the RFE selection.
# It rebinds `data` to `filtered_data`, so the heatmap and the final save use
# the reduced feature set.
data = filtered_data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import colormaps
from matplotlib.colors import to_hex

# One colour per group, sampled evenly from the 'tab20' colormap.
colormap = colormaps.get_cmap('tab20')
colors = [to_hex(colormap(i)) for i in np.linspace(0, 1, len(labels["Group"].unique()))]
lut = dict(zip(labels["Group"].unique(), colors))
col_colors = labels["Group"].map(lut)
# Index the colours by sample name so seaborn can align them with the heatmap columns.
col_colors.index = labels["label"]
# clustermap builds its own figure, so size is passed here rather than through
# a separate plt.figure() call (which would only produce an extra empty figure).
g = sns.clustermap(data.transpose(), annot=False, cmap='coolwarm', col_colors=col_colors,
                   xticklabels=True, yticklabels=True, figsize=(10, 8))
# High resolution so the per-feature / per-sample tick labels stay legible.
g.ax_heatmap.figure.set_dpi(600)
# Adjust the font size of the x and y tick labels
g.ax_heatmap.tick_params(axis="both", labelsize=8)
plt.show()

In [None]:
# Extract the feature importances for the features kept by RFE.
# RFE fits its own copy of the estimator, so `model` is refitted here on the
# selected features only. The target is the class vector `y`, not the whole
# `labels` table.
model.fit(filtered_data, y)
feature_importances = model.feature_importances_

# Sort the features by importance (descending)
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_features = filtered_data.columns[sorted_indices]
sorted_importances = feature_importances[sorted_indices]
# Maximum number of bars to draw.
n = 100
# Number of bars actually drawn, taken from the data itself rather than from a
# variable defined in an earlier cell.
n_shown = min(n, len(sorted_features))
# Plot the feature importances
plt.figure(figsize=(10, 8))
plt.title(f"Top {n_shown} Feature Importances after RFE")
plt.barh(sorted_features[:n_shown], sorted_importances[:n_shown], color="royalblue")
plt.gca().invert_yaxis()  # Highest importance at the top
plt.xlabel("Feature Importance")
plt.ylabel("Feature Name")
plt.yticks(fontsize=5)
plt.show()

### 2.6 Save variables for next steps

In [None]:
directory = 'outputs'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without a separate existence check.
os.makedirs(directory, exist_ok=True)
# Save the selected feature matrix, the sample metadata and the class vector
# for the next notebook; the path is derived from `directory`.
with open(os.path.join(directory, '02_Variables.pkl'), 'wb') as file:
    pickle.dump((data, labels, y), file)