### Feature Selection 

Dataset: Ad-data_prep.csv

Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Notebook-wide pandas display settings: show up to 100 rows and 100 columns,
# and render floats with two decimal places.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)

Load the dataset

In [None]:
# Read the prepared dataset; the first CSV column becomes the row index.
df = pd.read_csv("../datasets/Ad-data_prep.csv", index_col = 0)

In [None]:
df.head()

In [None]:
df.shape

Separate Features from Target

In [None]:
# The two click columns form the target; every other column is a feature.
target = df.loc[:, ["clicks", "clicks_log"]]
features = df.drop(columns=["clicks", "clicks_log"])

Feature Selection: Low Variance Filter

In [None]:
# Compute the variance and sort
features.var().sort_values()[:60]

In [None]:
# Import the VarianceThreshold Function
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Build the variance filter with a cut-off of 0.01
selector = VarianceThreshold(threshold=0.01)

In [None]:
# Fit the selector on the features and keep only the columns that pass the
# variance cut-off.
filtered_data = selector.fit_transform(features)

# Wrap the result in a DataFrame that reuses the original row index
# (no column labels are supplied at this point).
filtered_features = pd.DataFrame(data=filtered_data, index=features.index)

In [None]:
features.shape

In [None]:
filtered_data.shape

In [None]:
# Note, the DataFrame Created has no Column Names
filtered_features.head()

In [None]:
# get_support() is a method of the fitted selector; its result is used below
# to pick, from the original column labels, the ones the selector kept
selected = selector.get_support()

# Restore the column names on the filtered DataFrame (it was built without any)
filtered_features.columns = features.columns[selected]

In [None]:
features.columns

In [None]:
filtered_features.head()

In [None]:
filtered_features.var().sort_values().head()

In [None]:
# Column counts before and after the variance filter
print("No. of Features (Original): %i" % features.shape[1])
print("No. of Features (Variance Filter): %i" % filtered_features.shape[1])

Feature Selection: High Correlation (Multicollinearity) Filter

Correlation Filter Function

In [None]:
# Absolute pairwise correlation between the variance-filtered features;
# only the strength of each relationship is kept, not its sign
corr_matrix = filtered_features.corr().abs()

In [None]:
# Keep only the entries strictly above the diagonal (everything else becomes NaN)
# so that each feature pair appears once and self-correlations are excluded.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

In [None]:
# Absolute-correlation cut-off: a feature is flagged when its correlation with
# another feature is strictly greater than this value
threshold = 0.90

In [None]:
# Labels of the columns that have at least one upper-triangle correlation above
# the threshold (NaN entries never compare as greater).
to_drop = upper.columns[(upper > threshold).any(axis=0).to_numpy()].tolist()

In [None]:
# View Features to Drop
to_drop

In [None]:
len(to_drop)

In [None]:
# Remove the highly correlated columns; the variance-filtered frame is left untouched
filtered_features_2 = filtered_features.drop(columns=to_drop)

In [None]:
# Column counts after each filtering stage
print("No. of Features (Original): %i" % features.shape[1])
print("No. of Features (Variance Filter): %i" % filtered_features.shape[1])
print("No. of Features (Correlation Filter): %i" % filtered_features_2.shape[1])

Custom Function

In [None]:
def correlation_filter(df, threshold):
    """Return the columns of ``df`` that are highly correlated with an earlier column.

    The absolute correlation matrix of ``df`` is computed and only the entries
    strictly above its diagonal are inspected. Each pair of columns is therefore
    compared once, and for a pair that exceeds the threshold it is the later
    column (in correlation-matrix order) that is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose columns are compared pairwise via ``df.corr()``.
    threshold : float
        A column is flagged when its absolute correlation with any earlier
        column is strictly greater than this value.

    Returns
    -------
    list
        Labels of the flagged columns, in correlation-matrix column order.
        Pairs whose correlation is undefined (NaN) are never flagged.
    """
    corr_matrix = df.corr().abs()

    # True only strictly above the main diagonal: skips self-correlations and
    # the mirrored lower half of the symmetric matrix.
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

    # NaN compares False against the threshold, so undefined correlations are ignored.
    exceeds = (corr_matrix.to_numpy() > threshold) & above_diagonal

    # A single vectorized reduction per column replaces the Python-level any()
    # that previously iterated over every column Series.
    return corr_matrix.columns[exceeds.any(axis=0)].tolist()

In [None]:
# Run the helper on the variance-filtered features with the same threshold as above
function_test = correlation_filter(df=filtered_features, threshold=threshold)

In [None]:
len(function_test)

In [None]:
function_test

Verification

In [None]:
# Print out top correlated features

# Flatten the matrix into one row per (Feature1, Feature2) combination.
# unstack() puts the matrix column label in the outer index level and the row
# label in the inner one, so the levels are named to match that layout.
correlated = (
    corr_matrix.unstack()
    .rename_axis(["Feature2", "Feature1"])
    .rename("Correlation")
    .reset_index()
    .loc[:, ["Feature1", "Feature2", "Correlation"]]
)

# Highest correlations first
corr_sorted = correlated.sort_values("Correlation", ascending=False)

In [None]:
correlated.head()

In [None]:
# Drop the rows that pair a feature with itself
corr_sorted_pairs = corr_sorted.loc[corr_sorted["Feature1"].ne(corr_sorted["Feature2"])]

In [None]:
# Renumber the rows 0..n-1 (old index discarded) so they can be addressed by position
corr_sorted_pairs = corr_sorted_pairs.reset_index(drop=True)

In [None]:
# Retain only 1 row per feature pair.
# (A, B) and (B, A) carry the same correlation value. Taking every second row
# only works when the sort happens to place each mirrored pair on adjacent
# rows, and ties between different pairs do not guarantee that. Keeping the
# orientation where Feature1 sorts before Feature2 selects exactly one row per
# pair regardless of row order.
# NOTE(review): assumes feature labels are unique, mutually comparable strings
# (column names read from the CSV header) - confirm.
corr_sorted_final = corr_sorted_pairs[corr_sorted_pairs["Feature1"] < corr_sorted_pairs["Feature2"]]

In [None]:
corr_sorted_final.Feature1.nunique()

In [None]:
# Keep only the pairs whose correlation is strictly above the threshold
corr_sorted_final_ver = corr_sorted_final.loc[corr_sorted_final["Correlation"].gt(threshold)]

In [None]:
corr_sorted_final_ver

In [None]:
to_drop

View the final file and save to csv

In [None]:
# Column counts after each filtering stage
print("No. of Features (Original): %i" % features.shape[1])
print("No. of Features (Variance Filter): %i" % filtered_features.shape[1])
print("No. of Features (Correlation Filter): %i" % filtered_features_2.shape[1])

In [None]:
# Re-attach the target columns to the selected features, matching rows on the index
df_final = filtered_features_2.join(target, how="left")

In [None]:
df_final.shape

In [None]:
df_final.head()