In [None]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

# Remove Redundant Variables


When tackling supervised learning problems, the dataset often contains variables that carry the same, or almost the same, information as other variables in the dataset. Such variables do not help the model's predictions and are usually called **redundant variables**.

In order to reduce computational and memory costs, as well as to prevent the model from giving too much importance to this redundant information, it is advisable to remove them from the dataset.

## Load Data

In [None]:
# Open the Colab file picker. As used in the next cell, `uploaded` is indexed
# by file name and yields the raw bytes of each uploaded file.
from google.colab import files
uploaded = files.upload()

In [None]:
# Parse the uploaded bytes as a semicolon-separated CSV and preview the first rows.
# NOTE(review): the key is 'mtcars.csv', yet the preview below shows flight-booking
# columns (user, booking_date, price, ...) - confirm this is the intended file name.
import io
dat = pd.read_csv(io.BytesIO(uploaded['mtcars.csv']), sep = ";")
dat.head()

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,ant,airline
0,user5,01/11/2018,MAD,,online,,i2
1,user7,01/11/2018,DUB,147.5,online,38.0,i2
2,user4,02/11/2018,TFS,24.049999,online,19.0,i2
3,user8,29/10/2018,MAD,59.709999,online,8.0,i2
4,user7,01/11/2018,,37.299999,call center,4.0,i2


## What is a Redundant Variable?

There are several ways to define a redundant variable. Most commonly, a variable is marked as redundant when its absolute correlation with another variable in the dataset exceeds a maximum threshold.

## Detect Redundant Variables

Based on the previous definition, we first need to compute the correlation of each variable with every other variable.


In [None]:
# Pairwise correlation between the numeric columns only; the text columns
# (user, airline, ...) cannot be correlated and are left out.
cor = dat.corr(numeric_only = True)
cor

Unnamed: 0,price,ant
price,1.0,-0.012836
ant,-0.012836,1.0


Finally, let's define a maximum correlation threshold and detect the variables that fall above it.

In [None]:
# Cut-off on the absolute correlation: a True entry means the two variables
# are correlated strictly above the threshold (the sign is irrelevant).
threshold = 0.99
cor = cor.abs().gt(threshold)
cor

Unnamed: 0,price,ant,modified_price
price,True,False,True
ant,False,True,False
modified_price,True,False,True


In [None]:
# Greedy scan over the boolean matrix `cor`: the first variable of every
# highly correlated group is kept, the later ones are reported as redundant.
# `cor` is only read, never modified, so this cell can be re-run safely.
redundant_vars = []
for c in cor.columns:
    # A variable already marked as redundant must not eliminate others
    if c in redundant_vars:
        continue

    # Variables flagged as too correlated with `c`
    for other in cor.index[cor[c].to_numpy()]:
        # Ignore the correlation with itself and anything already marked
        if other != c and other not in redundant_vars:
            redundant_vars.append(other)

redundant_vars

['modified_price']

Putting it all together.

In [None]:
# Boolean matrix: True where the absolute correlation between two numeric
# columns exceeds `threshold`. `numeric_only = True` skips the text columns.
cor = dat.corr(numeric_only = True)
cor = np.abs(cor) > threshold

# Greedy scan: keep the first variable of every highly correlated group and
# report the later ones as redundant. `cor` is only read, never modified.
redundant_vars = []
for c in cor.columns:
    # A variable already marked as redundant must not eliminate others
    if c in redundant_vars:
        continue

    # Variables flagged as too correlated with `c`
    for other in cor.index[cor[c].to_numpy()]:
        # Ignore the correlation with itself and anything already marked
        if other != c and other not in redundant_vars:
            redundant_vars.append(other)

redundant_vars

['modified_price']

## Define Custom Function

In [None]:
def redundant_variables(X, threshold = 0.99):
    """Drop the columns of ``X`` that are highly correlated with an earlier column.

    The numeric columns of ``X`` are scanned in order. The first column of
    every highly correlated group is kept; each later column whose absolute
    correlation with a kept column is strictly greater than ``threshold`` is
    removed.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data. Non-numeric columns are ignored when computing the
        correlations and are never removed.
    threshold : float, default 0.99
        Absolute correlation above which a column is considered redundant.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns; the frame passed in is
        left untouched. The names of the removed columns are also printed.
    """
    # Correlate the frame that was passed in (not a notebook-level variable)
    # and restrict to numeric columns so text columns do not raise.
    cor = X.corr(numeric_only = True).abs() > threshold

    redundant_vars = []
    for c in cor.columns:
        # A column already marked as redundant must not eliminate others
        if c in redundant_vars:
            continue

        # Columns flagged as too correlated with `c`
        for other in cor.index[cor[c].to_numpy()]:
            # Ignore the self-correlation and anything already marked, so a
            # column related to several kept columns is only listed once
            if other != c and other not in redundant_vars:
                redundant_vars.append(other)

    X = X.drop(columns = redundant_vars)
    print('Variables ' + str(redundant_vars) + ' have been removed from dataset.')
    return X

In [None]:
# Apply the function with its default threshold (0.99) and display the result
dat_new = redundant_variables(dat)
dat_new

Variables ['modified_price'] have been removed from dataset.


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,ant,airline
0,user5,01/11/2018,MAD,,online,,i2
1,user7,01/11/2018,DUB,147.500000,online,38.0,i2
2,user4,02/11/2018,TFS,24.049999,online,19.0,i2
3,user8,29/10/2018,MAD,59.709999,online,8.0,i2
4,user7,01/11/2018,,37.299999,call center,4.0,i2
...,...,...,...,...,...,...,...
995,user2,01/11/2018,JMK,,online,29.0,i2
996,user10,01/11/2018,SVQ,,online,39.0,i2
997,user4,30/10/2018,MAD,,online,5.0,i2
998,user10,02/11/2018,CDG,,online,4.0,i2


In [None]:
# Same call with a deliberately low threshold (0.01): even a weak correlation
# now counts as redundancy, so `ant` is removed as well (see the output below)
dat_new = redundant_variables(dat, threshold = 0.01)
dat_new

Variables ['ant', 'modified_price'] have been removed from dataset.


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,airline
0,user5,01/11/2018,MAD,,online,i2
1,user7,01/11/2018,DUB,147.500000,online,i2
2,user4,02/11/2018,TFS,24.049999,online,i2
3,user8,29/10/2018,MAD,59.709999,online,i2
4,user7,01/11/2018,,37.299999,call center,i2
...,...,...,...,...,...,...
995,user2,01/11/2018,JMK,,online,i2
996,user10,01/11/2018,SVQ,,online,i2
997,user4,30/10/2018,MAD,,online,i2
998,user10,02/11/2018,CDG,,online,i2
