In [1]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

# Remove Irrelevant Variables


When tackling supervised learning problems, one can find in the dataset variables that are not informative or related to the target we want to predict. These variables are not helpful for the model predictions and are usually called **irrelevant variables**.

In order to reduce computational and memory costs, as well as to decrease noise in the data, it is advisable to remove them from the dataset.

## Load Data

In [3]:
dat = pd.read_csv('../data/mtcars.csv', sep = ",")
dat.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## What is an Irrelevant Variable?

There are several ways of understanding what an irrelevant variable is. Nevertheless, normally the criteria to mark a variable as irrelevant is based on that variable not achieving a minimum threshold of correlation with the target we want to predict.



## Detect Irrelevant Variables

Based on our previous definition, we need first to compute correlations of each variable with respect to the target.


In [4]:
cor = dat.corr(numeric_only = True)
cor

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
mpg,1.0,-0.852162,-0.847551,-0.776168,0.681172,-0.867659,0.418684,0.664039,0.599832,0.480285,-0.550925
cyl,-0.852162,1.0,0.902033,0.832447,-0.699938,0.782496,-0.591242,-0.810812,-0.522607,-0.492687,0.526988
disp,-0.847551,0.902033,1.0,0.790949,-0.710214,0.88798,-0.433698,-0.710416,-0.591227,-0.555569,0.394977
hp,-0.776168,0.832447,0.790949,1.0,-0.448759,0.658748,-0.708223,-0.723097,-0.243204,-0.125704,0.749812
drat,0.681172,-0.699938,-0.710214,-0.448759,1.0,-0.712441,0.091205,0.440278,0.712711,0.69961,-0.09079
wt,-0.867659,0.782496,0.88798,0.658748,-0.712441,1.0,-0.174716,-0.554916,-0.692495,-0.583287,0.427606
qsec,0.418684,-0.591242,-0.433698,-0.708223,0.091205,-0.174716,1.0,0.744535,-0.229861,-0.212682,-0.656249
vs,0.664039,-0.810812,-0.710416,-0.723097,0.440278,-0.554916,0.744535,1.0,0.168345,0.206023,-0.569607
am,0.599832,-0.522607,-0.591227,-0.243204,0.712711,-0.692495,-0.229861,0.168345,1.0,0.794059,0.057534
gear,0.480285,-0.492687,-0.555569,-0.125704,0.69961,-0.583287,-0.212682,0.206023,0.794059,1.0,0.274073


Lets add a new random column to the dataset.

In [5]:
dat['noise'] = np.random.random(dat.shape[0])
dat

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,noise
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4,0.235964
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4,0.011495
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1,0.718024
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,0.526234
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,0.307828
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1,0.847708
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4,0.391585
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2,0.316219
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2,0.931933
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4,0.505859


And compute correlations again.

In [6]:
cor = dat.corr(numeric_only = True)
cor

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,noise
mpg,1.0,-0.852162,-0.847551,-0.776168,0.681172,-0.867659,0.418684,0.664039,0.599832,0.480285,-0.550925,-0.072452
cyl,-0.852162,1.0,0.902033,0.832447,-0.699938,0.782496,-0.591242,-0.810812,-0.522607,-0.492687,0.526988,-0.097097
disp,-0.847551,0.902033,1.0,0.790949,-0.710214,0.88798,-0.433698,-0.710416,-0.591227,-0.555569,0.394977,-0.076602
hp,-0.776168,0.832447,0.790949,1.0,-0.448759,0.658748,-0.708223,-0.723097,-0.243204,-0.125704,0.749812,-0.093968
drat,0.681172,-0.699938,-0.710214,-0.448759,1.0,-0.712441,0.091205,0.440278,0.712711,0.69961,-0.09079,-0.090676
wt,-0.867659,0.782496,0.88798,0.658748,-0.712441,1.0,-0.174716,-0.554916,-0.692495,-0.583287,0.427606,0.030196
qsec,0.418684,-0.591242,-0.433698,-0.708223,0.091205,-0.174716,1.0,0.744535,-0.229861,-0.212682,-0.656249,0.281763
vs,0.664039,-0.810812,-0.710416,-0.723097,0.440278,-0.554916,0.744535,1.0,0.168345,0.206023,-0.569607,0.246177
am,0.599832,-0.522607,-0.591227,-0.243204,0.712711,-0.692495,-0.229861,0.168345,1.0,0.794059,0.057534,-0.163588
gear,0.480285,-0.492687,-0.555569,-0.125704,0.69961,-0.583287,-0.212682,0.206023,0.794059,1.0,0.274073,0.014629


Let's assume our target here is ant.

In [7]:
target = 'mpg'

Finally, let's define a minimum correlation threshold and detect the variables that fall below it.

In [None]:
threshold = FILL
irrelevant_variables = FILL
irrelevant_variables

['noise']

## Define Custom Function

In [None]:
def irrelevant_variables(X, target, threshold = 0.05):
    FILL

In [None]:
dat_new = irrelevant_variables(dat, 'mpg')
dat_new

Variables ['noise'] have been removed from dataset.


Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [None]:
dat_new = irrelevant_variables(dat, 'mpg', threshold = 0.5)
dat_new

Variables ['qsec', 'gear', 'noise'] have been removed from dataset.


Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,vs,am,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,0,1,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,0,1,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,1,1,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,1,0,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,0,0,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,1,0,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,0,0,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,1,0,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,1,0,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,1,0,4
