In [1]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy import stats

from multiprocessing import cpu_count

# Remove Constant Variables

Constant variables do not contain useful information when trying to predict or extract patterns from data. They should be removed to reduce computational and memory costs, and to help the model focus on the relevant variables.

## Load Data

In [2]:
# The bookings file is semicolon-delimited, so the separator must be given explicitly.
dat = pd.read_csv('../data/userbase.csv', delimiter=';')
dat

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.500000,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,,5
...,...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,I2,,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,,997
997,user4,2018-10-30,MAD,49.880001,online,I2,,998
998,user10,2018-11-02,CDG,152.960007,online,I2,,999


## Detect Constant Variables

Let's count the number of unique (distinct) values of each variable of the dataset.

In [3]:
# Distinct values per column; a missing value (NaN) counts as a value of its own.
dat.nunique(dropna=False)

user                10
booking_date         6
origin_airport      52
price              890
sales channel        3
company              1
user_country         2
index             1000
dtype: int64

`company` has only one value! This means it is a constant variable.

More generally, we can flag every constant variable in a dataset like this:

In [4]:
# A column is constant when it holds exactly one distinct value (NaN included).
dat.nunique(dropna=False).eq(1)

user              False
booking_date      False
origin_airport    False
price             False
sales channel     False
company            True
user_country      False
index             False
dtype: bool

In [5]:
# Collect the label of every column that holds a single distinct value.
unique_counts = dat.nunique(dropna=False)
constant_variables = [
    column for column, n_unique in unique_counts.items() if n_unique == 1
]
constant_variables

['company']

## Remove Constant Variables

Let's use the previously computed list with constant column names to remove these variables from our dataset.

In [6]:
# Build a new frame without the constant columns; `dat` itself is left untouched.
new_dat = dat.drop(columns=constant_variables)
new_dat

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,,1
1,user7,2018-11-01,DUB,147.500000,online,,2
2,user4,2018-11-02,TFS,24.049999,online,,3
3,user8,2018-10-29,MAD,59.709999,online,,4
4,user7,2018-11-01,LPA,37.299999,call center,,5
...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,,996
996,user10,2018-11-01,SVQ,34.610001,online,,997
997,user4,2018-10-30,MAD,49.880001,online,,998
998,user10,2018-11-02,CDG,152.960007,online,,999


## Define Custom Function

Let's create our own custom function to remove constant variables.

In [8]:
def remove_constant(X, verbose=True):
    """Return a copy of ``X`` without its constant columns.

    A column is treated as constant when it holds exactly one distinct
    value. Missing values count as a value of their own, so a column that is
    entirely NaN is constant, while a column mixing NaN with one other value
    is not.

    Columns are selected by position rather than by label, so when ``X`` has
    duplicated column names only the constant ones are dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data. It is not modified.
    verbose : bool, default True
        Whether to print the labels of the removed columns.

    Returns
    -------
    pandas.DataFrame
        New frame holding the non-constant columns in their original order.
    """
    # Positional boolean mask: one flag per column, independent of its label.
    is_constant = X.nunique(dropna=False).to_numpy() == 1
    constant_variables = X.columns[is_constant].tolist()
    if verbose:
        print('Variables ' + str(constant_variables) + ' have been removed from dataset.')
    # .copy() keeps the result independent of X, as the label-based drop did.
    return X.iloc[:, ~is_constant].copy()



In [9]:
# Run the helper on the raw frame and preview the first five rows of the result.
new_dat = remove_constant(X=dat)
new_dat.head(n=5)

Variables ['company'] have been removed from dataset.


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,,1
1,user7,2018-11-01,DUB,147.5,online,,2
2,user4,2018-11-02,TFS,24.049999,online,,3
3,user8,2018-10-29,MAD,59.709999,online,,4
4,user7,2018-11-01,LPA,37.299999,call center,,5
