In [1]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy import stats

from multiprocessing import cpu_count

# Remove Non-informed Variables

Most machine learning models do not allow for missing values in the input data.

For categorical variables, missing values can be recoded as a new category (for example, 'Missing'), which solves the issue.

For numerical variables, missing values can be filled in using different approaches (see notebook 5). However, if a variable has too many missing values, these approaches are no longer useful, and the best option is to remove the variable from the dataset entirely.

Let's see here how to select variables with too many missing values and remove them from our dataset.

## Load Data

In [2]:
# Colab-specific helper module: this cell only runs inside Google Colab.
from google.colab import files
# Upload local file(s) into the session. The result is kept in `uploaded`, which the
# loading cell below indexes by file name ('userbase.csv') to get the file content.
uploaded = files.upload()

Saving userbase.csv to userbase.csv


In [3]:
import io
# Wrap the uploaded content of 'userbase.csv' in an in-memory binary buffer and
# parse it as a semicolon-separated file.
dat = pd.read_csv(io.BytesIO(uploaded['userbase.csv']), sep = ";")
# Show the loaded DataFrame as the cell output.
dat

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.500000,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,,5
...,...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,I2,,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,,997
997,user4,2018-10-30,MAD,49.880001,online,I2,,998
998,user10,2018-11-02,CDG,152.960007,online,I2,,999


## Detect Missing Values

Let's compute the percentage of missing values for each variable in the dataset.

In [4]:
# Percentage (0-100) of missing values per column: missing count times 100,
# divided by the number of rows.
100 * dat.isna().sum() / len(dat)

user               0.0
booking_date       0.0
origin_airport     0.0
price              1.4
sales channel      0.0
company            0.0
user_country      99.0
index              0.0
dtype: float64

user_country has 99% missing values! It may be better to remove it from the dataset...

## Detect Non-informed Variables

More generally, we can detect non-informed variables in a dataset by comparing each variable's percentage of missing values against a threshold:

In [5]:
# Percentage of missing values (on the same 0-100 scale as the per-column percentages
# computed above) at or above which a variable is treated as non-informed.
missing_values_threshold = 50 # You should try different values on a real problem.

In [6]:
# Keep the names of the columns whose percentage of missing values reaches
# the threshold (the comparison is inclusive).
non_informed_variables = dat.columns[
    (100 * dat.isna().sum() / len(dat) >= missing_values_threshold).to_numpy()
].tolist()
non_informed_variables

['user_country']

## Remove Non-informed Variables

Let's use the previously computed list with non-informed column names to remove these variables from our dataset.

In [7]:
# Build a DataFrame without the non-informed columns; the result is bound to a
# new name, so `dat` keeps all of its columns.
new_dat = dat.drop(columns = non_informed_variables)
new_dat

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,index
0,user5,2018-11-01,MAD,58.200001,online,I2,1
1,user7,2018-11-01,DUB,147.500000,online,I2,2
2,user4,2018-11-02,TFS,24.049999,online,I2,3
3,user8,2018-10-29,MAD,59.709999,online,I2,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,5
...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,I2,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,997
997,user4,2018-10-30,MAD,49.880001,online,I2,998
998,user10,2018-11-02,CDG,152.960007,online,I2,999


## Define Custom Function

Let's create our own custom function to remove non-informed variables.

In [8]:
def count_nas(x):
    """Return the percentage (0-100) of missing entries in ``x``.

    ``x`` must provide ``isna()`` and ``len()`` (for example, a pandas Series
    holding one column of a DataFrame).
    """
    n_missing = np.sum(x.isna())
    n_total = len(x)
    return 100 * n_missing / n_total

In [9]:
def remove_non_informed(X, missing_values_threshold = 50, verbose = True):
    """Drop the columns of ``X`` that have too many missing values.

    Parameters
    ----------
    X : pandas.DataFrame
        Input dataset. It is not modified; a new DataFrame is returned.
    missing_values_threshold : float, default 50
        Percentage of missing values (0-100 scale). A column is removed when its
        percentage of missing values is greater than or equal to this value.
    verbose : bool, default True
        When True, print the list of removed column names.

    Returns
    -------
    pandas.DataFrame
        ``X`` without the columns that reached the threshold.
    """
    # Vectorized per-column percentage of missing values (no row-wise apply/lambda).
    # With zero rows the ratio is NaN, which never satisfies `>=`, so nothing is dropped.
    missing_pct = 100 * X.isna().sum() / len(X)
    # Index the column labels with a plain boolean array rather than a pandas Series.
    to_remove = (missing_pct >= missing_values_threshold).to_numpy()
    non_informed_variables = X.columns[to_remove].tolist()
    X = X.drop(columns = non_informed_variables)
    if verbose:
        print('Variables ' + str(non_informed_variables) + ' have been removed from dataset.')
    return X



In [10]:
# Apply the custom function with its default threshold (50% missing values).
new_dat = remove_non_informed(dat)
# Preview the first rows of the result.
new_dat.head()

Variables ['user_country'] have been removed from dataset.


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,index
0,user5,2018-11-01,MAD,58.200001,online,I2,1
1,user7,2018-11-01,DUB,147.5,online,I2,2
2,user4,2018-11-02,TFS,24.049999,online,I2,3
3,user8,2018-10-29,MAD,59.709999,online,I2,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,5


In [11]:
# Same call with a stricter threshold: user_country (99% missing) is below 99.9,
# so no variable is removed this time.
new_dat = remove_non_informed(dat, missing_values_threshold = 99.9)
# Preview the first rows of the result.
new_dat.head()

Variables [] have been removed from dataset.


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.5,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,,5
