In [1]:
# Load Libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

from sklearn.model_selection import train_test_split
from scipy import stats

from multiprocessing import cpu_count

# Group Categories

When doing one-hot encoding you can find yourself with a dataset with too many categories, which results in a pre-processed data with an unmanageable number of variables.

To avoid this, grouping some of the existing categories can be useful.

## Load Data

In [2]:
dat = pd.read_csv('../data/userbase.csv', sep = ";")
dat

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.500000,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,,5
...,...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,I2,,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,,997
997,user4,2018-10-30,MAD,49.880001,online,I2,,998
998,user10,2018-11-02,CDG,152.960007,online,I2,,999


## Check number of categories

Let's count the number of categories of each categorical variable of the dataset.

In [3]:
categorical_variables = dat.columns[dat.dtypes == object].tolist()
categorical_variables

['user',
 'booking_date',
 'origin_airport',
 'sales channel',
 'company',
 'user_country']

In [4]:
dat[categorical_variables].apply(lambda x: len(x.unique()))

user              10
booking_date       6
origin_airport    52
sales channel      3
company            1
user_country       2
dtype: int64

origin_airport has 53 categories. This could be too much...

## Group less frequent categories

One approach to group categories is to put all the least frequent values on a miscellaneous or OTHERS group. For that, we have to first check the frequency of each category.

In [5]:
frequencies = 100*dat['origin_airport'].value_counts()/dat.shape[0]
frequencies

origin_airport
MAD    50.2
PMI     5.3
TFN     5.2
LPA     4.0
SVQ     3.4
SCQ     3.1
TXL     2.5
AMS     2.1
TFS     2.0
LGW     1.9
AGP     1.7
VGO     1.6
ALC     1.4
IBZ     1.2
CDG     1.1
DUB     1.1
ACE     1.0
SPC     1.0
FUE     0.9
MAH     0.8
CPH     0.8
NAP     0.7
NTE     0.6
NCE     0.5
DUS     0.5
MAN     0.5
XRY     0.5
LYS     0.5
STR     0.4
JMK     0.4
TLS     0.4
FRA     0.2
GIG     0.2
LGA     0.2
CAG     0.2
BRU     0.2
RNS     0.2
SDU     0.1
KRK     0.1
CWB     0.1
BPS     0.1
CMH     0.1
GRX     0.1
OVD     0.1
RIX     0.1
MIA     0.1
BHX     0.1
CLT     0.1
PHL     0.1
GRU     0.1
CGR     0.1
BOD     0.1
Name: count, dtype: float64

Now let's check which categories do not surpass a certain frequency threshold.

In [6]:
threshold = 0.5
least_frequent = frequencies.index[frequencies < threshold].to_list()
set(least_frequent) - untouchable
least_frequent

NameError: name 'untouchable' is not defined

Now let's goup these categories into *OTHERS*.

In [7]:
dat_new = dat.copy()
dat_new['origin_airport'][np.isin(dat_new['origin_airport'] , least_frequent)] = 'OTHERS'
dat_new

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat_new['origin_airport'][np.isin(dat_new['origin_airport'] , least_frequent)] = 'OTHERS'


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.500000,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,call center,I2,,5
...,...,...,...,...,...,...,...,...
995,user2,2018-11-01,OTHERS,99.849998,online,I2,,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,,997
997,user4,2018-10-30,MAD,49.880001,online,I2,,998
998,user10,2018-11-02,CDG,152.960007,online,I2,,999


In [8]:
100*dat_new['origin_airport'].value_counts()/dat.shape[0]

origin_airport
MAD       50.2
PMI        5.3
TFN        5.2
LPA        4.0
OTHERS     3.9
SVQ        3.4
SCQ        3.1
TXL        2.5
AMS        2.1
TFS        2.0
LGW        1.9
AGP        1.7
VGO        1.6
ALC        1.4
IBZ        1.2
CDG        1.1
DUB        1.1
ACE        1.0
SPC        1.0
FUE        0.9
MAH        0.8
CPH        0.8
NAP        0.7
NTE        0.6
LYS        0.5
XRY        0.5
MAN        0.5
DUS        0.5
NCE        0.5
Name: count, dtype: float64

Putting all together.

**Exercise**: Implement the code to be able to carry out the previous computation for all categorical variables in dat. Then use use to implement your own custom function to group categories.

## Define Custom Function

Let's create our own custom function to group categories.

**EXERCISE**

In [9]:
def group_column(X, threshold = 0.1):
    frequencies = 100*X.value_counts()/dat.shape[0]
    least_frequent = frequencies.index[frequencies < threshold].to_list()
    X[np.isin(X , least_frequent)] = 'OTHERS'
    return X

In [10]:
group_column(dat['sales channel'], threshold = 10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[np.isin(X , least_frequent)] = 'OTHERS'


0      online
1      online
2      online
3      online
4      OTHERS
        ...  
995    online
996    online
997    online
998    online
999    online
Name: sales channel, Length: 1000, dtype: object

In [11]:
def group_categories(X, threshold = 0.1):
    categorical_variables = dat.columns[np.logical_or(dat.dtypes == object, dat.dtypes == str)].tolist()
    for c in categorical_variables:
        X[c] = group_column(X[c],
                           threshold = threshold)
    return X


In [12]:
dat_new = group_categories(dat, threshold = 0.2)
dat_new

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[np.isin(X , least_frequent)] = 'OTHERS'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[np.isin(X , least_frequent)] = 'OTHERS'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[np.isin(X , least_frequent)] = 'OTHERS'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[np.isin(X , least_frequent)] = 'OTHERS'


Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.500000,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,OTHERS,I2,,5
...,...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,I2,,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,,997
997,user4,2018-10-30,MAD,49.880001,online,I2,,998
998,user10,2018-11-02,CDG,152.960007,online,I2,,999


In [13]:
def group_categories_pro(X, threshold = 0.1):
    categorical_variables = dat.columns[np.logical_or(dat.dtypes == object, dat.dtypes == str)].tolist()
    X[categorical_variables] = X[categorical_variables].apply(group_column)
    return X


In [14]:
dat_new = group_categories_pro(dat, threshold = 1)
dat_new

Unnamed: 0,user,booking_date,origin_airport,price,sales channel,company,user_country,index
0,user5,2018-11-01,MAD,58.200001,online,I2,,1
1,user7,2018-11-01,DUB,147.500000,online,I2,,2
2,user4,2018-11-02,TFS,24.049999,online,I2,,3
3,user8,2018-10-29,MAD,59.709999,online,I2,,4
4,user7,2018-11-01,LPA,37.299999,OTHERS,I2,,5
...,...,...,...,...,...,...,...,...
995,user2,2018-11-01,JMK,99.849998,online,I2,,996
996,user10,2018-11-01,SVQ,34.610001,online,I2,,997
997,user4,2018-10-30,MAD,49.880001,online,I2,,998
998,user10,2018-11-02,CDG,152.960007,online,I2,,999
