In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from importlib import reload

pd.set_option('display.max_columns', None)

# Put the local feature_engine checkout on sys.path so the two project
# modules imported below can be found.
# NOTE(review): this is an absolute, machine-specific Windows path; it will
# need changing to run the notebook on another machine.
import sys
sys.path.insert(1,'C:/Users/Analytics10/LV_Projects/feature_engine/feature_engine')

# NOTE(review): star import -- the names used later without an explicit import
# (na_capturer, CategoricalImputer, RandomSampleImputer) presumably come from here.
from missing_data_imputation import *
from categorical_encoder import CategoricalEncoder, RareLabelEncoder

In [2]:
# Load the Titanic passenger table from the working directory and preview the first rows.
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Adds a binary missing-value indicator column for each variable with a high
# number of missing observations (Age_na and Cabin_na in this dataset).

na_captures = na_capturer()
data = na_captures.fit_transform(data)

In [4]:
# removes NA in categorical variables (replaces them with "Missing")
cat_imputer = CategoricalImputer()
data = cat_imputer.fit_transform(data)

In [5]:
# replaces NA by random sampling in the remaining numerical variables that
# still contain NA (Age in this case)

random_imputer = RandomSampleImputer()
random_imputer.fit(data)
# random_state is passed at transform time, presumably so the sampled values are reproducible
data = random_imputer.transform(data, random_state=0)
data.isnull().sum() # check that no NA values remain

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Age_na         0
Cabin_na       0
dtype: int64

In [6]:
# Keep only the first character of each cabin code
# (values imputed as "Missing" therefore become 'M').

data['Cabin'] = data['Cabin'].astype(str).str.get(0)
data['Cabin'].unique()

array(['M', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

### Rare value encoder

In [7]:
# Relative frequency of each Cabin letter (count divided by the number of rows).
# The builtin float is used instead of the np.float alias, which NumPy
# deprecated in 1.20 and removed in 1.24 (it now raises AttributeError).
data['Cabin'].value_counts() / float(len(data))

M    0.771044
C    0.066218
B    0.052750
D    0.037037
E    0.035915
A    0.016835
F    0.014590
G    0.004489
T    0.001122
Name: Cabin, dtype: float64

In [8]:
# Fit the rare-label encoder on Cabin with a 5% frequency threshold (tol).
# n_categories appears to be the minimum cardinality a variable needs in order
# to be evaluated -- Cabin has 9 distinct labels, so n_categories=5 accepts it.
rare_encoder = RareLabelEncoder(tol = 0.05, n_categories=5)
rare_encoder.fit(data, variables = ['Cabin'])

RareLabelEncoder(n_categories=5, tol=0.05)

In [9]:
# Apply the fitted encoder; in the result the Cabin labels below the 5%
# threshold (D, E, A, F, G, T) are collapsed into a single 'Rare' label.
test = rare_encoder.transform(data)
test.Cabin.value_counts()

M       687
Rare     98
C        59
B        47
Name: Cabin, dtype: int64

In [13]:
# Fit without an explicit variable list so the encoder selects the variables itself.
# With n_categories=10 only Name and Ticket end up selected (see variables_ in the next cell).
rare_encoder = RareLabelEncoder(tol = 0.05, n_categories=10)
rare_encoder.fit(data)

['Sex', 'Cabin', 'Embarked']




RareLabelEncoder(n_categories=10, tol=0.05)

In [14]:
# Variables the encoder selected during fit
rare_encoder.variables_

['Name', 'Ticket']

In [10]:
# Expected failure: fit raises ValueError ("No variables to evaluate"),
# presumably because Cabin has only 9 distinct labels, fewer than n_categories=10.
rare_encoder = RareLabelEncoder(tol = 0.05, n_categories=10)
rare_encoder.fit(data, variables = ['Cabin'])

['Cabin']




ValueError: No variables to evaluate, check the cardinality of the variables and change the n_categories argument accordingly

In [11]:
# Expected failure: the preceding fit raised, so the encoder is still unfitted
# and transform raises NotFittedError.
rare_encoder.transform(data)

NotFittedError: This RareLabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

### Count encoding

In [12]:
# Expected failure: an unsupported encoding_method is rejected with ValueError at construction time.
CategoricalEncoder(encoding_method='other')

ValueError: encoding_method takes only values 'count', 'frequency','ordinal','mean','ratio','woe'

In [10]:
# Count encoding: each category is replaced by the number of rows in which it occurs.
count_encoder = CategoricalEncoder(encoding_method='count')
count_encoder.fit(data, variables = ['Pclass', 'Sex', 'Cabin'])



CategoricalEncoder(encoding_method='count', tol=0.0001)

In [11]:
# Variables the encoder was fitted on
count_encoder.variables_

['Pclass', 'Sex', 'Cabin']

In [12]:
# Learned mapping per variable: category -> row count
count_encoder.encoder_dict_

{'Cabin': {'A': 15,
  'B': 47,
  'C': 59,
  'D': 33,
  'E': 32,
  'F': 13,
  'G': 4,
  'M': 687,
  'T': 1},
 'Pclass': {1: 216, 2: 184, 3: 491},
 'Sex': {'female': 314, 'male': 577}}

In [13]:
# Encode the full dataframe and list the values Cabin takes after encoding.
d1 = count_encoder.transform(data)
d1['Cabin'].unique()

array([687,  59,  32,   4,  33,  15,  47,  13,   1], dtype=int64)

In [14]:
# Expected failure: transform receives only a subset of the columns seen at
# fit time, so it raises ValueError about the differing number of columns.
count_encoder.transform(data[['Pclass', 'Sex', 'Cabin']])

ValueError: Number of columns in dataset is different from training set  used to fit the encoder

### Frequency encoding

In [15]:
# Frequency encoding: each category is replaced by its share of the rows.
# NOTE(review): the variable keeps the name count_encoder although it now holds a frequency encoder.
count_encoder = CategoricalEncoder(encoding_method='frequency')
count_encoder.fit(data, variables = ['Pclass', 'Sex', 'Cabin'])



CategoricalEncoder(encoding_method='frequency', tol=0.0001)

In [16]:
# Learned mapping per variable: category -> relative frequency
count_encoder.encoder_dict_

{'Cabin': {'A': 0.016835016835016835,
  'B': 0.052749719416386086,
  'C': 0.066217732884399555,
  'D': 0.037037037037037035,
  'E': 0.035914702581369251,
  'F': 0.014590347923681257,
  'G': 0.0044893378226711564,
  'M': 0.77104377104377109,
  'T': 0.0011223344556677891},
 'Pclass': {1: 0.24242424242424243,
  2: 0.20650953984287318,
  3: 0.55106621773288444},
 'Sex': {'female': 0.35241301907968575, 'male': 0.6475869809203143}}

In [17]:
# Encode the full dataframe and list the frequency values Cabin now takes.
d1 = count_encoder.transform(data)
d1['Cabin'].unique()

array([ 0.77104377,  0.06621773,  0.0359147 ,  0.00448934,  0.03703704,
        0.01683502,  0.05274972,  0.01459035,  0.00112233])

### Mean encoding

In [18]:
# Mean encoding: each category is replaced by the mean of the target
# (Survived) over the rows in that category, so the target must be passed as y.
count_encoder = CategoricalEncoder(encoding_method='mean')
count_encoder.fit(X=data, y=data.Survived, variables = ['Pclass', 'Sex', 'Cabin'])



CategoricalEncoder(encoding_method='mean', tol=0.0001)

In [19]:
# Learned mapping per variable: category -> mean of Survived
count_encoder.encoder_dict_

{'Cabin': {'A': 0.46666666666666667,
  'B': 0.74468085106382975,
  'C': 0.59322033898305082,
  'D': 0.75757575757575757,
  'E': 0.75,
  'F': 0.61538461538461542,
  'G': 0.5,
  'M': 0.29985443959243085,
  'T': 0.0},
 'Pclass': {1: 0.62962962962962965,
  2: 0.47282608695652173,
  3: 0.24236252545824846},
 'Sex': {'female': 0.7420382165605095, 'male': 0.18890814558058924}}

In [20]:
# Encode the full dataframe and list the target-mean values Cabin now takes.
d1 = count_encoder.transform(data)
d1['Cabin'].unique()

array([ 0.29985444,  0.59322034,  0.75      ,  0.5       ,  0.75757576,
        0.46666667,  0.74468085,  0.61538462,  0.        ])

In [21]:
# Same check for Sex: one encoded value per original category
d1['Sex'].unique()

array([ 0.18890815,  0.74203822])

### Ratio encoding

In [22]:
# Ratio encoding: judging by the fitted values, each category maps to
# p / (1 - p), where p is the target mean of that category
# (e.g. Cabin E: 0.75 / 0.25 = 3.0).
count_encoder = CategoricalEncoder(encoding_method='ratio')
count_encoder.fit(X=data, y=data.Survived, variables = ['Pclass', 'Sex', 'Cabin'])



CategoricalEncoder(encoding_method='ratio', tol=0.0001)

In [23]:
# Learned mapping per variable: category -> ratio value
count_encoder.encoder_dict_

{'Cabin': {'A': 0.875,
  'B': 2.9166666666666661,
  'C': 1.4583333333333333,
  'D': 3.125,
  'E': 3.0,
  'F': 1.6000000000000003,
  'G': 1.0,
  'M': 0.42827442827442824,
  'T': 0.0},
 'Pclass': {1: 1.7000000000000002,
  2: 0.89690721649484528,
  3: 0.31989247311827956},
 'Sex': {'female': 2.8765432098765422, 'male': 0.23290598290598288}}

In [24]:
# Encode the full dataframe and list the ratio values Cabin now takes.
d1 = count_encoder.transform(data)
d1['Cabin'].unique()

array([ 0.42827443,  1.45833333,  3.        ,  1.        ,  3.125     ,
        0.875     ,  2.91666667,  1.6       ,  0.        ])

### Weight of evidence (WoE) encoding

In [25]:
# WoE encoding: judging by the fitted values, each category maps to the
# natural log of its 'ratio' value (e.g. Cabin E: ln(3.0) ~= 1.0986).
# Cabin T has a ratio of 0; its encoded value (~ -9.21) equals ln(0.0001), so
# tol presumably stands in for zero before the log is taken -- TODO confirm.
count_encoder = CategoricalEncoder(encoding_method='woe')
count_encoder.fit(X=data, y=data.Survived, variables = ['Pclass', 'Sex', 'Cabin'])



CategoricalEncoder(encoding_method='woe', tol=0.0001)

In [26]:
# Learned mapping per variable: category -> WoE value
count_encoder.encoder_dict_

{'Cabin': {'A': -0.13353139262452263,
  'B': 1.0704414117014132,
  'C': 0.37729423114146798,
  'D': 1.1394342831883648,
  'E': 1.0986122886681098,
  'F': 0.47000362924573574,
  'G': 0.0,
  'M': -0.84799110131618016,
  'T': -9.2103403719761818},
 'Pclass': {1: 0.53062825106217049,
  2: -0.10880285984879919,
  3: -1.1397703611616172},
 'Sex': {'female': 1.0565892988932615, 'male': -1.4571204136885032}}

In [27]:
# Encode the full dataframe and list the WoE values Cabin now takes.
d1 = count_encoder.transform(data)
d1['Cabin'].unique()

array([-0.8479911 ,  0.37729423,  1.09861229,  0.        ,  1.13943428,
       -0.13353139,  1.07044141,  0.47000363, -9.21034037])

### Ordinal encoding

In [28]:
# Ordinal encoding: judging by the fitted values, categories are numbered
# from 0 upwards in ascending order of their target mean
# (e.g. Pclass 3 -> 0, 2 -> 1, 1 -> 2).
count_encoder = CategoricalEncoder(encoding_method='ordinal')
count_encoder.fit(X=data, y=data.Survived, variables = ['Pclass', 'Sex', 'Cabin'])



CategoricalEncoder(encoding_method='ordinal', tol=0.0001)

In [29]:
# Learned mapping per variable: category -> integer rank
count_encoder.encoder_dict_

{'Cabin': {'A': 2,
  'B': 6,
  'C': 4,
  'D': 8,
  'E': 7,
  'F': 5,
  'G': 3,
  'M': 1,
  'T': 0},
 'Pclass': {1: 2, 2: 1, 3: 0},
 'Sex': {'female': 1, 'male': 0}}

In [30]:
# Fit without an explicit variable list so the encoder selects the variables itself.
count_encoder = CategoricalEncoder(encoding_method='ordinal')
count_encoder.fit(X=data, y=data.Survived)

CategoricalEncoder(encoding_method='ordinal', tol=0.0001)

In [31]:
# Variables selected automatically during fit (Name, Sex, Ticket, Cabin and Embarked here)
count_encoder.variables_

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

- Add rare label encoder