# Outlier removers

This notebook shows examples of how to use the different outlier removers available in Feature-engine.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from feature_engine import outlier_removers as outr

In [2]:
# Load titanic dataset from OpenML

def load_titanic(url='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic dataset and apply light cleaning.

    Parameters
    ----------
    url : str or file-like, optional
        Location of the CSV file, passed unchanged to ``pd.read_csv``.
        Defaults to the OpenML copy of the dataset.

    Returns
    -------
    pandas.DataFrame
        The data with '?' entries replaced by NaN, ``cabin`` reduced to its
        first character, ``pclass`` cast to object, missing ``embarked``
        values filled with 'C', and ``fare`` / ``age`` cast to float with
        missing values filled by the column median.
    """
    data = pd.read_csv(url)
    # '?' is treated as the missing-value marker
    data = data.replace('?', np.nan)
    # keep only the first character of the cabin string
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    # Assign the filled column back rather than calling
    # data[col].fillna(..., inplace=True): the in-place form is chained
    # assignment, which warns in pandas 2.x and does not update `data`
    # when copy-on-write is enabled.
    data['embarked'] = data['embarked'].fillna('C')
    for col in ('fare', 'age'):
        numeric = data[col].astype('float')
        data[col] = numeric.fillna(numeric.median())
    return data

In [3]:
# fetch and clean the Titanic data, then preview the first rows
data = load_titanic()
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C,S,,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Split into a training set and a test set (30% held out).
# The target ('survived') plus 'name' and 'ticket' are left out of the predictors.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((916, 11), (393, 11))

## Winsorizer

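The Winsorizer caps variables at maximum and/or minimum values that it learns from the training data. With `distribution='gaussian'` the caps are the mean plus or minus `fold` times the standard deviation; with `distribution='skewed'` they are derived from the inter-quartile range (the 25th percentile minus, or the 75th percentile plus, `fold` times the IQR). The `tail` parameter selects whether the right tail, the left tail, or both ends of the distribution are capped.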

In [5]:
# largest age and fare in the full Titanic dataset, before any capping
data['age'].max(), data['fare'].max()

(80.0, 512.3292)

### Gaussian distribution and right tail

In [6]:
# cap only the upper end of age and fare; the caps are learned from the
# training set using the gaussian rule with fold=3
capper = outr.Winsorizer(
    variables=['age', 'fare'],
    distribution='gaussian',
    tail='right',
    fold=3,
)

capper.fit(X_train)

Winsorizer(distribution='gaussian', fold=3, tail='right',
           variables=['age', 'fare'])

In [7]:
# upper caps learned during fit, keyed by variable name
capper.right_tail_caps_

{'age': 67.49048447470311, 'fare': 174.78162171790427}

In [8]:
# no lower caps were learned because tail='right' was selected,
# so this dictionary is empty
capper.left_tail_caps_

{}

In [9]:
# apply the caps learned on the training set to both partitions
train_t, test_t = capper.transform(X_train), capper.transform(X_test)

# maximum age and fare in the transformed training set
train_t['age'].max(), train_t['fare'].max()

(67.49048447470311, 174.78162171790427)

### Gaussian distribution, both tails

In [10]:
# cap both ends of fare using the gaussian rule with fold=1;
# a single variable is passed here as a plain string
winsor = outr.Winsorizer(
    variables='fare',
    distribution='gaussian',
    tail='both',
    fold=1,
)

winsor.fit(X_train)

Winsorizer(distribution='gaussian', fold=1, tail='both', variables=['fare'])

In [11]:
# lower cap learned for fare
winsor.left_tail_caps_

{'fare': -14.884473469286917}

In [12]:
# upper cap learned for fare
winsor.right_tail_caps_

{'fare': 79.94857412430875}

In [13]:
# apply the learned fare caps to both partitions
train_t, test_t = winsor.transform(X_train), winsor.transform(X_test)

# largest and smallest fare in the transformed training set
train_t['fare'].max(), train_t['fare'].min()

(79.94857412430875, 0.0)

### Skewed distribution, left tail

In [14]:
# cap only the lower end of age and fare, using the skewed-distribution
# rule with fold=1
winsor = outr.Winsorizer(
    variables=['age', 'fare'],
    distribution='skewed',
    tail='left',
    fold=1,
)

winsor.fit(X_train)

Winsorizer(distribution='skewed', fold=1, tail='left',
           variables=['age', 'fare'])

In [15]:
# no upper caps were learned because tail='left' was selected,
# so the right-tail dictionary is empty
winsor.right_tail_caps_

{}

In [16]:
# lower caps learned during fit, keyed by variable name
winsor.left_tail_caps_

{'age': 11.0, 'fare': -15.483399999999996}

In [17]:
# apply the learned lower caps to both partitions
train_t, test_t = winsor.transform(X_train), winsor.transform(X_test)

# largest and smallest fare in the transformed training set; the lower cap
# learned for fare is negative, so the fare values shown here are not altered
train_t['fare'].max(), train_t['fare'].min()

(512.3292, 0.0)

## ArbitraryOutlierCapper

The ArbitraryOutlierCapper caps each variable at the minimum and/or maximum values supplied by the user.

In [18]:
# largest age and fare in the training set, before any capping
X_train['age'].max(), X_train['fare'].max()

(74.0, 512.3292)

In [19]:
# user-defined upper limits only: age is capped at 50 and fare at 200
capper = outr.ArbitraryOutlierCapper(
    min_capping_dict=None,
    max_capping_dict={'age': 50, 'fare': 200},
)

capper.fit(X_train)

ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200},
                       min_capping_dict=None)

In [20]:
# upper caps in use, keyed by variable name
capper.right_tail_caps_

{'age': 50, 'fare': 200}

In [21]:
# lower caps in use; min_capping_dict was None for this capper
capper.left_tail_caps_

{}

In [22]:
# apply the user-defined upper limits to both partitions
train_t, test_t = capper.transform(X_train), capper.transform(X_test)

# maximum fare and age in the transformed training set
train_t['fare'].max(), train_t['age'].max()

(200.0, 50.0)

### Minimum capping

In [23]:
# user-defined lower limits only: age is floored at 10 and fare at 100
capper = outr.ArbitraryOutlierCapper(
    min_capping_dict={'age': 10, 'fare': 100},
    max_capping_dict=None,
)

capper.fit(X_train)

ArbitraryOutlierCapper(max_capping_dict=None,
                       min_capping_dict={'age': 10, 'fare': 100})

In [24]:
# variables the capper acts on
capper.variables

['age', 'fare']

In [25]:
# upper caps in use; max_capping_dict was None for this capper
capper.right_tail_caps_

{}

In [26]:
# apply the user-defined lower limits to both partitions
train_t, test_t = capper.transform(X_train), capper.transform(X_test)

# minimum fare and age in the transformed training set
train_t['fare'].min(), train_t['age'].min()

(100.0, 10.0)

### Both ends capping

In [27]:
# user-defined limits at both ends: age kept within [10, 50], fare within [100, 200]
capper = outr.ArbitraryOutlierCapper(
    max_capping_dict={'age': 50, 'fare': 200},
    min_capping_dict={'age': 10, 'fare': 100},
)

capper.fit(X_train)

ArbitraryOutlierCapper(max_capping_dict={'age': 50, 'fare': 200},
                       min_capping_dict={'age': 10, 'fare': 100})

In [28]:
# upper caps in use, keyed by variable name
capper.right_tail_caps_

{'age': 50, 'fare': 200}

In [29]:
# lower caps in use, keyed by variable name
capper.left_tail_caps_

{'age': 10, 'fare': 100}

In [30]:
# apply the user-defined lower and upper limits to both partitions
train_t, test_t = capper.transform(X_train), capper.transform(X_test)

# extremes of fare and age in the transformed training set
(
    train_t['fare'].max(),
    train_t['age'].max(),
    train_t['fare'].min(),
    train_t['age'].min(),
)

(200.0, 50.0, 100.0, 10.0)