In [None]:
%%capture
## Ejecutar esta celda para instalar o actualizar Feature_Engine
!pip install -U feature_engine

In [23]:
## Check that the installed Feature Engine version is at least 1.7.
import feature_engine

## Displayed as the cell output; this cell only shows the version, it does not enforce it.
feature_engine.__version__

'1.7.0'

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import set_config

## (Optional) Make Scikit-Learn transformers return Pandas DataFrames.
## By default Scikit-Learn converts everything to NumPy, since that is computationally more efficient.
set_config(transform_output="pandas")

## Load the Titanic dataset bundled with seaborn and display it.
df = sns.load_dataset("titanic")
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


* Para realizar cualquier tipo de Preprocesamiento mostraremos 2 procedimientos: `Pandas` y `Scikit-Learn`.

* Ambos permiten el mismo tipo de proceso, la gran diferencia es que Scikit-Learn permite combinar distintos preprocesamientos en un Pipeline y es la manera más `Profesional` de hacer este proceso. 

> Usar Pipelines permitirá evitar errores en procedimientos más complicados que veremos más adelante durante el curso.


## Valores Faltantes

In [25]:
## Count the missing values in each column.
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [26]:
## Optionally, get the fraction of nulls per column (multiply by 100 for a percentage) with this variant.
df.isnull().mean()

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

`Pandas`: Es posible imputar valores usando Pandas con el comando `.fillna()`.

In [27]:
## Compare how each imputation strategy shifts the mean of "age".
edad = df["age"]
media = edad.mean()
mediana = edad.median()

## Mean of the column after filling the missing ages with each candidate value.
promedio_ceros = edad.fillna(0).mean()
promedio_media = edad.fillna(media).mean()
promedio_mediana = edad.fillna(mediana).mean()

print(f"Promedio de Edad: {media}")
print(f"Promedio de Edad con Imputación con Ceros: {promedio_ceros}")
print(f"Promedio de Edad con Imputación por Media: {promedio_media}")
print(f"Promedio de Edad con Imputación por Mediana: {promedio_mediana}")

Promedio de Edad: 29.69911764705882
Promedio de Edad con Imputación con Ceros: 23.79929292929293
Promedio de Edad con Imputación por Media: 29.69911764705882
Promedio de Edad con Imputación por Mediana: 29.36158249158249


`Scikit-Learn`: Utiliza la clase `SimpleImputer`, la cual permite distintas estrategias de Imputación: `"mean"`, `"median"`, `"most_frequent"`, `"constant"`.

In [28]:
from sklearn.impute import SimpleImputer

## Named `imputer` (not `sc`) so it is not confused with the scaler that uses `sc` later in the notebook.
imputer = SimpleImputer(strategy="mean")

## Double brackets [[...]] are used because Scikit-Learn expects matrices or DataFrames:
## df[["age"]] is a one-column DataFrame, whereas df["age"] would be a Series.
data_imputed = imputer.fit_transform(df[["age"]])

## The imputed data no longer has missing values.
data_imputed.isnull().sum()

age    0
dtype: int64

## Outliers

`pandas`: En Pandas se pueden acotar los outliers utilizando `.clip()`

In [29]:
## Mean fare plus the observed range, as a reference before capping outliers.
tarifas = df["fare"]
print(f"Promedio de Tarifas: {tarifas.mean()}")
tarifas.agg(["min", "max"])

Promedio de Tarifas: 32.204207968574636


min      0.0000
max    512.3292
Name: fare, dtype: float64

* **lower**: Define la cota inferior.
* **upper**: Define la cota superior.

In [30]:
## Cap "fare" to the [10, 50] range; the result is a new DataFrame and df is left unchanged.
clipped_data = df[["fare"]].clip(lower=10, upper=50)
## After clipping, min and max match the bounds.
clipped_data.agg(["min", "max"])

Unnamed: 0,fare
min,10.0
max,50.0


In [31]:
## Original (unclipped) fares, for comparison with clipped_data.
df[["fare"]]

Unnamed: 0,fare
0,7.2500
1,71.2833
2,7.9250
3,53.1000
4,8.0500
...,...
886,13.0000
887,30.0000
888,23.4500
889,30.0000


In [32]:
## Values below 10 were replaced by 10.
## Values above 50 were replaced by 50.
clipped_data

Unnamed: 0,fare
0,10.00
1,50.00
2,10.00
3,50.00
4,10.00
...,...
886,13.00
887,30.00
888,23.45
889,30.00


`sklearn`: Para este caso nos apoyaremos en la librería `feature_engine`, la cual posee herramientas para acotar. `feature_engine` sigue exactamente la misma lógica de `Scikit-Learn`.

In [33]:
from feature_engine.outliers import ArbitraryOutlierCapper, Winsorizer

## Same [10, 50] bounds on "fare" as the pandas .clip() example, given as per-column dicts.
capper = ArbitraryOutlierCapper(
    min_capping_dict={"fare": 10},
    max_capping_dict={"fare": 50},
)
capper.fit_transform(df[["fare"]])

Unnamed: 0,fare
0,10.00
1,50.00
2,10.00
3,50.00
4,10.00
...,...
886,13.00
887,30.00
888,23.45
889,30.00


* **capping_method**: Define la Estrategia a utilizar para el Winsorizer. Ver [Docs](https://feature-engine.trainindata.com/en/latest/api_doc/outliers/Winsorizer.html#feature_engine.outliers.Winsorizer).

In [34]:
## capping_method="gaussian" caps at mean +/- fold * std.
## capping_method="iqr" caps at Q1 - fold * IQR and Q3 + fold * IQR.
## tail="both" and fold=3 are passed explicitly so that both bounds (mean - 3*std and mean + 3*std)
## are applied; without tail="both" only the upper bound is capped (Winsorizer defaults to tail="right").
win = Winsorizer(capping_method="gaussian", tail="both", fold=3)
win.fit_transform(df[["fare"]])

Unnamed: 0,fare
0,7.2500
1,71.2833
2,7.9250
3,53.1000
4,8.0500
...,...
886,13.0000
887,30.0000
888,23.4500
889,30.0000


## Variables Categóricas

`pandas`:

### One Hot Encoding

Para la conversión de variables categóricas utilizamos `pd.get_dummies()`.
* **drop_first**: Si es `True` se elimina la primera categoría.

In [35]:
## One dummy column per category is kept (drop_first=False).
pd.get_dummies(df["embark_town"], drop_first=False)

Unnamed: 0,Cherbourg,Queenstown,Southampton
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True
...,...,...,...
886,False,False,True
887,False,False,True
888,False,False,True
889,True,False,False


In [36]:
## drop_first=True removes the first category (Cherbourg here).
pd.get_dummies(df["embark_town"], drop_first=True)
# An advantage of this approach is that nulls are not treated as an extra category...

Unnamed: 0,Queenstown,Southampton
0,False,True
1,False,False
2,False,True
3,False,True
4,False,True
...,...,...
886,False,True
887,False,True
888,False,True
889,False,False


### Ordinal Encoder
Se utiliza `pd.factorize()`.
* **sort**: Usar `True` ya que coloca las categorías en orden. Además de esta manera se comporta igual que `OrdinalEncoder` de `Scikit-Learn`.

In [37]:
## pd.factorize returns (codes, uniques); only the integer codes are kept for the new column.
codigos, _categorias = pd.factorize(df["embark_town"], sort=True)
pd.DataFrame(codigos, columns=["new_column"])

Unnamed: 0,new_column
0,2
1,0
2,2
3,2
4,2
...,...
886,2
887,2
888,2
889,0


`Scikit-Learn`:

### One Hot Encoding

* **sparse_output**: Se debe fijar como False para poder ver el output como Pandas
* **drop**: Se debe colocar `"first"` o el nombre de *una sola categoría* a eliminar.

In [38]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## drop="first" removes the first category (Cherbourg); sparse_output=False gives a dense result.
## Unlike pd.get_dummies above, the missing values get their own column (embark_town_nan).
ohe = OneHotEncoder(drop="first", sparse_output=False)
ohe.fit_transform(df[["embark_town"]])

Unnamed: 0,embark_town_Queenstown,embark_town_Southampton,embark_town_nan
0,0.0,1.0,0.0
1,0.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
886,0.0,1.0,0.0
887,0.0,1.0,0.0
888,0.0,1.0,0.0
889,0.0,0.0,0.0


### Ordinal Encoder

In [39]:
## Still one-hot encoding: here a specific category (Queenstown) is dropped instead of the first one.
ohe = OneHotEncoder(
    drop=["Queenstown"], sparse_output=False
)  # np.nan can also be used here.
ohe.fit_transform(df[["embark_town"]])

Unnamed: 0,embark_town_Cherbourg,embark_town_Southampton,embark_town_nan
0,0.0,1.0,0.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
886,0.0,1.0,0.0
887,0.0,1.0,0.0
888,0.0,1.0,0.0
889,1.0,0.0,0.0


In [40]:
## Ordinal encoding: each category is replaced by a numeric code in a single column.
oe = OrdinalEncoder()
oe.fit_transform(df[["embark_town"]])

Unnamed: 0,embark_town
0,2.0
1,0.0
2,2.0
3,2.0
4,2.0
...,...
886,2.0
887,2.0
888,2.0
889,0.0


## Escalamiento

El escalamiento normalmente se realiza sólo en `Scikit-Learn`. Se mostrarán la `Estandarización` y `Normalización`.

In [41]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## We will call this Standardization... (course convention only)
sc = StandardScaler()
data = sc.fit_transform(df[["fare"]])
## Mean is ~0; std shows as 1.000562 rather than exactly 1 — presumably because pandas' .std()
## uses the sample estimate (ddof=1) while StandardScaler scales with the population std (ddof=0).
data.agg(["mean", "std"])

Unnamed: 0,fare
mean,3.9873330000000004e-18
std,1.000562


In [42]:
## We will call this Normalization... (course convention only)
mms = MinMaxScaler()
## After scaling, "fare" spans [0, 1].
mms.fit_transform(df[["fare"]]).agg(["min", "max"])

Unnamed: 0,fare
min,0.0
max,1.0


## Aplicar Preprocesamientos sólo a algunas variables.

`Scikit-Learn` fue diseñado para el entrenamiento eficiente de modelos. Para ello, se basó en Numpy, el cual no cuenta con nombres de columnas, por lo que para poder aplicar pre-procesamientos a ciertas partes del Dataset utiliza lo que se llama el `ColumnTransformer()`, el cual va más allá del alcance del curso.

Para simplificar el proceso de elegir ciertas columnas, `feature_engine` posee el `SklearnTransformerWrapper`, que permite elegir qué variables queremos pasar por cierta transformación.

In [43]:
## Without SklearnTransformerWrapper

ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(df[["age", "embark_town"]])
## Dummy columns are created even for the numeric variables.

Unnamed: 0,age_0.42,age_0.67,age_0.75,age_0.83,age_0.92,age_1.0,age_2.0,age_3.0,age_4.0,age_5.0,...,age_70.0,age_70.5,age_71.0,age_74.0,age_80.0,age_nan,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [44]:
## Apply a preprocessing step only to selected variables...
from feature_engine.wrappers import SklearnTransformerWrapper

## The wrapped encoder is built first, then restricted to the embark_town column.
codificador = OneHotEncoder(sparse_output=False)
ohe_w = SklearnTransformerWrapper(transformer=codificador, variables="embark_town")
ohe_w.fit_transform(df[["age", "embark_town"]])
## Dummies are created only for embark_town; age is left as it was.

Unnamed: 0,age,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,embark_town_nan
0,22.0,0.0,0.0,1.0,0.0
1,38.0,1.0,0.0,0.0,0.0
2,26.0,0.0,0.0,1.0,0.0
3,35.0,0.0,0.0,1.0,0.0
4,35.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
886,27.0,0.0,0.0,1.0,0.0
887,19.0,0.0,0.0,1.0,0.0
888,,0.0,0.0,1.0,0.0
889,26.0,1.0,0.0,0.0,0.0
