# Preprocesamiento

# Librerías

In [90]:
# general
import numpy as np
import pandas as pd
import re
import os
import timeit
import scipy.stats as stats
import pylab

# preprocesamiento
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.preprocessing import KBinsDiscretizer

# ML
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor



# funciones
from data_exploration import explore
from feature_cleaning import missing_data, outlier, rare_values
from feature_cleaning import missing_data as ms
from feature_cleaning import outlier as ot
from feature_cleaning import rare_values as ra
from feature_engineering import discretization as dc

from feature_engineering import discretization, encoding, transformation
from feature_selection import embedded_method, feature_shuffle, filter_method, hybrid


# Limpieza de datos

In [23]:
# Load the presidents table (path is relative to the working directory).
df = pd.read_csv("data/presidents.csv")
df.head()

# Build "First" directly from "President": the regex "[ ].*" matches the
# first space and everything after it, so replacing that match with ""
# leaves only the leading word (the first name).
df["First"] = df["President"].replace("[ ].*", "", regex=True)
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James


In [24]:
# Remove the "First" column so it can be rebuilt below with a row-wise function.
del(df["First"])

# hacer metodo splitname y agregar 2 columnas al DataFrame

# funcion splitname(row): split fila en columnas row['First'], row['Last']
# parametros: fila de DataFrame

def splitname(row):
    """Add 'First' and 'Last' name fields to a row holding a 'President' name.

    The full name is split on single spaces; the first piece is stored in
    row['First'] and the last piece in row['Last'] (anything in between,
    such as middle names or initials, is ignored).

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide a string under the key 'President'.

    Returns
    -------
    The same row object, modified in place, with 'First' and 'Last' set.
    """
    pieces = row['President'].split(" ")
    row['First'], row['Last'] = pieces[0], pieces[-1]
    return row

# Run splitname over the DataFrame.
# With axis='columns' the function is called once per row, so every row
# comes back with the two extra fields 'First' and 'Last'
# (visible as new columns in the preview).
df=df.apply(splitname, axis='columns')
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [25]:
# Remove the helper columns so the same split can be redone with a regex.
del df['First']
del df['Last']

# Two capture groups separated by one non-capturing group:
#   (^[\w]*)  leading word                       -> first name
#   (?:.* )   everything up to the last space    -> matched but not returned
#   ([\w]*$)  trailing word                      -> last name
# Written as a raw string: "\w" inside a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on Python 3.12+.
pattern = r"(^[\w]*)(?:.* )([\w]*$)"

# extract() yields one column per capture group (columns 0 and 1).
df["President"].str.extract(pattern).head()

Unnamed: 0,0,1
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe


In [26]:
# First-name / last-name pattern with named capture groups, so that
# extract() labels the resulting columns "First" and "Last".
# Raw string: avoids the invalid "\w" escape warning of a normal literal.
pattern = r"(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)"

# Extract over the WHOLE column. Truncating the result with .head() would
# leave First/Last as NaN for every row after the fifth once the columns are
# assigned back, because the assignment aligns on the index.
names = df["President"].str.extract(pattern)

# Attach the extracted columns to the DataFrame.
df["First"] = names["First"]
df["Last"] = names["Last"]
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [27]:
# Keep only the "Mon D, YYYY" part of "Born"; this drops trailing footnote
# markers such as "[a]". Raw string: "\w" is an invalid escape sequence in a
# normal string literal (SyntaxWarning on Python 3.12+).
df["Born"] = df["Born"].str.extract(r"([\w]{3} [\w]{1,2}, [\w]{4})")
df["Born"].head()


# Parse the cleaned strings into datetime values.
df["Born"] = pd.to_datetime(df["Born"])
df["Born"].head()

0   1732-02-22
1   1735-10-30
2   1743-04-13
3   1751-03-16
4   1758-04-28
Name: Born, dtype: datetime64[ns]

## Missing Values

Tratamiento de valores NAs y NULL

In [28]:
# Load the class grades table and preview the first 10 rows
# (the recorded preview already shows missing values in several columns).
df = pd.read_csv('data/class_grades.csv')
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [29]:
# Boolean mask with the same shape as df: True wherever a value is missing.
mask=df.isnull()
mask.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,True,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [30]:
# Drop every row that contains at least one NA and preview the result.
# The result is not assigned, so df itself keeps its missing values.
df.dropna().head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61
10,7,80.44,90.2,75.0,91.48,39.72
12,8,97.16,103.71,72.5,93.52,63.33
13,7,91.28,83.53,81.25,99.81,92.22


In [31]:
# Replace every NA in the DataFrame with the given value (0 here).
# inplace=True modifies df itself instead of returning a new frame.
df.fillna(0, inplace=True)
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,0.0,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [32]:
# Load the video playback log and preview it; the 'paused' and 'volume'
# columns are mostly empty in the recorded preview.
df = pd.read_csv("data/log.csv")
df.head(20)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [33]:
# Use the 'time' column as the index and order the rows by that index.
df = df.set_index('time').sort_index()
df.head(20)

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [34]:
# Turn 'time' back into a regular column (restoring the default integer
# index), then build a two-level index from the 'time' and 'user' columns.
df = df.reset_index().set_index(['time', 'user'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [35]:
# Forward-fill: every NA takes the most recent non-missing value above it in
# the same column. DataFrame.ffill() is the supported spelling;
# fillna(method='ffill') is deprecated in recent pandas releases.
# NOTE(review): rows are ordered by time with users interleaved, so a gap in
# one user's rows can be filled from another user's row - confirm this is
# the intended behaviour.
df = df.ffill()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,False,10.0
1469974454,sue,advanced.html,24,False,10.0
1469974484,cheryl,intro.html,7,False,10.0
1469974514,cheryl,intro.html,8,False,10.0
1469974524,sue,advanced.html,25,False,10.0
1469974544,cheryl,intro.html,9,False,10.0
1469974554,sue,advanced.html,26,False,10.0
1469974574,cheryl,intro.html,10,False,10.0


In [36]:
# Small demo frame: two integer columns and one string column.
df = pd.DataFrame(dict(
    A=[1, 1, 2, 3, 4],
    B=[3, 6, 3, 8, 9],
    C=list('abcde'),
))

# Show a copy in which every value 1 has become 100; df itself stays
# untouched because the result is not assigned back.
df.replace(to_replace=1, value=100)

Unnamed: 0,A,B,C
0,100,3,a
1,100,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [37]:
# List form of replace: values are mapped pairwise, so 1 -> 100 and 3 -> 300
# (see columns A and B in the displayed result). df itself is not modified.
df.replace([1, 3], [100, 300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,6,b
2,2,300,c
3,300,8,d
4,4,9,e


In [38]:
# Reload the playback log from disk to start again from the raw data.
df = pd.read_csv("data/log.csv")
df.head(20)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [39]:
# Replace every string that ends in ".html" with "webpage" (regex=True).
# The dot before "html" is escaped so it matches a literal "." rather than
# any character, and the pattern is a raw string so "\." reaches the regex
# engine unchanged.
df.replace(to_replace=r".*\.html$", value="webpage", regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,


In [40]:
# Columns to read from the Titanic CSV.
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

# usecols restricts the load to the columns listed above.
data = pd.read_csv('data/titanic.csv', usecols=use_cols)
print(data.shape)
data

(891, 6)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
888,0,3,female,,1,23.4500
889,1,1,male,26.0,0,30.0000


In [41]:
# Count the missing values of every column (variable) and their proportion.
# Per the recorded output, the helper also reports saving the result under
# output_path.
ms.check_missing(data=data,output_path=r'./data/outs')


result saved at ./data/outs missing.csv


Unnamed: 0,total missing,proportion
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,177,0.198653
SibSp,0,0.0
Fare,0,0.0


### Listwise deletion

Quitar filas con NAs

In [42]:
# Listwise deletion: data2 holds only the rows without missing values
# (the recorded shape goes from 891 rows to 714).
data2 = ms.drop_missing(data=data)
print(data2.shape)
data2

(714, 6)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
885,0,3,female,39.0,0,29.1250
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
889,1,1,male,26.0,0,30.0000


In [43]:
# Add an indicator column 'Age_is_NA' for the variable 'Age':
# 1 = Age is missing for that observation, 0 = Age is present.
# value_counts() prints how many rows fall in each case.
data3 = ms.add_var_denote_NA(data=data,NA_col=['Age'])
print(data3.Age_is_NA.value_counts())
data3.head(8)

0    714
1    177
Name: Age_is_NA, dtype: int64


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_is_NA
0,0,3,male,22.0,1,7.25,0
1,1,1,female,38.0,1,71.2833,0
2,1,3,female,26.0,0,7.925,0
3,1,1,female,35.0,1,53.1,0
4,0,3,male,35.0,0,8.05,0
5,0,3,male,,0,8.4583,1
6,0,1,male,54.0,0,51.8625,0
7,0,3,male,2.0,3,21.075,0


### Imputación Arbitraria 
Reemplazar valores de NA con valor arbitrario

In [44]:
# Impute the missing 'Age' values with the arbitrary constant -999.
# Per the recorded output the result goes into a new column 'Age_-999',
# while the original 'Age' column keeps its NaN values.
data4 = ms.impute_NA_with_arbitrary(data=data,impute_value=-999,NA_col=['Age'])
data4.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_-999
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,-999.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


### Imputación con Promedio/Mediana/Moda 
Reemplazar NA por promedio/mediana/moda

In [45]:
# Print the median of 'Age' for reference, then impute missing ages with it.
# Per the recorded output the imputed values are written to a new column
# 'Age_impute_median'.
print(data.Age.median())
data5 = ms.impute_NA_with_avg(data=data,strategy='median',NA_col=['Age'])
data5.head(8)

28.0


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_median
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,28.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


### Imputación con Cola de Distribución

In [46]:
# Impute the missing 'Age' values with a value taken from the tail of the
# distribution (73.27861 in the recorded output); the result is written to
# 'Age_impute_end_of_distri'.
# NOTE(review): the exact tail rule lives in feature_cleaning.missing_data -
# confirm it there.
data6 = ms.impute_NA_with_end_of_distribution(data=data,NA_col=['Age'])
data6.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_end_of_distri
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,73.27861
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


### Imputación Aleatoria
Reemplazar NA por muestreo aleatorio

In [47]:
# Impute the missing 'Age' values by random sampling; per the recorded
# output the result is written to a new column 'Age_random'.
# NOTE(review): no seed is passed here, so reproducibility depends on the
# helper's defaults - confirm.
data7 = ms.impute_NA_with_random(data=data,NA_col=['Age'])
data7

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_random
0,0,3,male,22.0,1,7.2500,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.9250,26.0
3,1,1,female,35.0,1,53.1000,35.0
4,0,3,male,35.0,0,8.0500,35.0
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000,27.0
887,1,1,female,19.0,0,30.0000,19.0
888,0,3,female,,1,23.4500,15.0
889,1,1,male,26.0,0,30.0000,26.0


## Outliers

Tratamiento de valores anormales

In [48]:
# Columns to keep from the table.
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

# Load the dataset again so the outlier section starts from the raw data.
data = pd.read_csv('./data/titanic.csv', usecols=use_cols)
# Not the last expression of the cell, so this preview is not displayed.
data.head(3)
print(data.shape)
data

(891, 6)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
888,0,3,female,,1,23.4500
889,1,1,male,26.0,0,30.0000


In [49]:
# Build a Series with the distinct 'Fare' values, sorted ascending,
# to get a first look at the range and the extreme values.
pd.Series(data.Fare.unique()).sort_values()

104      0.0000
163      4.0125
245      5.0000
152      6.2375
240      6.4375
         ...   
164    227.5250
75     247.5208
148    262.3750
23     263.0000
127    512.3292
Length: 248, dtype: float64

### Detección por Límite Arbitrario

Detección de outliers por límites superior e inferior por parámetro 

In [50]:
# Detect outliers of one column (variable) using hand-picked limits:
# upper_fence = upper limit, lower_fence = lower limit.
#
# Returns:
#   index - selector for the rows detected as outliers in the column
#   para  - detection parameters; para[0] is printed as the upper bound and
#           para[1] as the lower bound
index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [51]:
# Inspect the 'Fare' values of the detected outliers, sorted ascending.
data.loc[index,'Fare'].sort_values()

179      0.0000
806      0.0000
732      0.0000
674      0.0000
633      0.0000
597      0.0000
815      0.0000
466      0.0000
481      0.0000
302      0.0000
277      0.0000
271      0.0000
263      0.0000
413      0.0000
822      0.0000
378      4.0125
679    512.3292
737    512.3292
258    512.3292
Name: Fare, dtype: float64

### Método IQR

Detección de outliers por la Regla de Rangos Intercuantiles

In [52]:
# Detect outliers of one column (variable) with the interquartile-range
# (IQR) rule.
# threshold = IQR threshold passed to the detector.
# NOTE(review): presumably the bounds are Q1 - threshold*IQR and
# Q3 + threshold*IQR - confirm in feature_cleaning.outlier.
#
# Returns:
#   index - selector for the rows detected as outliers in the column
#   para  - detection parameters; para[0] is printed as the upper bound and
#           para[1] as the lower bound

index,para = ot.outlier_detect_IQR(data=data,col='Fare',threshold=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 31
Proportion of outlier detected 0.03479236812570146
Upper bound: 146.448 
Lower bound: -107.53760000000001


In [53]:
# Inspect the 'Fare' values flagged by the IQR rule, sorted ascending.
data.loc[index,'Fare'].sort_values()

31     146.5208
195    146.5208
305    151.5500
708    151.5500
297    151.5500
498    151.5500
609    153.4625
332    153.4625
268    153.4625
318    164.8667
856    164.8667
730    211.3375
779    211.3375
689    211.3375
377    211.5000
527    221.7792
700    227.5250
716    227.5250
557    227.5250
380    227.5250
299    247.5208
118    247.5208
311    262.3750
742    262.3750
341    263.0000
88     263.0000
438    263.0000
27     263.0000
679    512.3292
258    512.3292
737    512.3292
Name: Fare, dtype: float64

### Método de Promedio y Desviación Estándar

Detección de outliers por Promedio y Desviación Estándar

In [54]:
# Detect outliers of one column (variable) using its mean and standard
# deviation.
# threshold = threshold passed to the detector.
# NOTE(review): presumably the bounds are mean +/- threshold*std - confirm
# in feature_cleaning.outlier.
#
# Returns:
#   index - selector for the rows detected as outliers in the column
#   para  - detection parameters; para[0] is printed as the upper bound and
#           para[1] as the lower bound
index,para = ot.outlier_detect_mean_std(data=data,col='Fare',threshold=3)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 20
Proportion of outlier detected 0.02244668911335578
Upper bound: 181.28449376011736 
Lower bound: -116.87607782296807


In [55]:
# Inspect the 'Fare' values flagged by the mean/std rule, sorted ascending.
data.loc[index,'Fare'].sort_values()

779    211.3375
730    211.3375
689    211.3375
377    211.5000
527    221.7792
716    227.5250
700    227.5250
380    227.5250
557    227.5250
118    247.5208
299    247.5208
311    262.3750
742    262.3750
27     263.0000
341    263.0000
88     263.0000
438    263.0000
258    512.3292
737    512.3292
679    512.3292
Name: Fare, dtype: float64

### Método MAD

Detección de outliers por Mediana y Desviación de Mediana Absoluta (MAD)

In [56]:
# Detect outliers of one column (variable) using the median and the
# median absolute deviation (MAD).
# threshold = threshold passed to the detector.
#
# Only the outlier selector is captured from this call; no detection
# parameters (`para`) are unpacked here.
index = ot.outlier_detect_MAD(data=data,col='Fare',threshold=3.5)

Num of outlier detected: 160
Proportion of outlier detected 0.17957351290684623


In [57]:
# Inspect the 'Fare' values flagged by the MAD rule, sorted ascending.
data.loc[index,'Fare'].sort_values()

867     50.4958
571     51.4792
6       51.8625
457     51.8625
669     52.0000
         ...   
27     263.0000
438    263.0000
737    512.3292
679    512.3292
258    512.3292
Name: Fare, Length: 160, dtype: float64

### Imputación con Valor Arbitrario

Cambiar valor de outlier por valor arbitrario por parámetro

In [58]:
# Detect outliers with hand-picked limits (upper 300, lower 5); the
# resulting `index` feeds the imputation in the next cell.
index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [59]:
# Replace the detected outliers with an arbitrary value.
# outlier_index = selector of the outliers detected in the column,
# value = replacement value, col = list of columns to modify.
data2 = ot.impute_outlier_with_arbitrary(data=data,outlier_index=index,
                                         value=-999,col=['Fare'])
# Show a slice that contains replaced rows (e.g. 258 and 263 in the output).
data2[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,-999.0
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,-999.0
264,0,3,female,,0,7.75


### Winsorización

Cambiar valor de outlier con valores max-min de la distribución 

In [60]:
# Detect outliers with hand-picked limits (positional arguments: data,
# column, upper fence 300, lower fence 5); `para` feeds the next cell.
index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [61]:
# Cap the outliers at the top/bottom limits held in `para`.
# strategy='both' is passed so that both ends are treated: in the recorded
# output row 258 becomes 300.0 (upper limit) and row 263 becomes 5.0
# (lower limit).
data3 = ot.windsorization(data=data,col='Fare',para=para,strategy='both')
data3[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,300.0
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,5.0
264,0,3,female,,0,7.75


### Descartar Outliers

Quitar outliers de tabla 

In [62]:
# Detect outliers with hand-picked limits (upper 300, lower 5); the
# resulting `index` tells the next cell which rows to discard.
index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [63]:
# Remove the detected outlier rows, then print the max and min of the
# remaining 'Fare' values to confirm they lie within the limits.
data4 = ot.drop_outlier(data=data,outlier_index=index)
print(data4.Fare.max())
print(data4.Fare.min())

263.0
5.0


### Imputación con Promedio/Mediana/Moda

In [64]:
# Detect outliers with hand-picked limits (upper 300, lower 5); the
# resulting `index` feeds the mean imputation in the next cell.
index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [65]:

# Replace the detected outliers with the mean of the column
# (strategy='mean'); rows 258 and 263 show 32.204208 in the recorded output.
data5 = ot.impute_outlier_with_avg(data=data,col='Fare',
                                   outlier_index=index,strategy='mean')
data5[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,32.204208
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,32.204208
264,0,3,female,,0,7.75


## Valores Raros (atípicos)

In [66]:
# Columns to read from the Titanic CSV.
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

# Print the share of rows per label for Pclass and SibSp.
# In the recorded output SibSp has labels (3, 8, 5) that occur in under 2%
# of the rows, while every Pclass label covers more than 20%.
data = pd.read_csv('./data/titanic.csv', usecols=use_cols)
for i in ['Pclass','SibSp']:
    print('Variable',i,'label proportion:')
    print(data[i].value_counts()/len(data))

Variable Pclass label proportion:
3    0.551066
1    0.242424
2    0.206510
Name: Pclass, dtype: float64
Variable SibSp label proportion:
0    0.682379
1    0.234568
2    0.031425
4    0.020202
3    0.017957
8    0.007856
5    0.005612
Name: SibSp, dtype: float64


In [67]:

# Fit an encoder that groups the observations carrying rare labels into a
# single category. In the recorded mapping the SibSp labels 8 and 5 (both
# under the 0.01 threshold) are mapped to 'rare'; all other labels map to
# themselves.
enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)
print(enc.mapping)

[{'col': 'Pclass', 'mapping': 3    3
1    1
2    2
dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0       0
1       1
2       2
4       4
3       3
8    rare
5    rare
dtype: object, 'data_type': dtype('int64')}]


In [68]:
# Apply the fitted encoder to the dataset.
data2 = enc.transform(data)
# Check: SibSp now shows a single 'rare' category for the grouped labels.
print(data2.SibSp.value_counts())

0       608
1       209
2        28
4        18
3        16
rare     12
Name: SibSp, dtype: int64


### Imputación con Moda

Cambiar valor de categoria rara con moda

In [69]:
# Fit a mode-imputation encoder for rare labels.
# threshold = frequency threshold passed to the encoder (0.01 here).
# In the recorded mapping the SibSp labels 8 and 5 are mapped to 0, the most
# frequent SibSp label.
enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)

print(enc.mapping)

[{'col': 'Pclass', 'mapping': 3    3
1    1
2    2
dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0    0
1    1
2    2
4    4
3    3
8    0
5    0
dtype: int64, 'data_type': dtype('int64')}]


In [70]:
# Apply the fitted encoder to the dataset.
data3 = enc.transform(data)

# Check: the rare SibSp labels no longer appear and the count of label 0
# has grown accordingly.
print(data3.SibSp.value_counts())

0    620
1    209
2     28
4     18
3     16
Name: SibSp, dtype: int64


## Alta Cardinalidad

# Ingeniería de Variables (Feature Engineering)

* Escalamiento/Estandarización:
* Discretización:
* Codificación:
* Transformación:
* Generación: 


## Escalamiento

Escalar/estandarizar variables numéricas

In [71]:
# Columns to read from the Titanic CSV.
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

# Reload the dataset so the scaling section starts from the raw data.
data = pd.read_csv('./data/titanic.csv', usecols=use_cols)
data


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.2500
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.9250
3,1,1,female,35.0,1,53.1000
4,0,3,male,35.0,0,8.0500
...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000
887,1,1,female,19.0,0,30.0000
888,0,3,female,,1,23.4500
889,1,1,male,26.0,0,30.0000


In [72]:
# Separate the target from the predictors:
# x = every column except 'Survived', y = the 'Survived' column.
x = data.drop(['Survived'], axis=1)
y = data['Survived']


In [73]:
# Split the dataset into training and test sets: 20% of the rows go to the
# test set, and random_state=0 fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)
x_train.shape,x_test.shape 

((712, 5), (179, 5))

### Estandarización/Normalización

Quitar la media (promedio) y escalar las variables a varianza unitaria

$z = (X - X.mean) /  std$ 


In [74]:
# Fit the standard scaler on the training 'Fare' column, then store the
# z-scores in a new column of a deep copy so x_train stays untouched.
ss = StandardScaler().fit(x_train[['Fare']])
x_train_copy = x_train.copy(deep=True)
x_train_copy['Fare_zscore'] = ss.transform(x_train_copy[['Fare']])
print(x_train_copy.head(6))

# Sanity check: mean ~0 and standard deviation ~1. The printed std is not
# exactly 1 because Series.std() uses the sample (n-1) formula while the
# scaler divides by the population standard deviation.
print('promedio',x_train_copy['Fare_zscore'].mean())
print('desviación estándar',x_train_copy['Fare_zscore'].std())


     Pclass     Sex   Age  SibSp     Fare  Fare_zscore
140       3  female   NaN      0  15.2458    -0.331679
439       2    male  31.0      0  10.5000    -0.426405
817       2    male  31.0      1  37.0042     0.102620
378       3    male  20.0      0   4.0125    -0.555896
491       3    male  21.0      0   7.2500    -0.491276
331       1    male  45.5      0  28.5000    -0.067125
promedio 1.0478509445900354e-16
desviación estándar 1.0007029877845377


### Escalador Min-Max

Escalar variables en un rango dado.
Default [0,1]

$Xscaled = (X - X.min) / (X.max - X.min)$

In [80]:
# Fit the Min-Max scaler on the numeric training column 'Fare', then store
# the scaled values in a new column of a deep copy of x_train.
mms = MinMaxScaler().fit(x_train[['Fare']])
x_train_copy = x_train.copy(deep=True)
x_train_copy['Fare_minmax'] = mms.transform(x_train_copy[['Fare']])
print(x_train_copy.head(6))



     Pclass     Sex   Age  SibSp     Fare  Fare_minmax
140       3  female   NaN      0  15.2458     0.029758
439       2    male  31.0      0  10.5000     0.020495
817       2    male  31.0      1  37.0042     0.072227
378       3    male  20.0      0   4.0125     0.007832
491       3    male  21.0      0   7.2500     0.014151
331       1    male  45.5      0  28.5000     0.055628


In [81]:
# Check the range of the scaled variable: it should span exactly [0, 1] on
# the data the scaler was fitted on.
# The copy is named x_train_copy (lower-case x); X_train_copy is never
# defined in this notebook and would raise NameError on a clean run.
print(x_train_copy['Fare_minmax'].max())
print(x_train_copy['Fare_minmax'].min())

1.0
0.0


### Escalador Robusto

Quita la mediana y escala las variables con el rango cuantil.
Default IQR

$Xscaled =(x-x.median)/IQR$



In [85]:
# Fit the robust scaler on the training 'Fare' column, then store the
# scaled values in a new column of a deep copy of x_train.
rs = RobustScaler().fit(x_train[['Fare']])
x_train_copy = x_train.copy(deep=True)
x_train_copy['Fare_robust'] = rs.transform(x_train_copy[['Fare']])
print(x_train_copy.head(6))


     Pclass     Sex   Age  SibSp     Fare  Fare_robust
140       3  female   NaN      0  15.2458     0.033803
439       2    male  31.0      0  10.5000    -0.169380
817       2    male  31.0      1  37.0042     0.965349
378       3    male  20.0      0   4.0125    -0.447130
491       3    male  21.0      0   7.2500    -0.308522
331       1    male  45.5      0  28.5000     0.601257


## Discretización

In [91]:
# Columns to read from the Titanic CSV.
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]

# Reload the dataset for the discretization section.
data = pd.read_csv('./data/titanic.csv', usecols=use_cols)


In [92]:
# Rebuild predictors and target from the freshly reloaded `data`, so this
# section does not silently depend on x / y left over from an earlier cell.
x = data.drop(['Survived'], axis=1)
y = data['Survived']

# 80/20 train/test split; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((712, 5), (179, 5))

### Bins de igual tamaño

Dividir variable numérica en bins iguales

In [96]:
# Fit a discretizer that splits 'Fare' into equal-width bins (intervals).
# n_bins = number of bins, encode = how the bin is encoded in the output,
# strategy = how the bin edges are chosen ('uniform' = equal width).
enc_equal_width = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform').fit(X_train[['Fare']])





In [94]:
# Inspect the learned bin edges (three intervals of equal width).
enc_equal_width.bin_edges_

array([array([  0.    , 170.7764, 341.5528, 512.3292])], dtype=object)

In [95]:
# Discretize the training 'Fare' values and count the rows per bin; with
# equal-width bins almost every row lands in bin 0 in the recorded output.
result = enc_equal_width.transform(X_train[['Fare']])
pd.DataFrame(result)[0].value_counts()

0.0    697
1.0     12
2.0      3
Name: 0, dtype: int64

In [98]:
# Add the discretized variable as a new column on a deep copy of the
# training data, leaving x_train itself unchanged.
x_train_copy = x_train.copy(deep=True)
x_train_copy['Fare_equal_width'] = enc_equal_width.transform(x_train[['Fare']])
x_train_copy

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Fare_equal_width
140,3,female,,0,15.2458,0.0
439,2,male,31.0,0,10.5000,0.0
817,2,male,31.0,1,37.0042,0.0
378,3,male,20.0,0,4.0125,0.0
491,3,male,21.0,0,7.2500,0.0
...,...,...,...,...,...,...
835,1,female,39.0,1,83.1583,0.0
192,3,female,19.0,1,7.8542,0.0
629,3,male,,0,7.7333,0.0
559,3,female,36.0,1,17.4000,0.0


### Bins de igual frecuencia

Dividir variable numérica en bins iguales con el mismo número de observaciones(filas)

In [99]:
# Fit a discretizer that splits 'Fare' into bins holding a similar number of
# observations.
# n_bins = number of bins, encode = how the bin is encoded in the output,
# strategy = how the bin edges are chosen ('quantile' = quantile-based).
enc_equal_freq = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='quantile').fit(x_train[['Fare']])

In [100]:
# Inspect the learned bin edges (quantile-based, so the widths differ).
enc_equal_freq.bin_edges_

array([array([  0.    ,   8.6625,  26.    , 512.3292])], dtype=object)

In [102]:
# Count the rows per bin: roughly (not exactly) the same number in each
# bin - 256 / 237 / 219 in the recorded output.
result = enc_equal_freq.transform(x_train[['Fare']])
pd.DataFrame(result)[0].value_counts()

2.0    256
0.0    237
1.0    219
Name: 0, dtype: int64

In [104]:
# Add the discretized variable as a new column on a deep copy of the
# training data, leaving x_train itself unchanged.
x_train_copy = x_train.copy(deep=True)
x_train_copy['Fare_equal_freq'] = enc_equal_freq.transform(x_train[['Fare']])
x_train_copy

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Fare_equal_freq
140,3,female,,0,15.2458,1.0
439,2,male,31.0,0,10.5000,1.0
817,2,male,31.0,1,37.0042,2.0
378,3,male,20.0,0,4.0125,0.0
491,3,male,21.0,0,7.2500,0.0
...,...,...,...,...,...,...
835,1,female,39.0,1,83.1583,2.0
192,3,female,19.0,1,7.8542,0.0
629,3,male,,0,7.7333,0.0
559,3,female,36.0,1,17.4000,1.0


### Bins por K-Means 

Usar K-Means para generar valores de partición en **clusters**

In [105]:
# Fit a discretizer whose bin edges come from K-Means clustering of 'Fare'.
# n_bins = number of bins, encode = how the bin is encoded in the output,
# strategy = how the bin edges are chosen ('kmeans').
# NOTE(review): the recorded output shows a joblib/loky "[WinError 2]"
# traceback raised while counting physical cores; the following cells still
# show fitted bin edges, so the fit itself appears to have completed.
enc_kmeans = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='kmeans').fit(X_train[['Fare']])

[WinError 2] El sistema no puede encontrar el archivo especificado
  File "c:\Users\Diego\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 199, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\Diego\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Diego\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Diego\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [106]:
# Inspect the bin edges learned by the K-Means strategy.
enc_kmeans.bin_edges_

array([array([  0.        ,  91.14736819, 336.18681923, 512.3292    ])],
      dtype=object)

In [107]:
# Discretize the training 'Fare' values and count the rows per bin.
result = enc_kmeans.transform(x_train[['Fare']])
pd.DataFrame(result)[0].value_counts()

0.0    670
1.0     39
2.0      3
Name: 0, dtype: int64

In [109]:
# Add the discretized variable as a new column on a deep copy of the
# training data and print the first 10 rows.
x_train_copy = x_train.copy(deep=True)
x_train_copy['Fare_kmeans'] = enc_kmeans.transform(x_train[['Fare']])
print(x_train_copy.head(10))

     Pclass     Sex   Age  SibSp     Fare  Fare_kmeans
140       3  female   NaN      0  15.2458          0.0
439       2    male  31.0      0  10.5000          0.0
817       2    male  31.0      1  37.0042          0.0
378       3    male  20.0      0   4.0125          0.0
491       3    male  21.0      0   7.2500          0.0
331       1    male  45.5      0  28.5000          0.0
588       3    male  22.0      0   8.0500          0.0
358       3  female   NaN      0   7.8792          0.0
674       2    male   NaN      0   0.0000          0.0
162       3    male  26.0      0   7.7750          0.0


## Codificación

## Transformaciones

## Feature Generation

# Selección de variables

## Método de Filtro

## Método 'Wrapper'

## Método Embebido

## Mezcla de variables

## Método Híbrido