# Preprocessing Pipelines 

# Librerias

In [206]:
# general
import numpy as np
import pandas as pd
import re
import os
import timeit



# visualización
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import warn
plt.style.use('seaborn-colorblind')
#from data_exploration import explore


# preprocesamiento
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# ML
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
import pylab
from sklearn.tree import DecisionTreeRegressor

# ML pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer


from sklearn import set_config

  plt.style.use('seaborn-colorblind')


# Cargar Bases de datos (Tablas)

In [2]:
# cargar base de datos
data = pd.read_csv('./data/titanic.csv')
data.head(3)
print(data.shape)
data

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Preparar base de datos para Pipelines

## Separar y dependiente, x variables

In [87]:
x = data.drop("Survived", axis=1)
y = data["Survived"]
x,y

(     PassengerId  Pclass                                               Name  \
 1              2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
 3              4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
 6              7       1                            McCarthy, Mr. Timothy J   
 10            11       3                    Sandstrom, Miss. Marguerite Rut   
 11            12       1                           Bonnell, Miss. Elizabeth   
 ..           ...     ...                                                ...   
 871          872       1   Beckwith, Mrs. Richard Leonard (Sallie Monypeny)   
 872          873       1                           Carlsson, Mr. Frans Olof   
 879          880       1      Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)   
 887          888       1                       Graham, Miss. Margaret Edith   
 889          890       1                              Behr, Mr. Karl Howell   
 
         Sex   Age  SibSp  Parch    Ti

## Separar bases de datos (tablas) en datos de entrenamiento y datos de prueba

In [89]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


## Extraer tipos de variables

In [144]:
# extraer nombres de columnas por tipo 
# retorna tupla de listas con nombres de columnas por tipo

def columns_types(data):
    
    # definir estrucutra de datos por tipo de variable
    numeric_types=['int', 'int32', 'int64', 'float', 'float32', 'float64']
    category_types = ['object','category']
    str_types = ['string']
    bool_types = ['bool']
    date_types = ['datetime64', 'datetime', 'datetime64[ns]']
    
    # listas de nombres de variables
    numeric_columns = []
    str_columns = []
    bool_columns = []
    date_columns = []
    category_columns = []

    for column in data.columns:
        
        col_type = str(data[column].dtype)
        
        if any(col_type.startswith(t) for t in numeric_types) or any(col_type.startswith(t) for t in numeric_types):
            numeric_columns.append(column)
        elif any(col_type.startswith(t) for t in str_types):
            str_columns.append(column)
        elif any(col_type.startswith(t) for t in bool_types):
            bool_columns.append(column)
        elif any(col_type.startswith(t) for t in date_types):
            date_columns.append(column)
        elif any(col_type.startswith(t) for t in category_types):
            category_columns.append(column)
            

    return numeric_columns, category_columns, str_columns,bool_columns, date_columns



In [146]:
# extraer nombres de columnas por tipo 
numeric_columns, category_columns, str_columns,bool_columns, date_columns = columns_types(data)
numeric_columns, category_columns, str_columns,bool_columns, date_columns



(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
 [],
 [],
 [])

# Pipelines de preprocesamiento

## Pipeline de Tratamiento de NA (Missing Values)

In [45]:
# crear pipeline de tratamiento Missing Values
# crear SimpleImputer
# imputacion simple, estrategia: mediana
imputer = SimpleImputer(strategy="median")
print('Tipo de Imputación de Missing Value NA:  ',imputer.strategy)
imputer

Tipo de Imputación de Missing Value NA:   median


In [46]:
# extraer columnas numericas de base de datos
columnas_numericas1 = data.select_dtypes(include=[np.number])
print(type(columnas_numericas1))
columnas_numericas1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.2500
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.9250
3,4,1,1,35.0,1,0,53.1000
4,5,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0000
887,888,1,1,19.0,0,0,30.0000
888,889,0,3,,1,2,23.4500
889,890,1,1,26.0,0,0,30.0000


In [47]:
# transformar DataFrame de variables numéricas con pipeline de Missing Values
var_num_preprocess1 = imputer.fit_transform(columnas_numericas1)
var_num_preprocess1

array([[  1.    ,   0.    ,   3.    , ...,   1.    ,   0.    ,   7.25  ],
       [  2.    ,   1.    ,   1.    , ...,   1.    ,   0.    ,  71.2833],
       [  3.    ,   1.    ,   3.    , ...,   0.    ,   0.    ,   7.925 ],
       ...,
       [889.    ,   0.    ,   3.    , ...,   1.    ,   2.    ,  23.45  ],
       [890.    ,   1.    ,   1.    , ...,   0.    ,   0.    ,  30.    ],
       [891.    ,   0.    ,   3.    , ...,   0.    ,   0.    ,   7.75  ]])

In [48]:
# ver columnas transformadas por pipeline(imputer)
imputer.feature_names_in_

# crear DataFrame con variables numericas preprocesadas por pipeline(imputer)
data_preprocess1 = pd.DataFrame(var_num_preprocess1, columns=columnas_numericas1.columns,
                          index=columnas_numericas1.index)
data_preprocess1

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1.0,0.0,3.0,22.0,1.0,0.0,7.2500
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833
2,3.0,1.0,3.0,26.0,0.0,0.0,7.9250
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1000
4,5.0,0.0,3.0,35.0,0.0,0.0,8.0500
...,...,...,...,...,...,...,...
886,887.0,0.0,2.0,27.0,0.0,0.0,13.0000
887,888.0,1.0,1.0,19.0,0.0,0.0,30.0000
888,889.0,0.0,3.0,28.0,1.0,2.0,23.4500
889,890.0,1.0,1.0,26.0,0.0,0.0,30.0000


In [49]:
numeric_columns, category_columns, str_columns,bool_columns, date_columns

(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'],
 [],
 [],
 [])

In [None]:
# quitar columna 'Sex'
#str_columns.remove('Sex')
#str_columns

## Variables numéricas 

In [50]:
# crear pipeline de variables numericas
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("standardize", StandardScaler()),
])

# crear pipeline de preprocesamiento de variables numericas
# pasos: imputacion simple por mediana, escalamiento estandar
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
num_pipeline

# ver pipeline
set_config(display='diagram')

# aplicar pipeline de preprocesamiento a variables numericas
numeric_preprocess1 = num_pipeline.fit_transform(data[numeric_columns])

# convertir columnas numericas preprocesadas a DataFrame (tabla)
df_numeric_preprocess = pd.DataFrame(numeric_preprocess1, columns=num_pipeline.get_feature_names_out())
df_numeric_preprocess

# ver pasos de preprocesamiento de pipeline de variables numericas
num_pipeline.steps
num_pipeline

In [51]:
# ver paso de pipeline
num_pipeline.named_steps["simpleimputer"]

# cambiar parametros de paso de pipeline
num_pipeline.set_params(simpleimputer__strategy="median")



## Variables categóricas

In [192]:
category_columns

['Sex', 'Cabin', 'Embarked']

In [165]:
data[category_columns]


Unnamed: 0,Sex,Cabin,Embarked
1,female,C85,C
3,female,C123,S
6,male,E46,S
10,female,G6,S
11,female,C103,S
...,...,...,...
871,female,D35,S
872,male,B51 B53 B55,S
879,female,C50,C
887,female,B42,S


In [168]:
data_copy = data.copy(deep=True)
data_copy = data.dropna()
data_copy


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
872,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [151]:
category_columns.remove('Name')
category_columns.remove('Ticket')

In [170]:
# crear codificador ordinal (categorias por orden)
ordinal_encoder = OrdinalEncoder()

# transformar variables categoricas 
columnas_cate_codificada1 = ordinal_encoder.fit_transform(data[category_columns])

# ver categorias de codificador
ordinal_encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['A10', 'A16', 'A20', 'A23', 'A24', 'A26', 'A31', 'A34', 'A36',
        'A5', 'A6', 'A7', 'B101', 'B18', 'B19', 'B20', 'B22', 'B3', 'B30',
        'B35', 'B37', 'B38', 'B39', 'B4', 'B41', 'B42', 'B49', 'B5', 'B50',
        'B51 B53 B55', 'B57 B59 B63 B66', 'B58 B60', 'B69', 'B71', 'B73',
        'B77', 'B79', 'B80', 'B82 B84', 'B86', 'B94', 'B96 B98', 'C101',
        'C103', 'C104', 'C110', 'C111', 'C118', 'C123', 'C124', 'C125',
        'C126', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C30', 'C32',
        'C45', 'C46', 'C49', 'C50', 'C52', 'C54', 'C62 C64', 'C65', 'C68',
        'C7', 'C70', 'C78', 'C82', 'C83', 'C85', 'C86', 'C87', 'C90',
        'C91', 'C92', 'C93', 'C99', 'D', 'D10 D12', 'D11', 'D15', 'D17',
        'D19', 'D20', 'D26', 'D28', 'D30', 'D33', 'D35', 'D36', 'D37',
        'D46', 'D47', 'D48', 'D49', 'D50', 'D56', 'D6', 'D7', 'D9', 'E10',
        'E101', 'E12', 'E121', 'E17', 'E24', 'E25', 'E31', 'E33', 'E34',
        'E

In [175]:
# crear codificador One-Hot (0-1)
onehot_encoder = OneHotEncoder()

# transformar variables categoricas 
columnas_cate_codificada2 = onehot_encoder.fit_transform(data[category_columns])

# ver categorias de codificador
onehot_encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['A10', 'A16', 'A20', 'A23', 'A24', 'A26', 'A31', 'A34', 'A36',
        'A5', 'A6', 'A7', 'B101', 'B18', 'B19', 'B20', 'B22', 'B3', 'B30',
        'B35', 'B37', 'B38', 'B39', 'B4', 'B41', 'B42', 'B49', 'B5', 'B50',
        'B51 B53 B55', 'B57 B59 B63 B66', 'B58 B60', 'B69', 'B71', 'B73',
        'B77', 'B79', 'B80', 'B82 B84', 'B86', 'B94', 'B96 B98', 'C101',
        'C103', 'C104', 'C110', 'C111', 'C118', 'C123', 'C124', 'C125',
        'C126', 'C148', 'C2', 'C22 C26', 'C23 C25 C27', 'C30', 'C32',
        'C45', 'C46', 'C49', 'C50', 'C52', 'C54', 'C62 C64', 'C65', 'C68',
        'C7', 'C70', 'C78', 'C82', 'C83', 'C85', 'C86', 'C87', 'C90',
        'C91', 'C92', 'C93', 'C99', 'D', 'D10 D12', 'D11', 'D15', 'D17',
        'D19', 'D20', 'D26', 'D28', 'D30', 'D33', 'D35', 'D36', 'D37',
        'D46', 'D47', 'D48', 'D49', 'D50', 'D56', 'D6', 'D7', 'D9', 'E10',
        'E101', 'E12', 'E121', 'E17', 'E24', 'E25', 'E31', 'E33', 'E34',
        'E

# Transformador de Columnas

In [197]:
# crear pipeline de preprocesamiento
# imputador simple por mediana
# estandarización estandar

num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
    ])

# hacer pipeline
# imputación simple por mediana
# estandarización estandar
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# mostrar pipeline
set_config(display='diagram')

# transformar variables numericas 
num_pipeline.fit_transform(data[numeric_columns])

num_pipeline
  

In [207]:
# ver step de Simpleimputer
num_pipeline.named_steps["simpleimputer"] 


In [200]:
# cambiar estrategia en paso
num_pipeline.set_params(simpleimputer__strategy="median")


In [215]:
# hacer pipeline
# imputación simple por moda
# codificación one-hot [0-1] 

cate_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

cate_pipeline

# transformar variables numericas 
cate_pipeline.fit_transform(data[category_columns])
cate_pipeline.steps

cate_pipeline


In [226]:
# crear Pipeline de preprocesamiento
# num_pipeline: variables numericas
# cat_pipeline: variables categoricas
preprocessing = ColumnTransformer([
    ("numericas", num_pipeline, numeric_columns),
    ("categoricas", cate_pipeline, category_columns),
])

# transformar base de datos con Pipeline de preprocesamiento
data_preprocess1 = preprocessing.fit_transform(data)


# crear DataFrame
#data_preprocess1 = pd.DataFrame(
#    data_preprocess1,
#    columns=preprocessing.get_feature_names_out()
#    )


preprocessing.get_feature_names_out()


array(['numericas__PassengerId', 'numericas__Survived',
       'numericas__Pclass', 'numericas__Age', 'numericas__SibSp',
       'numericas__Parch', 'numericas__Fare', 'categoricas__Sex_female',
       'categoricas__Sex_male', 'categoricas__Cabin_A10',
       'categoricas__Cabin_A16', 'categoricas__Cabin_A20',
       'categoricas__Cabin_A23', 'categoricas__Cabin_A24',
       'categoricas__Cabin_A26', 'categoricas__Cabin_A31',
       'categoricas__Cabin_A34', 'categoricas__Cabin_A36',
       'categoricas__Cabin_A5', 'categoricas__Cabin_A6',
       'categoricas__Cabin_A7', 'categoricas__Cabin_B101',
       'categoricas__Cabin_B18', 'categoricas__Cabin_B19',
       'categoricas__Cabin_B20', 'categoricas__Cabin_B22',
       'categoricas__Cabin_B3', 'categoricas__Cabin_B30',
       'categoricas__Cabin_B35', 'categoricas__Cabin_B37',
       'categoricas__Cabin_B38', 'categoricas__Cabin_B39',
       'categoricas__Cabin_B4', 'categoricas__Cabin_B41',
       'categoricas__Cabin_B42', 'categoric

In [212]:
data_preprocess1

<183x145 sparse matrix of type '<class 'numpy.float64'>'
	with 1830 stored elements in Compressed Sparse Row format>

In [None]:
# crear ColumnTransformer
# pipeline1:   num_pipeline - imputacion simple por mediana, escalamiento estandar
# pipeline2: cate_pipeline - imputacion simple por moda(clase más frecuente), codificación one-hot 
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cate_pipeline, make_column_selector(dtype_include=object)),
)

# transformar base de datos con ColumnTransformer
data_preprocess1 = preprocessing.fit_transform(data)


# extra code – shows that we can get a DataFrame out if we want
df_preprocess1 = pd.DataFrame(
    data_preprocess1,
    columns=preprocessing.get_feature_names_out(),
    index=data.index)
df_preprocess1.head(2)


# NOTA

In [5]:
numeric_features = ["Age", "Fare"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

## Variables categóricas

In [6]:
categorical_features = ["Pclass", "Sex", "Embarked"]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

## Variables de texto

In [7]:
text_features = ["Name"]
text_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('tfidf', TfidfVectorizer())
])

# Pipeline de Preprocesamiento

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('text', text_transformer, text_features)
    ])

# Entrenar Pipeline de Preprocesamiento

In [12]:
# Apply preprocessing to the training data
x_train_preprocessed = preprocessor.fit_transform(x_train)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'