# Preprocessing Pipelines 

# Librerias

In [94]:
# general
import numpy as np
import pandas as pd
import re
import os
import timeit


# visualización
import seaborn as sns
import matplotlib.pyplot as plt

from warnings import warn
plt.style.use('seaborn-colorblind')
#from data_exploration import explore


# preprocesamiento
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile

# ML
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
import pylab
from sklearn.tree import DecisionTreeRegressor

# ML pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer

from sklearn import set_config

  plt.style.use('seaborn-colorblind')


# Cargar Bases de datos (Tablas)

In [3]:
# cargar base de datos
data = pd.read_csv('./data/titanic.csv')
data.head(3)
print(data.shape)
data

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Separar y dependiente, x variables

In [4]:
x = data.drop("Survived", axis=1)
y = data["Survived"]
x,y

(     PassengerId  Pclass                                               Name  \
 0              1       3                            Braund, Mr. Owen Harris   
 1              2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
 2              3       3                             Heikkinen, Miss. Laina   
 3              4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
 4              5       3                           Allen, Mr. William Henry   
 ..           ...     ...                                                ...   
 886          887       2                              Montvila, Rev. Juozas   
 887          888       1                       Graham, Miss. Margaret Edith   
 888          889       3           Johnston, Miss. Catherine Helen "Carrie"   
 889          890       1                              Behr, Mr. Karl Howell   
 890          891       3                                Dooley, Mr. Patrick   
 
         Sex   Age  SibSp  Parch      

# Separar tabla(bases de datos) en datos de entrenamiento y datos de prueba

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# Transformación de variables

Definir pasos de preprocesamiento

# Extraer tipos de variables

In [83]:
# extraer nombres de columnas por tipo 
# retorna tupla de listas con nombres de columnas por tipo
def columns_types(data):
    
    # definir estrucutra de datos por tipo de variable
    numeric_types=['int', 'int32', 'int64', 'float', 'float32', 'float64']
    str_types = ['object', 'string']
    bool_types = ['bool']
    date_types = ['datetime64', 'datetime', 'datetime64[ns]']
    category_types = ['category']
    
    # listas de nombres de variables
    numeric_columns = []
    str_columns = []
    bool_columns = []
    date_columns = []
    category_columns = []

    for column in data.columns:
        
        col_type = str(data[column].dtype)
        
        if any(col_type.startswith(t) for t in numeric_types) or any(col_type.startswith(t) for t in numeric_types):
            numeric_columns.append(column)
        elif any(col_type.startswith(t) for t in str_types):
            str_columns.append(column)
        elif any(col_type.startswith(t) for t in bool_types):
            bool_columns.append(column)
        elif any(col_type.startswith(t) for t in date_types):
            date_columns.append(column)
        elif any(col_type.startswith(t) for t in category_types):
            category_columns.append(column)
            

    return numeric_columns, category_columns, str_columns,bool_columns, date_columns




In [84]:
# extraer nombres de columnas por tipo 
numeric_columns, category_columns, str_columns,bool_columns, date_columns = columns_types(data)

# Pipelines de preprocesamiento 

In [85]:
#data
str_columns

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [86]:
# quitar columna 'Sex'
str_columns.remove('Sex')
str_columns

['Name', 'Ticket', 'Cabin', 'Embarked']

## Pipeline de preprocesamiento de variables numéricas 

In [87]:
# crear pipeline de variables numericas
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("standardize", StandardScaler()),
])

# crear pipeline de preprocesamiento de variables numericas
# pasos: imputacion simple por mediana, escalamiento estandar
num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
num_pipeline

# ver pipeline
set_config(display='diagram')

# aplicar pipeline de preprocesamiento a variables numericas
numeric_preprocess1 = num_pipeline.fit_transform(data[numeric_columns])

# convertir columnas numericas preprocesadas a DataFrame (tabla)
df_numeric_preprocess = pd.DataFrame(numeric_preprocess1, columns=num_pipeline.get_feature_names_out())
df_numeric_preprocess

# ver pasos de preprocesamiento de pipeline de variables numericas
num_pipeline.steps
num_pipeline

In [88]:
# ver paso de pipeline
num_pipeline.named_steps["simpleimputer"]

# cambiar parametros de paso de pipeline
num_pipeline.set_params(simpleimputer__strategy="median")



## Pipeline de preprocesamiento de variables categóricas

In [89]:
category_columns.append('Sex')

In [95]:
# crear pipeline de preprocesamiento de variables categoricas
# pasos: imputacion simple por moda(clase más frecuente), codificación one-hot 
cate_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))


# crear ColumnTransformer con pipelines de preprocesamiento de variables
# variables numericas: num_pipeline
# variables categoricas: cate_pipeline
preprocessing = ColumnTransformer([
    ("num", num_pipeline, numeric_columns),
    ("cat", cate_pipeline, category_columns),
])





## Transformador de Columnas

In [None]:
# crear ColumnTransformer
# pipeline1:   num_pipeline - imputacion simple por mediana, escalamiento estandar
# pipeline2: cate_pipeline - imputacion simple por moda(clase más frecuente), codificación one-hot 
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cate_pipeline, make_column_selector(dtype_include=object)),
)

# transformar base de datos con ColumnTransformer
data_preprocess1 = preprocessing.fit_transform(data)


# extra code – shows that we can get a DataFrame out if we want
df_preprocess1 = pd.DataFrame(
    data_preprocess1,
    columns=preprocessing.get_feature_names_out(),
    index=data.index)
df_data_preprocess1.head(2)


# NOTA

In [5]:
numeric_features = ["Age", "Fare"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

## Variables categóricas

In [6]:
categorical_features = ["Pclass", "Sex", "Embarked"]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

## Variables de texto

In [7]:
text_features = ["Name"]
text_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('tfidf', TfidfVectorizer())
])

# Pipeline de Preprocesamiento

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('text', text_transformer, text_features)
    ])

# Entrenar Pipeline de Preprocesamiento

In [12]:
# Apply preprocessing to the training data
x_train_preprocessed = preprocessor.fit_transform(x_train)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'