### Lander: Ejemplos Pipelines

In [3]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import numpy as np

### Inconsistent preprocessing

In [4]:
# Synthetic single-feature regression problem; one seed drives both the data
# generation and the train/test split.
random_state = 42
X, y = make_regression(n_features=1, noise=1, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=random_state
)

# The scaler is a pipeline step, so it is fitted on the training split only and
# the same fitted scaling is applied again when predicting on the test split.
model = make_pipeline(StandardScaler(), LinearRegression()).fit(X_train, y_train)

# Test-set mean squared error (displayed as the cell output).
mean_squared_error(y_test, model.predict(X_test))

0.9027975466369481

### Data Leakage


In [5]:
n_samples = 200
n_features = 10000
n_classes = 2

# Features and labels are drawn independently from the same seeded generator,
# so no feature carries real information about the target.
rng = np.random.RandomState(42)
X = rng.standard_normal(size=(n_samples, n_features))
y = rng.choice(n_classes, size=n_samples)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Feature selection is a pipeline step, so it is fitted on the training split
# only when the pipeline is fitted below.
pipeline = make_pipeline(
    SelectKBest(k=25),
    GradientBoostingClassifier(random_state=1),
)

pipeline.fit(X_train, y_train)


In [6]:
# Predict on the held-out split with the fitted pipeline, then score it.
y_pred = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.46

### Ejemplo completo


In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Example dataset: five ad-click records. "Age" is stored as free text
# (number plus a unit word), which is why it needs cleaning before use.
data = {
    "Daily Time Spent on Site": [68.95, 80.23, 69.47, 74.15, 68.37],
    "Age": ["35 years", "31 years", "26 years", "29 years", "35 años"],
    "Area Income": [61833.90, 68441.85, 59785.94, 54806.18, 73889.99],
    "Daily Internet Usage": [256.09, 193.77, 236.50, 245.89, 225.58],
    "Ad Topic Line": [
        "Cloned 5thgeneration orchestration",
        "Monitored national standardization",
        "Organic bottom-line service-desk",
        "Triple-buffered reciprocal time-frame",
        "Robust logistical utilization",
    ],
    "City": [
        "Wrightburgh",
        "West Jodi",
        "Davidton",
        "West Terrifurt",
        "South Manuel",
    ],
    "Male": [0, 1, 0, 1, 0],
    "Country": ["Tunisia", "Nauru", "San Marino", "Italy", "Iceland"],
    "Timestamp": [
        "2016-03-27 00:53:11",
        "2016-04-04 01:39:02",
        "2016-03-13 20:35:42",
        "2016-01-10 02:31:19",
        "2016-06-03 03:36:18",
    ],
    "Clicked on Ad": [0, 0, 0, 0, 0],
}

df = pd.DataFrame(data=data)

# Función para limpiar la columna "Age"
def clean_age_column(age_series):
    """Strip every non-digit character from age strings and cast to int.

    Parameters
    ----------
    age_series : pandas.Series or pandas.DataFrame
        String values such as ``'35 years'``. A DataFrame is accepted as
        well because selecting with double brackets (``df[['Age']]``)
        yields a one-column DataFrame, which has no ``.str`` accessor.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Same container type as the input, with integer values.

    Raises
    ------
    ValueError
        If a value contains no digits at all (the empty string left after
        stripping cannot be cast to int).
    """
    def _digits_to_int(column):
        # Remove every run of non-digits, then cast what is left to int.
        return column.str.replace(r'\D+', '', regex=True).astype(int)

    # 2-D input (DataFrame): clean column by column, since ``.str`` only
    # exists on Series.
    if getattr(age_series, 'ndim', 1) == 2:
        return age_series.apply(_digits_to_int)
    return _digits_to_int(age_series)

# Wrap the cleaning function so it can be used as a scikit-learn transformer.
age_cleaner = FunctionTransformer(func=clean_age_column)

# Single-step pipeline containing only the age cleaner.
pipeline = Pipeline(steps=[('clean_age', age_cleaner)])

# Run the pipeline on the "Age" column and write the result back.
# NOTE(review): df[['Age']] (double brackets) is a one-column DataFrame, not a
# Series, so clean_age_column must be able to handle 2-D input.
df['Age'] = pipeline.fit_transform(df[['Age']])

# Show the result.
print(df)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

# Example dataset.
# The pipeline defined later in this cell oversamples with SMOTE (default
# k_neighbors=5) and is tuned with 5-fold cross-validation, so a handful of
# rows is not enough: every training fold needs more than five minority-class
# samples. Build a reproducible synthetic dataset with the same schema
# (numeric Age with missing values, numeric Income, categorical Gender,
# imbalanced binary target).
N_SAMPLES = 200
data_rng = np.random.RandomState(42)

age = data_rng.randint(18, 70, size=N_SAMPLES).astype(float)
# Blank out a few ages so the numeric imputer has something to fill in.
age[data_rng.choice(N_SAMPLES, size=10, replace=False)] = np.nan

data = {
    'Age': age,
    'Income': data_rng.randint(30000, 130000, size=N_SAMPLES),
    'Gender': data_rng.choice(['Male', 'Female'], size=N_SAMPLES),
    # Roughly 25% positives: imbalanced, but with plenty of minority samples.
    'Clicked on Ad': data_rng.choice([0, 1], size=N_SAMPLES, p=[0.75, 0.25]),
}
df = pd.DataFrame(data)

# Separate features and target.
X = df.drop('Clicked on Ad', axis=1)
y = df['Clicked on Ad']

# Split into training and test sets. Stratify on the target so both splits
# keep the same class proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_features = ['Age', 'Income']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform).
categorical_features = ['Gender']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Define the modelling pipeline.
# SMOTE is a sampler (it implements fit_resample, not transform), and
# scikit-learn's Pipeline rejects intermediate steps that are not transformers.
# imbalanced-learn's Pipeline accepts samplers and applies them only while
# fitting, so it is used for the outer pipeline.
from imblearn.pipeline import Pipeline as ImbPipeline

pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=2)),
    ('smote', SMOTE(random_state=42)),
    # Placeholder estimator: the grid search replaces it through the
    # 'classifier' key of the parameter grid.
    ('classifier', RandomForestClassifier(random_state=42))
])

# Hyperparameter search space: one sub-grid per candidate classifier. Each
# sub-grid swaps the pipeline's 'classifier' step and tunes that model's own
# parameters.
param_grid = [
    # Logistic regression: regularization strength.
    dict(
        classifier=[LogisticRegression(max_iter=1000, random_state=42)],
        classifier__C=[0.01, 0.1, 1, 10],
    ),
    # Random forest: number of trees and maximum depth.
    dict(
        classifier=[RandomForestClassifier(random_state=42)],
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20],
    ),
    # Support vector classifier: regularization strength and kernel.
    dict(
        classifier=[SVC(random_state=42)],
        classifier__C=[0.1, 1, 10],
        classifier__kernel=['linear', 'rbf'],
    ),
]

# Grid search over param_grid with 5-fold cross-validation, scored by
# accuracy, with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the hyperparameter search on the training split.
grid_search.fit(X_train, y_train)

# Show the best pipeline found and its hyperparameters.
print("Mejor modelo encontrado:")
print(grid_search.best_estimator_)

# Score the best pipeline on the held-out test split.
accuracy = grid_search.score(X_test, y_test)
print("Accuracy en el conjunto de prueba:", accuracy)
