In [1]:
import pickle
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


# Pipeline generator
def make_pipeline(use_pca: bool = False, components: int = 0, num_var: "list | None" = None, cat_var: "list | None" = None):
    """Build an unfitted preprocessing pipeline for numeric and categorical columns.

    Numeric columns go through ``SimpleImputer`` -> ``StandardScaler`` (and
    optionally ``PCA``); categorical columns go through ``OneHotEncoder``.
    Both branches are combined in a ``ColumnTransformer`` that is wrapped in
    a one-step ``Pipeline`` named ``"ct"``.

    NOTE(review): this function has the same name as ``make_pipeline``
    imported from ``sklearn.pipeline`` and therefore shadows it for the rest
    of the notebook.

    Parameters
    ----------
    use_pca : bool
        When true, a ``"pca"`` step is appended to the numeric branch.
    components : int
        Value forwarded to ``PCA(n_components=...)``. A falsy value (the
        default ``0``) is treated as "not specified" and forwarded as
        ``None`` instead of asking PCA for zero components.
    num_var : list or None
        Column selection for the numeric branch; ``None`` means no columns.
    cat_var : list or None
        Column selection for the categorical branch; ``None`` means no columns.

    Returns
    -------
    Pipeline
        The unfitted pipeline containing the single ``"ct"`` step.
    """
    # Build a fresh list per call when no selection is given, so pipelines
    # created with the defaults never share one mutable list object.
    numeric_columns = num_var if num_var is not None else []
    categorical_columns = cat_var if cat_var is not None else []

    numeric_steps = [
        ("imputer", SimpleImputer()),
        ("ss", StandardScaler()),
    ]
    if use_pca:
        # 0 would request zero principal components; fall back to None.
        numeric_steps.append(("pca", PCA(n_components=components or None)))

    ct = ColumnTransformer([
        ("cat", OneHotEncoder(), categorical_columns),
        ("num", Pipeline(numeric_steps), numeric_columns),
    ])

    return Pipeline([("ct", ct)])

def fit(pipeline: Pipeline, X, y):
    """Fit ``pipeline`` on ``X`` and ``y`` and return the result of ``pipeline.fit``."""
    return pipeline.fit(X, y)

def export(pipeline: Pipeline, file: str):
    """Pickle ``pipeline`` into the file at path ``file``.

    The target is opened in binary write mode, so any existing content at
    that path is replaced.
    """
    with open(file, mode="wb") as sink:
        pickle.dump(pipeline, sink)


# Build the preprocessing pipeline.
# NOTE(review): the call previously passed components=0 together with
# use_pca=True, i.e. it asked PCA for zero components. 2 is a placeholder
# between 1 and the number of numeric columns (4) -- confirm the intended value.
generated_pipeline = make_pipeline(
    use_pca=True,
    components=2,
    num_var=["age", "fare", "sibsp", "parch"],
    cat_var=["pclass", "sex", "embarked"],
)


# Load the dataset
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Preprocessing: rename via assignment instead of mutating the frame in place
X = X.rename(columns={"home.dest": "homedest"})

# Train/test split
seed = 1234
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Training
generated_pipeline = fit(generated_pipeline, X_train, y_train)

# Export the fitted pipeline
export(generated_pipeline, "generated_pipeline.pkl")

In [32]:
# Example: chain the generated preprocessing pipeline with a classifier
pipeline = Pipeline([
        ("generated_pipeline", generated_pipeline),
        # Fixed random_state so re-running the cell gives the same tree
        ("model", DecisionTreeClassifier(random_state=seed))
    ])

# Fit on the training split only; the test split is kept for prediction
pipeline.fit(X_train, y_train)

pipeline.predict(X_test.head())

array(['0', '0', '0', '0', '0'], dtype=object)