# Importación de dependencias

In [None]:
import pickle
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Descarga de datos

In [None]:
# Download the Titanic dataset (OpenML, version 1), requesting the features
# and the target as two separate pandas objects.
X, y = fetch_openml(
    name="titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)

In [None]:
# Preview the first five rows of the feature table.
X.head(n=5)

In [None]:
# Preview the first five target values.
y.head(n=5)

# Preprocesamiento (debe estar dentro del pipeline) 

In [None]:
# Numeric branch: fill missing values with the column mean (SimpleImputer's
# default strategy, spelled out here), then standardize the columns.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("ss", StandardScaler()),
])

# Column-wise preprocessing: one-hot encode the categorical columns and run
# the numeric branch on "age" and "fare".
# handle_unknown="ignore" makes the encoder emit an all-zero group for a
# category that was not present when it was fitted, instead of raising at
# predict time (e.g. on held-out rows or on new data given to the saved
# model).
ct = ColumnTransformer([
    (
        "cat",
        OneHotEncoder(handle_unknown="ignore"),
        ["pclass", "sex", "embarked"],
    ),
    ("num", num_pipeline, ["age", "fare"]),
])

# Creación del pipeline

In [None]:
# Full model: the column preprocessing step followed by a decision tree.
# The fixed seed makes the fitted tree, and therefore the reported accuracies
# and the serialized model, reproducible across runs, in line with the
# seeded train/test split used in this notebook.
pipeline = Pipeline([
    ("ct", ct),
    ("model", DecisionTreeClassifier(random_state=42)),
])

# Bare expression so the notebook renders the pipeline.
pipeline

# División train/test 

In [None]:
from random import randint
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Entrenamiento 

In [None]:
# Fit on the training split only. Fitting on the full (X, y) lets the model
# see the held-out rows, so the test accuracy computed afterwards would be a
# leaked, over-optimistic estimate.
pipeline.fit(X_train, y_train)

# Evaluación

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the training and the test split, in that order.
y_pred_train, y_pred_test = map(pipeline.predict, (X_train, X_test))

In [None]:
# Accuracy on the training split
accuracy_score(y_true=y_train, y_pred=y_pred_train)

In [None]:
# Accuracy on the test split
accuracy_score(y_true=y_test, y_pred=y_pred_test)

# Serialización 

Guardar el modelo en un fichero (pickle)

In [None]:
# Serialize the pipeline object to "titanic_model.pkl" in binary mode.
# NOTE: pickle files should only ever be loaded from trusted sources.
with open("titanic_model.pkl", mode="wb") as model_file:
    pickle.dump(pipeline, model_file)