In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing and Models
#
# This notebook preprocesses the Titanic dataset and tries several
# machine-learning models to predict survival. This step is part of the
# project's pipeline.

# Default size, in inches, for every figure drawn after this point.
plt.rcParams.update({'figure.figsize': (8, 4)})

# Load the raw table; the path is relative to the kernel's working directory.
df = pd.read_csv('data/raw/titanic.csv')

# Last expression of the cell, so the first rows are rendered as its output.
df.head()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Columns fed to the models, grouped by the preprocessing they receive.
num_features = ["Age", "SibSp", "Parch", "Fare"]
cat_features = ["Sex", "Embarked"]

# Numeric columns: missing values are replaced by the column median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: missing values are replaced by the most frequent
# category, then the column is one-hot encoded. handle_unknown="ignore"
# means a category not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its own transformer. `remainder` is left at
# its default, so columns not named above are not passed on to the model.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
])

# Target vector and feature table. X keeps every non-target column of df;
# the column selection happens inside `preprocessor`.
y = df["Survived"]
X = df.drop(columns="Survived")


In [None]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

# Logistic-regression baseline.
model_lr = Pipeline(steps=[
    # Pipeline does not copy its steps, so passing `preprocessor` itself would
    # make every pipeline built from it share (and refit) the same
    # ColumnTransformer instance. clone() gives this model its own unfitted copy.
    ("preprocessor", clone(preprocessor)),
    # The numeric features are not scaled, so the solver can need more than
    # the default 100 iterations to converge.
    ("classifier", LogisticRegression(max_iter=1000)),
])

# Out-of-fold predictions from 5-fold cross-validation: every row is predicted
# by a model that was not trained on it. Predicting on the same rows used for
# fitting would report training accuracy, which overstates how well the model
# generalises. y_pred_lr still has one prediction per row of X.
y_pred_lr = cross_val_predict(model_lr, X, y, cv=5)
accuracy_lr = accuracy_score(y, y_pred_lr)

# cross_val_predict fits clones and leaves `model_lr` itself unfitted, so fit
# it on the full dataset to keep a usable model for later cells.
model_lr.fit(X, y)

# Cross-validated accuracy, shown as the cell output.
accuracy_lr


In [None]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Random-forest model using the same preprocessing recipe.
model_rf = Pipeline(steps=[
    # Pipeline does not copy its steps, so passing `preprocessor` itself would
    # make every pipeline built from it share (and refit) the same
    # ColumnTransformer instance. clone() gives this model its own unfitted copy.
    ("preprocessor", clone(preprocessor)),
    # Fixed seed (arbitrary value) so the forest, and therefore the reported
    # accuracy, is the same on every Restart & Run All.
    ("classifier", RandomForestClassifier(random_state=42)),
])

# Out-of-fold predictions from 5-fold cross-validation: every row is predicted
# by a model that was not trained on it. A random forest scored on its own
# training rows reports a near-perfect accuracy that says little about unseen
# data. y_pred_rf still has one prediction per row of X.
y_pred_rf = cross_val_predict(model_rf, X, y, cv=5)
accuracy_rf = accuracy_score(y, y_pred_rf)

# cross_val_predict fits clones and leaves `model_rf` itself unfitted, so fit
# it on the full dataset to keep a usable model for later cells.
model_rf.fit(X, y)

# Cross-validated accuracy, shown as the cell output.
accuracy_rf


In [None]:
## Conclusion

# Several models were tried in this notebook.
# The accuracy obtained by each one is printed below. The values are read from
# the variables computed in the model cells, because a "{name}" placeholder
# written inside a comment is never filled in.
print(f"Logistic Regression: {accuracy_lr:.3f}")
print(f"Random Forest: {accuracy_rf:.3f}")

# Future work: hyperparameter tuning or more advanced models could improve
# these results.
