# Manipulación de los datos

In [58]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

from sklearn.datasets import fetch_openml

# Seed NumPy's legacy global RNG before any other work in this cell.
np.random.seed(42)

# Fetch the Titanic dataset (version 1) as a feature DataFrame and a target Series.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Remove three columns that are not used as features in this notebook.
X = X.drop(columns=['boat', 'body', 'home.dest'])

# 75/25 train/test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [59]:
# Share of missing values per column of the training split, as a percentage.
# The mean of the boolean NaN mask equals (number of NaNs) / (number of rows).
X_train.isna().mean() * 100

pclass       0.000000
name         0.000000
sex          0.000000
age         20.285423
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.101937
cabin       78.389399
embarked     0.000000
dtype: float64

In [60]:
# Remove 'cabin' (about 78% missing in the training split) from both splits.
# Each name is rebound to a new frame instead of dropping in place, and
# errors='ignore' is passed, so the cell can be re-run safely: a second
# in-place drop would raise KeyError because the column is already gone.
# It also avoids mutating the frames handed back by train_test_split.
X_train = X_train.drop(columns=['cabin'], errors='ignore')
X_test = X_test.drop(columns=['cabin'], errors='ignore')

In [61]:
def add_family_features(frame):
    """Return a new frame with family-based features added and raw columns removed.

    Adds ``family_size`` (``parch + sibsp + 1``) and ``is_alone`` (1 when
    ``family_size == 1``, otherwise 0), and drops ``parch``, ``sibsp``,
    ``name`` and ``ticket``. The input frame is never modified.

    Applying the function to an already-processed frame returns an equivalent
    frame, so the calling cell can be re-run without raising ``KeyError``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Either a raw frame containing ``parch`` and ``sibsp``, or a frame that
        already has ``family_size``.

    Returns
    -------
    pandas.DataFrame
        New frame with ``family_size`` and ``is_alone`` present and the four
        raw columns absent.

    Raises
    ------
    KeyError
        If the frame has neither both of ``parch``/``sibsp`` nor ``family_size``.
    """
    # Only derive family_size while its source columns are still present.
    if {'parch', 'sibsp'}.issubset(frame.columns):
        frame = frame.assign(family_size=frame['parch'] + frame['sibsp'] + 1)

    # errors='ignore' makes the drop a no-op for columns that are already gone.
    frame = frame.drop(columns=['parch', 'sibsp', 'name', 'ticket'], errors='ignore')

    return frame.assign(is_alone=(frame['family_size'] == 1).astype(int))


# Rebind rather than mutate in place, so the frames returned by
# train_test_split are left untouched.
X_train = add_family_features(X_train)
X_test = add_family_features(X_test)

# Cast the target labels to integers.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [62]:
# Preview the first five rows of the engineered training features.
X_train.head(n=5)

Unnamed: 0,pclass,sex,age,fare,embarked,family_size,is_alone
1216,3,female,,7.7333,Q,1,1
819,3,female,,7.75,Q,1,1
1286,3,female,38.0,7.2292,C,1,1
1280,3,male,22.0,7.8958,S,1,1
761,3,male,16.0,9.5,S,1,1


### Preprocessing data with Pipelines

In [63]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

# Numeric features: fill missing values with the column median, then scale
# with RobustScaler.
num_cols = ['age', 'fare', 'family_size']
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at transform
# time (handle_unknown='ignore').
cat_cols = ['sex', 'pclass', 'embarked', 'is_alone']
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols),
        ('num', num_transformer, num_cols),
    ]
)

# Full model: preprocessing followed by logistic regression.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # random_state is pinned so that stochastic solvers (such as 'saga', which
    # the grid search below tries) give repeatable fits. The default 'lbfgs'
    # solver does not use it, so the baseline score is unaffected.
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# Baseline: mean accuracy over 5-fold cross-validation on the training split.
cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()

np.float64(0.7899513104734279)

### Can the model improve?

In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Candidate hyperparameters for the pipeline's 'classifier' step.
grid_params = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'saga'],
    'classifier__penalty': ['l2'],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold CV,
# using all available cores.
grid_search = GridSearchCV(
    clf,
    grid_params,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fit the grid search on the training split.
grid_search.fit(X_train, y_train)

# Report the winning hyperparameters in alphabetical order.
best_params = grid_search.best_params_
print("Mejores parámetros:")
for param_name, param_value in sorted(best_params.items()):
    print(f" {param_name}: {param_value}")

# Predict on the held-out test split and print per-class metrics.
predictions = grid_search.predict(X_test)
print(classification_report(y_test, predictions))


Fitting 5 folds for each of 8 candidates, totalling 40 fits
Mejores parámetros:
 classifier__C: 1
 classifier__penalty: l2
 classifier__solver: lbfgs
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       203
           1       0.76      0.74      0.75       125

    accuracy                           0.81       328
   macro avg       0.80      0.80      0.80       328
weighted avg       0.81      0.81      0.81       328

