# Manipulación de los datos

In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

from sklearn.datasets import fetch_openml

# Seed NumPy's legacy global RNG so NumPy-based randomness in later cells is repeatable.
np.random.seed(42)

# Fetch the OpenML "titanic" dataset (version 1) as pandas objects: features X, target y.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Remove the 'boat', 'body' and 'home.dest' columns from the feature frame, in place.
X.drop(columns=['boat', 'body', 'home.dest'], inplace=True)

# Hold out 25% of the rows as a test split; stratify on y so both splits keep
# the same class proportions, and fix random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [38]:
# Percentage of missing values per column in the training split:
# null count per column, divided by the number of rows, scaled to 0-100.
X_train.isna().sum().div(len(X_train)).mul(100)

pclass       0.000000
name         0.000000
sex          0.000000
age         20.285423
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.101937
cabin       78.389399
embarked     0.000000
dtype: float64

In [39]:
# 'cabin' is missing for roughly 78% of the training rows (see the percentages
# printed above), so the column is removed from both splits.
# errors='ignore' keeps this cell re-runnable: once 'cabin' is gone, executing
# the cell again is a no-op instead of raising KeyError.
X_train.drop(columns=['cabin'], errors='ignore', inplace=True)
X_test.drop(columns=['cabin'], errors='ignore', inplace=True)

In [40]:
# Feature engineering applied identically (and in place) to both splits:
#   family_size = parch + sibsp + 1   (the +1 counts the passenger themself)
#   is_alone    = 1 when family_size == 1, else 0
# The raw 'parch', 'sibsp', 'name' and 'ticket' columns are then discarded.
#
# The cell is written to be idempotent: family_size is only derived while it
# does not exist yet (i.e. while the raw columns are still present), and the
# drop ignores columns that are already gone, so re-running the cell does not
# raise KeyError.
for dataset in (X_train, X_test):
    if 'family_size' not in dataset.columns:
        dataset['family_size'] = dataset['parch'] + dataset['sibsp'] + 1
    dataset.drop(
        columns=['parch', 'sibsp', 'name', 'ticket'],
        errors='ignore',
        inplace=True,
    )
    dataset['is_alone'] = (dataset['family_size'] == 1).astype(int)

In [41]:
# Preview the first five rows of the engineered training features.
X_train.head(n=5)

Unnamed: 0,pclass,sex,age,fare,embarked,family_size,is_alone
1216,3,female,,7.7333,Q,1,1
819,3,female,,7.75,Q,1,1
1286,3,female,38.0,7.2292,C,1,1
1280,3,male,22.0,7.8958,S,1,1
761,3,male,16.0,9.5,S,1,1


### Preprocessing data with Pipelines

In [42]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

# Column groups: each group is routed to its own preprocessing pipeline.
cat_cols = ['sex', 'pclass', 'embarked', 'is_alone']
num_cols = ['age', 'fare', 'family_size']

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill gaps with the median, then apply RobustScaler.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Send each column group through its matching preprocessing pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_transformer, cat_cols),
    ('num', num_transformer, num_cols),
])

# Full model: preprocessing followed by a default LogisticRegression.
# Keeping the preprocessing inside the pipeline means it is fitted as part of
# the estimator each time cross-validation fits on a training fold.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# Baseline: mean accuracy over 5 cross-validation folds of the training split.
fold_accuracies = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
fold_accuracies.mean()

np.float64(0.7899513104734279)