# Manipulación de los datos

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the raw passenger table; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='titanic-dataset.csv')


### Preprocesamiento y EDA

In [2]:
# Per-column flag: True when the column contains at least one missing value.
data.isnull().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [3]:
# Remove the identifier / free-text columns and keep the result as its own copy,
# so later changes to df_clean never touch the raw `data` frame.
df_clean = data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
).copy()

In [4]:
# Feature engineering (no NA handling happens in this cell):
#   FamilySize = SibSp + Parch + 1   (the +1 counts the passenger)
#   IsAlone    = 1 when FamilySize == 1, else 0
# The two source columns are dropped once the derived features exist.
#
# The guard makes the cell safe to re-run: after the first execution SibSp and
# Parch are gone, and an unconditional drop/compute would raise KeyError.
if {'SibSp', 'Parch'}.issubset(df_clean.columns):
    df_clean = (
        df_clean
        .assign(
            FamilySize=lambda d: d['SibSp'] + d['Parch'] + 1,
            # Later keyword arguments of assign() may use columns created by
            # earlier ones, so FamilySize is available here.
            IsAlone=lambda d: (d['FamilySize'] == 1).astype(int),
        )
        .drop(columns=['SibSp', 'Parch'])
    )


In [5]:
# Preview the first five rows of the prepared frame.
df_clean.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone
0,0,3,male,34.5,7.8292,Q,1,1
1,1,3,female,47.0,7.0,S,2,0
2,0,2,male,62.0,9.6875,Q,1,1
3,0,3,male,27.0,8.6625,S,1,1
4,1,3,female,22.0,12.2875,S,3,0


In [9]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = df_clean['Survived']
X = df_clean.drop(columns=['Survived'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


### Pipelines de preprocesamiento

In [10]:
# Column groups: each list is routed to its own preprocessing branch by name.
num_cols = [
    'Pclass',
    'Age',
    'Fare',
    'FamilySize',
    'IsAlone',
]
cat_cols = [
    'Sex',
    'Embarked',
]

In [17]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Numeric branch: fill missing values with the column mean, then standardize
# each column.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace each category with an integer code.
# NOTE(review): this branch has no imputer, and OrdinalEncoder's default
# settings raise on categories that were not seen during fit -- confirm that
# both are acceptable for the columns routed here.
cat_pipe = Pipeline([
    ('ordinal-encoder', OrdinalEncoder()),
])

In [19]:
from sklearn.compose import ColumnTransformer

# Apply each preprocessing branch to its column group. Columns that appear in
# neither list are dropped, and n_jobs=-1 lets the branches run on all
# available processors.
col_transformer = ColumnTransformer(
    [
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ],
    n_jobs=-1,
    remainder='drop',
)

In [20]:
from sklearn.linear_model import LogisticRegression
# Instantiate the estimator. Without the parentheses `lr` would be bound to the
# LogisticRegression class itself, and calls such as lr.fit(X, y) would fail.
lr = LogisticRegression()