# Manipulación de los datos

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the raw dataset into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
data = pd.read_csv('titanic-dataset.csv')


### Preprocesamiento y EDA

In [5]:
# Per-column flag: True when the column contains at least one missing value.
data.isna().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [7]:
# Columns excluded from the feature set; the result is an independent copy so
# later edits never touch the raw `data` frame.
unused_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
df_clean = data.drop(columns=unused_cols).copy()

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Feature engineering (no NA handling happens here): combine the 'SibSp' and
# 'Parch' counts into a single 'FamilySize' feature plus an 'IsAlone' flag,
# then drop the two source columns.
# The guard keeps the cell re-runnable: once 'SibSp'/'Parch' have been
# consumed, running the cell again is a no-op instead of raising KeyError.
if {'SibSp', 'Parch'}.issubset(df_clean.columns):
    df_clean = (
        df_clean
        .assign(
            # +1 presumably accounts for the passenger themselves
            FamilySize=lambda d: d['SibSp'] + d['Parch'] + 1,
            # 1 when FamilySize == 1, otherwise 0
            IsAlone=lambda d: (d['FamilySize'] == 1).astype(int),
        )
        .drop(columns=['SibSp', 'Parch'])
    )


In [9]:
# Preview the first rows of the working frame.
df_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize,IsAlone
0,0,3,male,34.5,7.8292,Q,1,1
1,1,3,female,47.0,7.0,S,2,0
2,0,2,male,62.0,9.6875,Q,1,1
3,0,3,male,27.0,8.6625,S,1,1
4,1,3,female,22.0,12.2875,S,3,0


In [13]:
# Separate the label column from the predictor columns.
target_col = 'Survived'
y = df_clean[target_col]
X = df_clean.drop(columns=[target_col])

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=22,
)

### Pipeline de preprocesamiento en código

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Column groups routed to each preprocessing branch. Every column of X has to
# appear in one of these lists: ColumnTransformer defaults to remainder='drop',
# so any column not named here is silently discarded before the model sees it.
numeric_cols = ['Age', 'Fare', 'FamilySize', 'IsAlone']
# 'Pclass' holds small integer class codes; it is one-hot encoded with the
# other categoricals rather than being treated as a magnitude. It was
# previously missing from both lists and therefore dropped.
categoric_cols = ['Sex', 'Embarked', 'Pclass']

# Numeric branch: fill gaps, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),     # replace NaN with the column mean
    ('scaler', StandardScaler())                     # zero mean / unit variance
])

# Categorical branch: fill gaps, then one-hot encode.
categoric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # replace NaN with the mode
    # drop='first' removes one level per feature to avoid redundant columns
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categoric_pipeline, categoric_cols)
])