## Titanic: preparación rápida

Pipeline mínimo sobre Titanic de Kaggle.

**Librerías**: pandas, scikit-learn.

In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


### Paso 1: Cargar datos

Descarga el dataset en: https://www.kaggle.com/datasets/brendan45774/test-file

In [4]:
# Usa el archivo de Kaggle (train_and_test2.csv). Cambia la ruta si es necesario.
# Ej: df = pd.read_csv('/path/to/train.csv')
df = pd.read_csv('/content/tested.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Paso 2: Objetivo y división train/test (estratificada)

In [5]:
y = df['Survived']
X = df.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape


((334, 11), (84, 11))

### Paso 3: Columnas numéricas y categóricas

In [6]:

num_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
num_cols, cat_cols


(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
 ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'])

### Paso 4: Imputación explícita (ejemplo)

In [7]:
X.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [9]:

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Ajuste en train
Xn_train = pd.DataFrame(num_imputer.fit_transform(X_train[num_cols]), columns=num_cols, index=X_train.index)
Xc_train = pd.DataFrame(cat_imputer.fit_transform(X_train[cat_cols]), columns=cat_cols, index=X_train.index)

# Transform en test
Xn_test  = pd.DataFrame(num_imputer.transform(X_test[num_cols]), columns=num_cols, index=X_test.index)
Xc_test  = pd.DataFrame(cat_imputer.transform(X_test[cat_cols]), columns=cat_cols, index=X_test.index)

# Combinar
Xi_train = pd.concat([Xn_train, Xc_train], axis=1)
Xi_test  = pd.concat([Xn_test,  Xc_test],  axis=1)

Xi_train.isna().mean().sort_values(ascending=False).head(5), Xi_test.isna().mean().max()


(PassengerId    0.0
 Pclass         0.0
 Age            0.0
 SibSp          0.0
 Parch          0.0
 dtype: float64,
 0.0)

### Paso 5: Estadísticos aprendidos

In [11]:

# Mapeos clave de imputación (muestras)
num_stats = pd.Series(num_imputer.statistics_, index=num_cols).head(8)
cat_stats = pd.Series(cat_imputer.statistics_, index=cat_cols).head(8)
num_stats, cat_stats


(PassengerId    1093.500
 Pclass            3.000
 Age              27.000
 SibSp             0.000
 Parch             0.000
 Fare             13.775
 dtype: float64,
 Name        Abbott, Master. Eugene Joseph
 Sex                                  male
 Ticket                             113503
 Cabin                                 A34
 Embarked                                S
 dtype: object)

### Paso 6: (Opcional) Preprocesador para modelar

In [12]:

pre_num = Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', StandardScaler())])
pre_cat = Pipeline([('imp', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num', pre_num, num_cols), ('cat', pre_cat, cat_cols)])

Xt_train = pre.fit_transform(X_train)
Xt_test  = pre.transform(X_test)
Xt_train.shape, Xt_test.shape


((334, 712), (84, 712))