Objetivo: Enseñar a los estudiantes a manejar valores faltantes y transformar datos categóricos y numéricos usando Scikit-learn.



Instrucciones

Carga del Dataset:
Utilizar el dataset Titanic de Kaggle o OpenML.
Tareas:
Manejar valores faltantes en las columnas Age, Embarked, y Fare.
Codificar las variables categóricas Sex y Embarked utilizando One-Hot Encoding.
Escalar las características numéricas Age y Fare.

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [None]:
# Import Colab's Drive helper so files stored in Google Drive can be read.
from google.colab import drive
# Mount Google Drive at /content/drive inside the Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the Titanic CSV inside the mounted Google Drive.
path = "/content/drive/MyDrive/CLASE MACHINE LEARNING/DATA/Titanic-Dataset.csv"
# Read the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Normalize column names: trim surrounding whitespace and lowercase them, so
# later cells can refer to columns as e.g. 'age' or 'embarked'.
df.columns = df.columns.str.strip().str.lower()

In [None]:
df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [None]:
# Count the fully duplicated rows in the dataset.
df.duplicated().sum()

np.int64(0)

In [None]:
df[df["age"].isna()]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [None]:
df[["sex","cabin"]]

Unnamed: 0,sex,cabin
0,male,
1,female,C85
2,female,
3,female,C123
4,male,
...,...,...
886,male,
887,female,B42
888,female,
889,male,C148


In [None]:
# Fill missing 'cabin' values with the most frequent cabin observed among
# passengers of the same sex ('male' and 'female' are handled separately).
# NOTE(review): this runs on the whole frame before the train/test split, and
# 'cabin' is not among the feature columns selected later for the model.
cabin_missing = df["cabin"].isna()
for sex_label in ("male", "female"):
    is_sex = df["sex"] == sex_label
    # mode() ignores NaN; [0] takes the first of the most frequent values.
    most_common_cabin = df.loc[is_sex, "cabin"].mode()[0]
    df.loc[is_sex & cabin_missing, "cabin"] = most_common_cabin


In [None]:
df[df["cabin"].isna()]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked


In [None]:
# Show the rows where 'embarked' is missing.
df[df["embarked"].isna()]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [None]:
# Replace the missing 'embarked' entries with the column's most frequent value.
mode_embarked = df["embarked"].mode().iloc[0]
df["embarked"] = df["embarked"].fillna(value=mode_embarked)

In [None]:
df[df["cabin"] == 'B28']

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        891 non-null    object 
 11  embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Para las columnas 'age' y 'fare', los valores nulos se imputarán con SimpleImputer

In [62]:
# Separate the target ('survived') from the remaining columns.
y = df["survived"]
X = df.drop(columns=["survived"])

In [63]:
# Feature columns, grouped by the kind of preprocessing they will receive.
cols_num = ["age", "fare"]                # treated as numeric
cols_cat = ["sex", "pclass", "embarked"]  # treated as categorical

In [64]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [65]:
# Per-group transformers.
# Numeric columns: fill missing values with the column mean, then standardize.
num_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
# Categorical columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
cat_transformer = OneHotEncoder(handle_unknown="ignore")


In [67]:
# Route each column group to its transformer. Columns of X that are not listed
# in either group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    [
        ("num", num_transformer, cols_num),
        ("cat", cat_transformer, cols_cat),
    ]
)

In [70]:
# Full pipeline: column preprocessing followed by a decision-tree classifier.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the reported accuracy, is identical on every run; without it
# the tree can differ between runs.
pipeline = Pipeline(steps=[
         ('preprocessor', preprocessor),
         ('Classifier', DecisionTreeClassifier(random_state=42))
         ])

In [71]:
# Fit the whole pipeline (preprocessing + classifier) on the training data.
pipeline.fit(X_train, y_train)

In [72]:
# Predict on the test set using the fitted pipeline.
y_pred = pipeline.predict(X_test)

In [73]:
# Compute the model's accuracy on the test set and print it with two decimals.
# NOTE(review): the printed label has a typo ("Preción") and the metric is
# accuracy ("exactitud"), not precision; the text is left unchanged here so it
# keeps matching the recorded output.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Preción del Modelo : {:.2f}".format(accuracy))

Preción del Modelo : 0.78


#### Se obtiene una exactitud (accuracy) de 0.78 imputando las variables numéricas 'age' y 'fare' con SimpleImputer y codificando las categóricas con OneHotEncoder