In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the Titanic passenger CSV (it includes the 'Embarked' column) straight from its GitHub URL
titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(filepath_or_buffer=titanic_url)

In [3]:
# Display the freshly loaded frame to inspect its columns before any values are blanked out
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Knock out values at random so there is something to impute:
# roughly 20% of 'Age' and 10% of 'Embarked' become NaN.
# The seed is fixed so the same rows are blanked on every run.
np.random.seed(42)
n_rows = len(titanic)
age_is_blanked = np.random.rand(n_rows) < 0.2        # first draw  -> Age
embarked_is_blanked = np.random.rand(n_rows) < 0.1   # second draw -> Embarked
titanic['Age'] = titanic['Age'].mask(age_is_blanked)
titanic['Embarked'] = titanic['Embarked'].mask(embarked_is_blanked)

In [5]:
# Display the frame again so the NaNs injected into 'Age' and 'Embarked' are visible
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,


In [6]:
# Columns grouped by the kind of imputation they will receive
numeric_features = [
    "Age",
    "Fare",
]
categorical_features = [
    "Embarked",
    "Pclass",
]

In [7]:
# One imputer per feature group: numeric gaps are filled with the column mean,
# categorical gaps with the most frequent value in the column.
numeric_transformer = SimpleImputer(strategy="mean")
categorical_transformer = SimpleImputer(strategy="most_frequent")

In [23]:
# Route each feature group to its own imputer. The output columns follow the order
# of this list: the numeric features first, then the categorical ones.
preprocessor = ColumnTransformer([
    ("numeric", numeric_transformer, numeric_features),
    ("categorical", categorical_transformer, categorical_features),
])

In [24]:
# Wrap the preprocessor in a single-step pipeline
pipeline = Pipeline([("preprocessor", preprocessor)])

In [25]:
# Fit the imputers on the frame and fill its missing values in a single call
titanic_imputed = pipeline.fit_transform(X=titanic)

In [26]:
# Convert the imputed data back to a labelled DataFrame.
# By default the transformer output carries no column labels, and because it mixes
# floats with strings every value is stored as a generic Python object. So:
#   * re-attach the column names (transformer order: numeric first, then categorical),
#   * re-attach the original row index so rows still line up with `titanic`,
#   * call infer_objects() so 'Age'/'Fare' become float and 'Pclass' int again,
#     instead of every column being left as dtype=object.
titanic_imputed_df = pd.DataFrame(
    titanic_imputed,
    columns=numeric_features + categorical_features,
    index=titanic.index,
).infer_objects()

In [27]:
# Display the imputed frame ('p' was never defined and raises NameError on a fresh kernel)
titanic_imputed_df

Unnamed: 0,Age,Fare,Embarked,Pclass
0,22.0,7.25,S,3
1,38.0,71.2833,C,1
2,26.0,7.925,S,3
3,35.0,53.1,S,1
4,29.5044,8.05,S,3
...,...,...,...,...
886,27.0,13.0,S,2
887,19.0,30.0,S,1
888,29.5044,23.45,S,3
889,26.0,30.0,S,1
