In [None]:
import pandas as pd
import seaborn as sns

# Import Pipeline from scikit-learn to create a sequence of data processing steps
from sklearn.pipeline import Pipeline

# Import ColumnTransformer to apply different preprocessing steps to different columns
from sklearn.compose import ColumnTransformer

# Import StandardScaler to standardize numerical features (scale them to mean=0, std=1)
# Import OneHotEncoder to convert categorical variables into a one-hot encoded format
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Import SimpleImputer to handle missing values by imputing them (e.g., with mean, median, or most frequent)
from sklearn.impute import SimpleImputer

# Import LogisticRegression for building a logistic regression model (used for classification tasks)
from sklearn.linear_model import LogisticRegression

In [None]:
def load_data():
    """Load the "titanic" example dataset through seaborn.

    Returns:
        The object returned by ``sns.load_dataset("titanic")``; the rest of
        this notebook treats it as a pandas DataFrame.
    """
    titanic = sns.load_dataset("titanic")
    return titanic

In [None]:
# Build a preprocessing pipeline for numerical and categorical features
def build_pipeline(num_features=None, cat_features=None):
    """Build the column-wise preprocessing transformer.

    Args:
        num_features: Names of the numerical columns to mean-impute and
            standardize. Defaults to ``["age", "fare"]``.
        cat_features: Names of the categorical columns to impute with the most
            frequent value and one-hot encode. Defaults to
            ``["sex", "class", "embarked"]``.

    Returns:
        An unfitted ``ColumnTransformer`` with a "num" branch and a "cat" branch.
    """
    # None sentinels avoid mutable default arguments; list() takes a copy so
    # the transformer never aliases a list owned by the caller.
    num_features = ["age", "fare"] if num_features is None else list(num_features)
    cat_features = (
        ["sex", "class", "embarked"] if cat_features is None else list(cat_features)
    )

    # Numerical branch: impute missing values with the mean, then scale
    num_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])

    # Categorical branch: impute with the most frequent value, then one-hot
    # encode; handle_unknown="ignore" is passed so categories unseen at fit
    # time do not raise at transform time
    cat_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Route each column list to its own branch
    preprocessor = ColumnTransformer([
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ])

    return preprocessor

In [None]:
# Process the DataFrame by dropping columns, splitting features/target, and applying preprocessing
def process_data(df, preprocessor):
    """Split a frame into transformed features and the 'survived' target.

    Args:
        df: DataFrame containing a 'survived' column plus feature columns.
            The caller's frame is not modified.
        preprocessor: Object exposing ``fit_transform(X)``; it is fitted on
            the feature frame passed to it here.

    Returns:
        Tuple ``(X_transformed, y)``: the output of
        ``preprocessor.fit_transform`` and the 'survived' Series for the rows
        that were kept.

    Raises:
        KeyError: If ``df`` has no 'survived' column.
    """
    # Drop columns that are not used as features. errors="ignore" lets the
    # function accept frames where some of these columns are already absent.
    df = df.drop(columns=["deck", "embark_town", "alive"], errors="ignore")

    # Rows with an unknown outcome are removed instead of being labelled 0:
    # filling the target would invent "did not survive" labels.
    df = df.dropna(subset=["survived"])

    # Split features (X) and target (y); 'survived' is the target variable
    X = df.drop(columns="survived")
    y = df["survived"]

    # Fit the preprocessor on the features and transform them in one step
    X_transformed = preprocessor.fit_transform(X)

    return X_transformed, y

In [None]:
# Manual imputation with plain pandas, independent of the preprocessing pipeline.
# The frame is loaded here so this cell does not depend on a `df` created elsewhere.
titanic_raw = load_data()

# fillna with a {column: value} mapping returns a new frame, avoiding the chained
# `df[col].fillna(..., inplace=True)` form that pandas warns will stop working in 3.0.
titanic_imputed = titanic_raw.fillna({
    "age": titanic_raw["age"].mean(),               # numerical: mean
    "embarked": titanic_raw["embarked"].mode()[0],  # categorical: most frequent value
})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["age"].fillna(df["age"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["embarked"].fillna(df["embarked"].mode()[0], inplace=True)


In [None]:
def save_data(X, y, path="titanic_cleaned.csv"):
    """Write the transformed features plus the target column to a CSV file.

    Args:
        X: 2-D feature matrix accepted by ``pd.DataFrame`` (e.g. a NumPy
            array). Objects exposing ``toarray()`` (such as SciPy sparse
            matrices) are converted to a dense array first.
        y: Target values exposing ``.values`` (e.g. a pandas Series), one per
            row of ``X``.
        path: Destination CSV path. Defaults to "titanic_cleaned.csv".
    """
    # A ColumnTransformer may return a SciPy sparse matrix when its one-hot
    # output is mostly zeros; densify so every feature becomes its own column.
    if hasattr(X, "toarray"):
        X = X.toarray()

    df_processed = pd.DataFrame(X)
    # .values assigns the labels by position, ignoring y's original index
    df_processed["survived"] = y.values
    df_processed.to_csv(path, index=False)
    print("Data saved successfully!")


In [None]:
# End-to-end run: load -> build preprocessor -> fit/transform -> write CSV
df = load_data()
preprocessor = build_pipeline()
X_transformed, y = process_data(df=df, preprocessor=preprocessor)
save_data(X=X_transformed, y=y)

Data saved successfully!


In [None]:
# Print the plain-text form of the ColumnTransformer, including its nested steps
print(preprocessor)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'fare']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['sex', 'class', 'embarked'])])


In [None]:
# pandas is already imported in the first cell; the import is repeated here unchanged
import pandas as pd

# Wrap the transformed feature matrix in a DataFrame so it can be inspected
df_transformed = pd.DataFrame(data=X_transformed)

# Preview the first 5 rows
df_transformed.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.592481,-0.502445,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.638789,0.786845,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-0.284663,-0.488854,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.407926,0.42073,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.407926,-0.486337,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
# Read the saved CSV back to check what was written
df_cleaned = pd.read_csv(filepath_or_buffer="titanic_cleaned.csv")
df_cleaned.head(n=5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,survived
0,-0.592481,-0.502445,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
1,0.638789,0.786845,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
2,-0.284663,-0.488854,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,0.407926,0.42073,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,0.407926,-0.486337,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [None]:
# Ask scikit-learn to use its "diagram" display mode for estimators
from sklearn import set_config
set_config(display="diagram")


In [None]:
# Bare expression: the notebook displays the preprocessor (display="diagram" was set in the previous cell)
preprocessor

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Chain the preprocessor and a default-configured LogisticRegression into one
# estimator; nothing is fitted in this cell
pipe = make_pipeline(preprocessor, LogisticRegression())

In [None]:
# Bare expression: the notebook displays the combined pipeline
pipe