In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")

from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OrdinalEncoder
from feature_engine.encoding import RareLabelEncoder
from feature_engine.selection import DropFeatures

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Built-in library
import itertools
import re
import json
import logging
import typing as tp


# pandas display settings: allow up to 1,000 rows/columns and 600 characters
# per cell before output is truncated
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Black code formatter (optional; needs the `lab_black` extension installed)
%load_ext lab_black
# Automatically reload imported modules before running each cell
%load_ext autoreload
%autoreload 2

In [None]:
# Helper Functions
def set_up_logger(delim: str = "::") -> tp.Any:
    """Configure basic INFO-level logging and return this module's logger.

    Params:
        delim (str): Separator placed between the level, the timestamp and
            the message in every log record.

    Returns:
        The logger registered under this module's ``__name__``.
    """
    fields = ("[%(levelname)s]", "%(asctime)s", "%(message)s")
    logging.basicConfig(level=logging.INFO, format=f" {delim} ".join(fields))
    return logging.getLogger(__name__)


def load_data(*, filepath: str) -> pd.DataFrame:
    """Read a CSV file into a dataframe and log its shape.

    Params:
        filepath (str): The filepath of the input data (CSV).

    Returns:
        df (pd.DataFrame): A DF containing the input data.
    """
    # Resolve the logger here instead of relying on a notebook-level `logger`
    # variable having been created first; `logging.getLogger(__name__)` is the
    # same object that `set_up_logger()` returns.
    log = logging.getLogger(__name__)
    df = pd.read_csv(filepath)
    log.info(f"Shape of df: {df.shape}\n")
    return df

In [None]:
# Configure logging first so the INFO message emitted while loading is shown
logger = set_up_logger()

# Load data
df = load_data(filepath="../data/titanic_train.csv")

# Preview the first rows
df.head()

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel surfaces missing modules early.
from src.data_manager import CastVariables

cast_feats = CastVariables(features=["Pclass", "SibSp", "Parch"])
# NOTE(review): the result is only displayed; `df` is not reassigned here, so
# unless CastVariables modifies its input in place the cast does not reach the
# cells below — TODO confirm which behaviour is intended.
cast_feats.fit_transform(df)

In [None]:
# Hold out part of the data as a test set
target = "Survived"
test_size, random_state = 0.1, 123


# Separate the label from the predictors
y = df[target]
X = df.drop(columns=target)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

X_train.shape, X_test.shape

In [None]:
# Split the columns into numerical and categorical variables.
# "number" matches every numeric dtype and excluding it yields the exact
# complement, so each column lands in exactly one of the two lists
# (non-numeric dtypes such as bool or datetime no longer count as numerical).
num_vars = [*X_train.select_dtypes(include="number").columns]
cat_vars = [*X_train.select_dtypes(exclude="number").columns]

num_vars, cat_vars

### Numerical Variables

In [None]:
# Numerical variables with at least one missing value in the training split
num_vars_wf_na = [var for var in num_vars if X_train[var].isna().any()]

# Share of missing values per variable, in percent
X_train[num_vars_wf_na].isna().mean() * 100

In [None]:
# Histogram of Age in the training split
ax = X_train["Age"].plot.hist()
ax.set_title("Distribution of Age")

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the numerical variables in the training split
X_train[num_vars].describe()

In [None]:
# Fill missing numerical values with the median learned from the training
# split (the object is named `mean_imputer`, but the method is the median).
# Only variables that have missing values in the training split are imputed.
mean_imputer = MeanMedianImputer(
    variables=num_vars_wf_na,
    imputation_method="median",
)
X_train = mean_imputer.fit_transform(X_train)
X_test = mean_imputer.transform(X_test)

# Verify: percentage of values still missing in each split
(
    X_train[num_vars_wf_na].isna().mean() * 100,
    X_test[num_vars_wf_na].isna().mean() * 100,
)

In [None]:
# Check cardinality: number of distinct values per numerical variable
for var, uniq_vals in X_train[num_vars].nunique().items():
    print(f"{var}: {uniq_vals} unique values")

In [None]:
# Determine the discrete variables: numerical columns with fewer than
# `thresh` distinct values in the training split.
thresh = 20
discrete_vars = [var for var in num_vars if X_train[var].nunique() < thresh]

# Keep the remaining variables in their original order; a round-trip through
# a set would make the order depend on string hashing and so vary between
# kernel sessions.
# NOTE: this cell rebinds `num_vars`, so running it a second time recomputes
# `discrete_vars` from the already-reduced list.
num_vars = [var for var in num_vars if var not in discrete_vars]

num_vars, discrete_vars

In [None]:
# Cast the discrete variables to strings in both splits
for frame in (X_train, X_test):
    frame[discrete_vars] = frame[discrete_vars].astype(str)


# Check for `rare` labels: share of each label in the training split, in percent
for var in discrete_vars:
    labels = X_train[var].value_counts(normalize=True) * 100
    print(f"{var}: \n{labels} unique values\n")

In [None]:
# Encode rare labels of the discrete variables (fit on the training split,
# then applied to the test split)
rare_label_enc = RareLabelEncoder(
    variables=discrete_vars,
    tol=0.05,
    n_categories=5,
)
X_train = rare_label_enc.fit_transform(X_train)
X_test = rare_label_enc.transform(X_test)

# Label shares (in percent) in the training split after encoding
result = {}
for var in discrete_vars:
    labels = X_train[var].value_counts(normalize=True) * 100
    result[var] = labels.to_dict()

result

In [None]:
# Label shares (in percent) of the discrete variables in the test split
result = {}
for var in discrete_vars:
    labels = X_test[var].value_counts(normalize=True) * 100
    result[var] = labels.to_dict()

result

In [None]:
# Apply a Yeo-Johnson transformation to the numerical variables
# (fitted on the training split, then applied to the test split).
# NOTE(review): `num_vars` still holds every non-discrete numerical column at
# this point — presumably including PassengerId, which is dropped further
# down; confirm whether transforming it here is intended.
yea_johnson_transf = YeoJohnsonTransformer(variables=num_vars)
X_train = yea_johnson_transf.fit_transform(X_train)
X_test = yea_johnson_transf.transform(X_test)

X_train.head(3)

### Categorical Variables

In [None]:
# Categorical variables with at least one missing value in the training split
cat_vars_wf_na = [var for var in cat_vars if X_train[var].isna().any()]

# Share of missing values per variable, in percent
X_train[cat_vars_wf_na].isna().mean() * 100

In [None]:
# Drop variable(s) from both splits
features_to_drop = ["Cabin", "Name", "Ticket", "PassengerId"]
drop_feats = DropFeatures(features_to_drop=features_to_drop)
X_train = drop_feats.fit_transform(X_train)
X_test = drop_feats.transform(X_test)

# Update the list of numerical variables (hard-coded)
num_vars = ["Age", "Fare"]

X_train.columns, X_test.columns

In [None]:
# Current list of categorical variables
cat_vars

In [None]:
# Remove the dropped features from the categorical list while keeping the
# remaining names in their original order; a round-trip through a set would
# make the order depend on string hashing and so vary between kernel sessions.
cat_vars = [var for var in cat_vars if var not in features_to_drop]

cat_vars, features_to_drop

In [None]:
# Fill missing categorical values using the "frequent" method, fitted on the
# training split and then applied to the test split
cat_vars_wf_na = ["Embarked"]
cat_imputer = CategoricalImputer(
    variables=cat_vars_wf_na,
    imputation_method="frequent",
)
X_train = cat_imputer.fit_transform(X_train)
X_test = cat_imputer.transform(X_test)

# Verify: percentage of values still missing in each split
(
    X_train[cat_vars_wf_na].isna().mean() * 100,
    X_test[cat_vars_wf_na].isna().mean() * 100,
)

In [None]:
# Dtypes and non-null counts of the categorical and discrete variables
X_train[cat_vars + discrete_vars].info()

# Categorical variables currently tracked
cat_vars

In [None]:
# Encode the categorical and discrete variables as ordinal numbers; the
# encoder is fitted on the training split together with the target.
cat_enc = OrdinalEncoder(
    variables=cat_vars + discrete_vars,
    encoding_method="ordered",
)
X_train = cat_enc.fit_transform(X_train, y_train)
X_test = cat_enc.transform(X_test)

# Verify
X_train[num_vars + cat_vars + discrete_vars].head(3)

In [None]:
# Standardise every column (fitted on the training split, then applied to the
# test split). The scaler receives the whole frame, so the encoded
# categorical/discrete columns are scaled as well.
# NOTE(review): with scikit-learn's default output setting these calls return
# NumPy arrays, so X_train / X_test lose their column labels from here on —
# confirm this is intended.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Verify
X_train[:5], X_test[:5]