In [None]:
import pandas as pd
import os
from pprint import PrettyPrinter

In [None]:
os.getcwd()

In [None]:
pp = PrettyPrinter()

# Primary data sample

In [None]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Apply manual feature engineering transformations.

    Derives calendar features from ``i_visit_start_time``, which is parsed as
    microseconds since the Unix epoch. The input frame is left untouched; the
    new columns are added to a copy.

    Args:
        df (pd.DataFrame): data frame with raw features; must contain the
            ``i_visit_start_time`` column

    Returns:
        pd.DataFrame: copy of ``df`` with ``c_weekday`` (Monday = 0) and
            ``c_visit_start_hour`` added

    Raises:
        KeyError: if ``i_visit_start_time`` is missing from ``df``
    """
    # Work on a copy: the caller may pass a slice of a larger frame, and
    # assigning columns to such a slice triggers SettingWithCopyWarning and
    # silently changes the caller's data.
    df = df.copy()

    # Parse the timestamp once and reuse it for both derived features.
    visit_start = pd.to_datetime(df["i_visit_start_time"], unit="us")

    df["c_weekday"] = visit_start.dt.weekday
    df["c_visit_start_hour"] = visit_start.dt.hour

    return df

In [None]:
df = pd.read_csv('../data/03_primary/df_train.csv')

In [None]:
# Draw 20 rows with a fixed random_state so the same rows come back on every
# run; the hand-written expected values further down rely on that.
dfs = df.sample(20, random_state=22)
dfs

## Engineer features

In [None]:
rows_to_select = list(range(3))
rows_to_select

In [None]:
columns_to_select = [
    "i_full_visitor_id",
    "i_visit_start_time"
]
columns_to_select

In [None]:
# Keep only the rows in rows_to_select and the columns in columns_to_select.
# The explicit .copy() makes df_sample an independent frame, so adding columns
# to it later does not raise SettingWithCopyWarning.
df_sample = dfs.iloc[rows_to_select][columns_to_select].copy()
df_sample

In [None]:
dict_sample = df_sample.to_dict(orient="list")

In [None]:
pp.pprint(dict_sample)

In [None]:
df_processed = engineer_features(df_sample)
df_processed

In [None]:
dict_processed = df_processed.to_dict(orient="list")

In [None]:
pp.pprint(dict_processed)

In [None]:
# Hand-written expected frame, columns listed alphabetically.
# NOTE(review): values look copied from the pprint of dict_processed - confirm.
df_expected = pd.DataFrame(
    dict(
        c_visit_start_hour=[0, 7, 11],
        c_weekday=[1, 3, 1],
        i_full_visitor_id=[3033910.355860057, 81793309.0616803, 4293031.296243032],
        i_visit_start_time=[
            1611619614341157.0,
            1611820704267587.0,
            1611661585573344.0,
        ],
    )
)

In [None]:
df_processed

In [None]:
df_expected

In [None]:
df_processed.equals(df_expected)

In [None]:
df_processed.sort_index(axis=1).equals(df_expected.sort_index(axis=1))

## Data preparation utils

In [None]:
# Toy frame for ensure_column_types. Going by the c_/n_ prefixes used for the
# cat_cols/num_cols lists further down, the "good" columns already hold values
# that match their prefix, while the "bad" ones do not: c_first_bad_colname
# holds integers and n_second_bad_colname holds numeric strings.
df = pd.DataFrame(
    {
        "c_first_good_colname": ["cat1", "cat2", "cat1"],
        "n_second_good_colname": [1, 3, 5],
        "c_first_bad_colname": [7, 8, 9],
        "n_second_bad_colname": ["3.0", "2.2", "5.6"],
    }
)

In [None]:
df

In [None]:
df.dtypes

In [None]:
from ga4_mlops.pipelines.data_preparation_utils import extract_column_names, ensure_column_types

In [None]:
_, num_cols, cat_cols, _ = extract_column_names(df)

In [None]:
df2 = ensure_column_types(df, num_cols, cat_cols)

In [None]:
df2

In [None]:
df2.dtypes

In [None]:
list(df2.dtypes)

In [None]:
df2.dtypes.to_list()

In [None]:
# Spell the column lists out by hand instead of taking them from
# extract_column_names, then run the same conversion on the same input frame.
num_cols = ["n_second_good_colname", "n_second_bad_colname"]
cat_cols = ["c_first_good_colname", "c_first_bad_colname"]
df3 = ensure_column_types(df, num_cols, cat_cols)

In [None]:
df3

In [None]:
from numpy import dtype

In [None]:
expected_types = [dtype('O'), dtype('float64'), dtype('O'), dtype('float64')]

In [None]:
expected_types

In [None]:
df3.dtypes.to_list()

In [None]:
df3["n_second_good_colname"].dtype, df3["c_first_good_colname"].dtype

## Imputation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Input fixture for the imputers: one column per scenario, with the column
# suffix naming the strategy it is meant for. n_num_col_complete has no gaps,
# and n_num_col_missing_not_listed has gaps but is not listed in
# imputation_strategies below.
imputation_sample = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_num_col_complete": [4, 2, 6, 1, 0],
        "n_num_col_missing_not_listed": [4.0, np.nan, 2.1, 3.2, np.nan],
        "n_num_col_missing_for_mean": [2, np.nan, 5, np.nan, 8],
        "n_num_col_missing_for_zero": [np.nan, 9.3, -1.2, np.nan, np.nan],
        "c_cat_col_for_mostfreq": ["cat1", np.nan, "cat2", "cat1", "cat1"],
        "c_cat_col_for_unknown": ["cat1", "cat2", np.nan, "cat3", "cat2"],
    }
)

In [None]:
imputation_sample

In [None]:
imputation_sample.iloc[1, 3]

In [None]:
imputation_sample.iloc[0, 4]

In [None]:
from ga4_mlops.pipelines.feature_engineering.nodes import fit_imputers, apply_imputers

In [None]:
# Hand-written expected result of imputing imputation_sample:
#   - mean:     gaps become 5.0, the mean of the non-missing values 2, 5, 8
#   - zero:     gaps become 0.0
#   - mostfreq: gaps become "cat1" (3 of the 4 non-missing values)
#   - unknown:  gaps become the "UNKNOWN" token
# n_num_col_missing_not_listed keeps its NaNs; the other columns are unchanged.
expected_df = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_num_col_complete": [4, 2, 6, 1, 0],
        "n_num_col_missing_not_listed": [4.0, np.nan, 2.1, 3.2, np.nan],
        "n_num_col_missing_for_mean": [2., 5., 5., 5., 8.],
        "n_num_col_missing_for_zero": [0.0, 9.3, -1.2, 0.0, 0.0],
        "c_cat_col_for_mostfreq": ["cat1", "cat1", "cat2", "cat1", "cat1"],
        "c_cat_col_for_unknown": ["cat1", "cat2", "UNKNOWN", "cat3", "cat2"],
    }
)

In [None]:
expected_df

In [None]:
# Maps each imputation strategy name to the columns it should be applied to.
imputation_strategies = dict(
    # numerical: replace NULLs with the column mean
    mean=["n_num_col_missing_for_mean"],
    # numerical: replace NULLs with zeros
    zero=["n_num_col_missing_for_zero"],
    # categorical: replace NULLs with the most frequent value
    mostfreq=["c_cat_col_for_mostfreq"],
    # categorical: replace NULLs with the UNKNOWN token
    unknown=["c_cat_col_for_unknown"],
)

In [None]:
imputers = fit_imputers(imputation_sample, imputation_strategies)

In [None]:
imputers

In [None]:
df = apply_imputers(imputation_sample, imputers)

In [None]:
df

In [None]:
df.dtypes

In [None]:
expected_df.dtypes

In [None]:
df.equals(expected_df)

## Encoding

In [None]:
import pandas as pd

In [None]:
# Input fixture for the encoders: an id column plus one string-valued
# categorical column per encoder type (the column suffix names the encoder).
encoding_sample = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "c_cat_col_for_binary": ["0", "0", "1", "0", "1"],
        "c_cat_col_for_onehot": ["cat1", "cat2", "cat3", "cat3", "cat2"],
        "c_cat_col_for_ordinal": ["cat3", "cat2", "cat1", "cat3", "cat2"],
    }
)

In [None]:
encoding_sample

In [None]:
# Maps each encoder type to the columns it should be applied to.
encoder_types = dict(
    # for binary variables
    binary=["c_cat_col_for_binary"],
    # one-hot encoding
    onehot=["c_cat_col_for_onehot"],
    # integer encoding (ordinal, but order does not matter)
    ordinal=["c_cat_col_for_ordinal"],
)

In [None]:
from ga4_mlops.pipelines.feature_engineering.nodes import fit_encoders, apply_encoders

In [None]:
encoders = fit_encoders(encoding_sample, encoder_types)

In [None]:
df = apply_encoders(encoding_sample, encoders)

In [None]:
df

In [None]:
df.to_dict(orient="list")

In [None]:
# Hand-written expected result of encoding encoding_sample:
#   - binary:  rows with "0" map to (_0=0, _1=1) and rows with "1" to (_0=1, _1=0)
#              NOTE(review): the _0/_1 suffixes look like bit positions rather
#              than category labels - confirm against fit_encoders.
#   - onehot:  one indicator column per category (cat1, cat2, cat3)
#   - ordinal: categories numbered in order of first appearance
#              (cat3 -> 1, cat2 -> 2, cat1 -> 3)
expected_df = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "c_cat_col_for_binary_0": [0, 0, 1, 0, 1],
        "c_cat_col_for_binary_1": [1, 1, 0, 1, 0],
        "c_cat_col_for_onehot_cat1": [1, 0, 0, 0, 0],
        "c_cat_col_for_onehot_cat2": [0, 1, 0, 0, 1],
        "c_cat_col_for_onehot_cat3": [0, 0, 1, 1, 0],
        "c_cat_col_for_ordinal": [1, 2, 3, 1, 2]
    }
)

In [None]:
df.equals(expected_df)

## Exclusion

In [None]:
import pandas as pd

In [None]:
# Input fixture for exclude_features, shaped like already-encoded data: an id
# column, columns to keep, and two features to drop. The first is a single
# column; the second is spread over three indicator columns (_cat1.._cat3),
# exactly one of which is set per row.
# The last key used to repeat "_cat2", which silently overwrote the previous
# entry and left the frame without a third indicator column.
exclusion_sample = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_col_to_use": [3.2, 2.1, 9.8, 0.3, 3.1],
        "c_col_to_use_0": [0, 1, 0, 0, 0],
        "c_col_to_use_1": [1, 0, 1, 1, 1],
        "c_first_col_to_exclude": [1, 3, 2, 1, 2],
        "c_second_col_to_exclude_cat1": [1, 1, 0, 0, 0],
        "c_second_col_to_exclude_cat2": [0, 0, 1, 1, 0],
        "c_second_col_to_exclude_cat3": [0, 0, 0, 0, 1],
    }
)

In [None]:
exclusion_sample

In [None]:
features_to_exclude = ["c_first_col_to_exclude", "c_second_col_to_exclude"]

In [None]:
from ga4_mlops.pipelines.feature_engineering.nodes import exclude_features

In [None]:
df = exclude_features(exclusion_sample, features_to_exclude)

In [None]:
df

In [None]:
# Hand-written expected result of the exclusion: only the id column and the
# *_to_use columns of exclusion_sample remain, with their values unchanged.
expected_df = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_col_to_use": [3.2, 2.1, 9.8, 0.3, 3.1],
        "c_col_to_use_0": [0, 1, 0, 0, 0],
        "c_col_to_use_1": [1, 0, 1, 1, 1],
    }
)

In [None]:
expected_df

In [None]:
df.equals(expected_df)

## Prediction

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Input fixtures for create_predictions: a five-row table plus two score
# arrays of five values each.
# NOTE(review): presumably scores line up with abt_sample rows by position - confirm.
abt_sample = pd.DataFrame(
    {
        "i_id": [1, 2, 3, 4, 5],
        "i_info_col": ["2022", "2021", "2022", "2022", "2021"],
        "n_num_col": [1.3, 5.6, 2.3, 7.6, 9.2],
        "c_cat_col": [1, 3, 3, 1, 2],
        "y_target_col": [1, 0, 0, 1, 0],
    }
)
raw_scores_sample = np.array([0.99, 0.02, 0.01, 0.98, 0.01])
calibrated_scores_sample = np.array([0.88, 0.42, 0.23, 0.84, 0.22])

In [None]:
abt_sample

In [None]:
raw_scores_sample, calibrated_scores_sample

In [None]:
from ga4_mlops.pipelines.prediction.nodes import create_predictions

In [None]:
# First variant: classify_on_calibrated=False.
# NOTE(review): presumably the 0.3 threshold is applied to the raw scores
# here - confirm against create_predictions. Only two raw scores (0.99 and
# 0.98) exceed 0.3.
df = create_predictions(
    abt_sample,
    raw_scores_sample,
    calibrated_scores_sample,
    threshold=0.3,
    classify_on_calibrated=False
)

In [None]:
df

In [None]:
# Second variant: classify_on_calibrated=True; this rebinds df.
# NOTE(review): presumably the 0.3 threshold is applied to the calibrated
# scores here - confirm against create_predictions. Three calibrated scores
# (0.88, 0.42, 0.84) exceed 0.3; the second row (raw 0.02, calibrated 0.42) is
# the only one where the two score arrays fall on different sides of 0.3.
df = create_predictions(
    abt_sample,
    raw_scores_sample,
    calibrated_scores_sample,
    threshold=0.3,
    classify_on_calibrated=True
)

In [None]:
df

In [None]:
df.to_dict(orient="list")

## Explanation - sampling

In [None]:
import pandas as pd

In [None]:
# Twenty-row input fixture for sample_data: ids 0..19, with the target set to
# 1 for the first five rows and 0 for the remaining fifteen.
explanation_sample = pd.DataFrame(
    {
        "i_id": [*range(20)],
        "y_target": [int(position < 5) for position in range(20)],
    }
)

In [None]:
explanation_sample

In [None]:
from ga4_mlops.pipelines.explanation.nodes import sample_data

In [None]:
df = sample_data(explanation_sample, n_obs=8, seed=42)

In [None]:
df

In [None]:
df.to_dict(orient="list")

## Ensure column types v2

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Input fixture for ensure_column_types covering missing-value cases. Column
# names record whether the stored values are expected to match the c_/n_
# prefix ("good") or not ("bad"), including columns that are entirely pd.NA
# or entirely NaN.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
wrong_column_types_sample = pd.DataFrame(
    {
        "c_first_good_coltype": ["cat1", "cat2", "cat1"],
        "n_second_good_coltype": [1, 3, 5],
        "c_third_good_coltype": ["cat2", pd.NA, "cat3"],
        "n_fourth_good_coltype": [4.2, np.nan, 2.9],
        "c_fifth_good_coltype": [pd.NA, pd.NA, pd.NA],
        "n_sixth_good_coltype": [np.nan, np.nan, np.nan],
        "c_first_bad_coltype": [7, 8, 9],
        "c_second_bad_coltype": [4.1, 2.3, 8.9],
        "n_third_bad_coltype": ["3.0", "2.2", "5.6"],
        "n_fourth_bad_coltype": [4.2, pd.NA, 2.9],
        "n_fifth_bad_coltype": [3, pd.NA, 2],
        "c_sixth_bad_coltype": [np.nan, np.nan, np.nan],
        "n_seventh_bad_coltype": [pd.NA, pd.NA, pd.NA],
    }
)

In [None]:
wrong_column_types_sample

In [None]:
wrong_column_types_sample.dtypes

In [None]:
from ga4_mlops.pipelines.data_preparation_utils import extract_column_names, ensure_column_types

In [None]:
_, num_cols, cat_cols, _ = extract_column_names(wrong_column_types_sample)

In [None]:
df = ensure_column_types(wrong_column_types_sample, num_cols, cat_cols)

In [None]:
df

In [None]:
df.dtypes

In [None]:
df["c_fifth_good_coltype"]

In [None]:
type(df["c_fifth_good_coltype"][0])

In [None]:
df["c_sixth_bad_coltype"]

In [None]:
type(df["c_sixth_bad_coltype"][0])

In [None]:
list(df.dtypes)