In [48]:
import pandas as pd
import os
from pprint import PrettyPrinter

In [49]:
os.getcwd()

'/Users/pchaberski/projects/gid-ml-framework/ga4-mlops/notebooks'

In [50]:
pp = PrettyPrinter()

# Primary data sample

In [10]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Apply manual feature engineering transformations.

    Args:
        df (pd.DataFrame): data frame with raw features

    Returns:
        pd.DataFrame: data frame after feature engineering
    """
    # logger.info("Applying manual feature engineering transformations...")

    df["c_weekday"] = pd.to_datetime(df["i_visit_start_time"], unit="us").dt.weekday
    df["c_visit_start_hour"] = pd.to_datetime(
        df["i_visit_start_time"], unit="us"
    ).dt.hour

    return df

In [11]:
df = pd.read_csv('../data/03_primary/df_train.csv')

In [31]:
dfs = df.sample(20, random_state=22)
dfs

Unnamed: 0,i_full_visitor_id,i_visit_start_time,i_date,c_device_category,c_is_mobile,c_operating_system,c_browser,c_country,c_city,c_traffic_source,c_traffic_medium,c_traffic_campaign,c_is_first_visit,n_product_pages_viewed,n_total_hits,n_total_pageviews,n_total_visits,n_total_time_on_site,y_added_to_cart
2077,6357417.0,1611862000000000.0,20210128,mobile,1,iOS,Safari,Spain,Valencia,google,organic,,1,0,6,2,1,3427.0,0
9207,17443470.0,1611662000000000.0,20210126,desktop,0,Web,Chrome,Belgium,(not set),(direct),(none),,1,0,6,2,1,6878.0,0
6755,5524508.0,1611768000000000.0,20210127,desktop,0,Web,Chrome,United States,(not set),(data deleted),(data deleted),,0,1,72,28,2,110816.0,0
6704,70907670.0,1611673000000000.0,20210126,desktop,0,Web,Chrome,United States,(not set),<Other>,organic,,1,0,9,3,1,44856.0,0
9528,3972438.0,1611868000000000.0,20210128,mobile,1,Web,Safari,United States,Ashburn,shop.googlemerchandisestore.com,referral,,1,1,9,3,1,14028.0,0
7421,58937620.0,1611847000000000.0,20210128,desktop,0,Web,Chrome,India,Mysuru,<Other>,referral,,1,0,6,2,1,4527.0,0
2578,5729145.0,1611792000000000.0,20210128,mobile,1,iOS,Chrome,Poland,Krakow,<Other>,<Other>,,1,0,6,2,1,15600.0,0
9479,8483017.0,1611645000000000.0,20210126,mobile,1,iOS,Safari,United States,(not set),<Other>,<Other>,,1,0,10,4,1,19357.0,0
2176,61199210.0,1611667000000000.0,20210126,mobile,1,Android,Chrome,United States,Las Vegas,google,organic,,1,0,3,1,1,,0
8572,7802003000.0,1611785000000000.0,20210127,mobile,1,iOS,Safari,United States,Atlanta,<Other>,<Other>,,1,0,22,11,2,29476.0,0


## Engineer features

In [13]:
rows_to_select = list(range(3))
rows_to_select

[0, 1, 2]

In [14]:
columns_to_select = [
    "i_full_visitor_id",
    "i_visit_start_time"
]
columns_to_select

['i_full_visitor_id', 'i_visit_start_time']

In [32]:
df_sample = dfs.iloc[rows_to_select, :][columns_to_select]
df_sample

Unnamed: 0,i_full_visitor_id,i_visit_start_time
2077,6357417.0,1611862000000000.0
9207,17443470.0,1611662000000000.0
6755,5524508.0,1611768000000000.0


In [16]:
dict_sample = df_sample.to_dict(orient="list")

In [17]:
pp.pprint(dict_sample)

{'i_full_visitor_id': [3033910.355860057, 81793309.0616803, 4293031.296243032],
 'i_visit_start_time': [1611619614341157.0,
                        1611820704267587.0,
                        1611661585573344.0]}


In [18]:
df_processed = engineer_features(df_sample)
df_processed

Unnamed: 0,i_full_visitor_id,i_visit_start_time,c_weekday,c_visit_start_hour
0,3033910.0,1611620000000000.0,1,0
1,81793310.0,1611821000000000.0,3,7
2,4293031.0,1611662000000000.0,1,11


In [19]:
dict_processed = df_processed.to_dict(orient="list")

In [20]:
pp.pprint(dict_processed)

{'c_visit_start_hour': [0, 7, 11],
 'c_weekday': [1, 3, 1],
 'i_full_visitor_id': [3033910.355860057, 81793309.0616803, 4293031.296243032],
 'i_visit_start_time': [1611619614341157.0,
                        1611820704267587.0,
                        1611661585573344.0]}


In [22]:
df_expected = pd.DataFrame({
        'c_visit_start_hour': [0, 7, 11],
        'c_weekday': [1, 3, 1],
        'i_full_visitor_id': [3033910.355860057, 81793309.0616803, 4293031.296243032],
        'i_visit_start_time': [1611619614341157.0,
                                1611820704267587.0,
                                1611661585573344.0]
    })

In [23]:
df_processed

Unnamed: 0,i_full_visitor_id,i_visit_start_time,c_weekday,c_visit_start_hour
0,3033910.0,1611620000000000.0,1,0
1,81793310.0,1611821000000000.0,3,7
2,4293031.0,1611662000000000.0,1,11


In [24]:
df_expected

Unnamed: 0,c_visit_start_hour,c_weekday,i_full_visitor_id,i_visit_start_time
0,0,1,3033910.0,1611620000000000.0
1,7,3,81793310.0,1611821000000000.0
2,11,1,4293031.0,1611662000000000.0


In [25]:
df_processed.equals(df_expected)

False

In [26]:
df_processed.sort_index(axis=1).equals(df_expected.sort_index(axis=1))

True

## Data preparation utils

In [14]:
df = pd.DataFrame(
    {
        "c_first_good_colname": ["cat1", "cat2", "cat1"],
        "n_second_good_colname": [1, 3, 5],
        "c_first_bad_colname": [7, 8, 9],
        "n_second_bad_colname": ["3.0", "2.2", "5.6"],
    }
)

In [15]:
df

Unnamed: 0,c_first_good_colname,n_second_good_colname,c_first_bad_colname,n_second_bad_colname
0,cat1,1,7,3.0
1,cat2,3,8,2.2
2,cat1,5,9,5.6


In [16]:
df.dtypes

c_first_good_colname     object
n_second_good_colname     int64
c_first_bad_colname       int64
n_second_bad_colname     object
dtype: object

In [17]:
from ga4_mlops.pipelines.data_preparation_utils import extract_column_names, ensure_column_types

In [18]:
_, num_cols, cat_cols, _ = extract_column_names(df)

In [19]:
df2 = ensure_column_types(df, num_cols, cat_cols)

In [20]:
df2

Unnamed: 0,c_first_good_colname,n_second_good_colname,c_first_bad_colname,n_second_bad_colname
0,cat1,1.0,7,3.0
1,cat2,3.0,8,2.2
2,cat1,5.0,9,5.6


In [21]:
df2.dtypes

c_first_good_colname      object
n_second_good_colname    float64
c_first_bad_colname       object
n_second_bad_colname     float64
dtype: object

In [22]:
list(df2.dtypes)

[dtype('O'), dtype('float64'), dtype('O'), dtype('float64')]

In [27]:
df2.dtypes.to_list()

[dtype('O'), dtype('float64'), dtype('O'), dtype('float64')]

In [28]:
num_cols = ["n_second_good_colname", "n_second_bad_colname"]
cat_cols = ["c_first_good_colname", "c_first_bad_colname"]
df3 = ensure_column_types(df, num_cols, cat_cols)

In [29]:
df3

Unnamed: 0,c_first_good_colname,n_second_good_colname,c_first_bad_colname,n_second_bad_colname
0,cat1,1.0,7,3.0
1,cat2,3.0,8,2.2
2,cat1,5.0,9,5.6


In [30]:
from numpy import dtype

In [31]:
expected_types = [dtype('O'), dtype('float64'), dtype('O'), dtype('float64')]

In [32]:
expected_types

[dtype('O'), dtype('float64'), dtype('O'), dtype('float64')]

In [34]:
df3.dtypes.to_list()

[dtype('O'), dtype('float64'), dtype('O'), dtype('float64')]

## Imputation

In [26]:
import pandas as pd
import numpy as np

In [27]:
imputation_sample = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_num_col_complete": [4, 2, 6, 1, 0],
        "n_num_col_missing_not_listed": [4.0, np.nan, 2.1, 3.2, np.nan],
        "n_num_col_missing_for_mean": [2, np.nan, 5, np.nan, 8],
        "n_num_col_missing_for_zero": [np.nan, 9.3, -1.2, np.nan, np.nan],
        "c_cat_col_for_mostfreq": ["cat1", np.nan, "cat2", "cat1", "cat1"],
        "c_cat_col_for_unknown": ["cat1", "cat2", np.nan, "cat3", "cat2"],
    }
)

In [28]:
imputation_sample

Unnamed: 0,i_id_col,n_num_col_complete,n_num_col_missing_not_listed,n_num_col_missing_for_mean,n_num_col_missing_for_zero,c_cat_col_for_mostfreq,c_cat_col_for_unknown
0,1,4,4.0,2.0,,cat1,cat1
1,2,2,,,9.3,,cat2
2,3,6,2.1,5.0,-1.2,cat2,
3,4,1,3.2,,,cat1,cat3
4,5,0,,8.0,,cat1,cat2


In [29]:
imputation_sample.iloc[1, 3]

nan

In [30]:
imputation_sample.iloc[0, 4]

nan

In [31]:
from ga4_mlops.pipelines.feature_engineering.nodes import fit_imputers, apply_imputers

In [39]:
expected_df = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_num_col_complete": [4., 2., 6., 1., 0.],
        "n_num_col_missing_not_listed": [4.0, np.nan, 2.1, 3.2, np.nan],
        "n_num_col_missing_for_mean": [2., 5., 5., 5., 8.],
        "n_num_col_missing_for_zero": [0.0, 9.3, -1.2, 0.0, 0.0],
        "c_cat_col_for_mostfreq": ["cat1", "cat1", "cat2", "cat1", "cat1"],
        "c_cat_col_for_unknown": ["cat1", "cat2", "UNKNOWN", "cat3", "cat2"],
    }
)

In [40]:
expected_df

Unnamed: 0,i_id_col,n_num_col_complete,n_num_col_missing_not_listed,n_num_col_missing_for_mean,n_num_col_missing_for_zero,c_cat_col_for_mostfreq,c_cat_col_for_unknown
0,1,4.0,4.0,2.0,0.0,cat1,cat1
1,2,2.0,,5.0,9.3,cat1,cat2
2,3,6.0,2.1,5.0,-1.2,cat2,UNKNOWN
3,4,1.0,3.2,5.0,0.0,cat1,cat3
4,5,0.0,,8.0,0.0,cat1,cat2


In [41]:
imputation_strategies = {
  "mean": ["n_num_col_missing_for_mean"],   # for numerical: replace NULLs with column mean
  "zero": ["n_num_col_missing_for_zero"],    # for numerical: replace NULLs with zeros
  "mostfreq": ["c_cat_col_for_mostfreq"],    # for categorical: replace NULLs with most frequent value
  "unknown": ["c_cat_col_for_unknown"],    # for categorical: replace NULLs with UNKNOWN token
}

In [42]:
imputers = fit_imputers(imputation_sample, imputation_strategies)

In [43]:
imputers

{'mean': SimpleImputer(),
 'zero': SimpleImputer(fill_value=0.0, strategy='constant'),
 'mostfreq': SimpleImputer(strategy='most_frequent'),
 'unknown': SimpleImputer(fill_value='UNKNOWN', strategy='constant')}

In [44]:
df = apply_imputers(imputation_sample, imputers)

In [45]:
df

Unnamed: 0,i_id_col,n_num_col_complete,n_num_col_missing_not_listed,n_num_col_missing_for_mean,n_num_col_missing_for_zero,c_cat_col_for_mostfreq,c_cat_col_for_unknown
0,1,4.0,4.0,2.0,0.0,cat1,cat1
1,2,2.0,,5.0,9.3,cat1,cat2
2,3,6.0,2.1,5.0,-1.2,cat2,UNKNOWN
3,4,1.0,3.2,5.0,0.0,cat1,cat3
4,5,0.0,,8.0,0.0,cat1,cat2


In [47]:
df.dtypes

i_id_col                          int64
n_num_col_complete              float64
n_num_col_missing_not_listed    float64
n_num_col_missing_for_mean      float64
n_num_col_missing_for_zero      float64
c_cat_col_for_mostfreq           object
c_cat_col_for_unknown            object
dtype: object

In [48]:
expected_df.dtypes

i_id_col                          int64
n_num_col_complete              float64
n_num_col_missing_not_listed    float64
n_num_col_missing_for_mean      float64
n_num_col_missing_for_zero      float64
c_cat_col_for_mostfreq           object
c_cat_col_for_unknown            object
dtype: object

In [49]:
df.equals(expected_df)

True

## Encoding

In [1]:
import pandas as pd

In [2]:
encoding_sample = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "c_cat_col_for_binary": ["0", "0", "1", "0", "1"],
        "c_cat_col_for_onehot": ["cat1", "cat2", "cat3", "cat3", "cat2"],
        "c_cat_col_for_ordinal": ["cat3", "cat2", "cat1", "cat3", "cat2"],
    }
)

In [6]:
encoding_sample

Unnamed: 0,i_id_col,c_cat_col_for_binary,c_cat_col_for_onehot,c_cat_col_for_ordinal
0,1,0,cat1,cat3
1,2,0,cat2,cat2
2,3,1,cat3,cat1
3,4,0,cat3,cat3
4,5,1,cat2,cat2


In [7]:
encoder_types = {
    "binary": ["c_cat_col_for_binary"],   # for binary variables
    "onehot": ["c_cat_col_for_onehot"],   # one-hot encoding
    "ordinal": ["c_cat_col_for_ordinal"]   # integer encoding (ordinal, but order does not matter)
}

In [8]:
from ga4_mlops.pipelines.feature_engineering.nodes import fit_encoders, apply_encoders

In [9]:
encoders = fit_encoders(encoding_sample, encoder_types)

  for cat_name, class_ in values.iteritems():


In [10]:
df = apply_encoders(encoding_sample, encoders)

In [11]:
df

Unnamed: 0,i_id_col,c_cat_col_for_binary_0,c_cat_col_for_binary_1,c_cat_col_for_onehot_cat1,c_cat_col_for_onehot_cat2,c_cat_col_for_onehot_cat3,c_cat_col_for_ordinal
0,1,0,1,1,0,0,1
1,2,0,1,0,1,0,2
2,3,1,0,0,0,1,3
3,4,0,1,0,0,1,1
4,5,1,0,0,1,0,2


In [13]:
df.to_dict(orient="list")

{'i_id_col': [1, 2, 3, 4, 5],
 'c_cat_col_for_binary_0': [0, 0, 1, 0, 1],
 'c_cat_col_for_binary_1': [1, 1, 0, 1, 0],
 'c_cat_col_for_onehot_cat1': [1, 0, 0, 0, 0],
 'c_cat_col_for_onehot_cat2': [0, 1, 0, 0, 1],
 'c_cat_col_for_onehot_cat3': [0, 0, 1, 1, 0],
 'c_cat_col_for_ordinal': [1, 2, 3, 1, 2]}

In [14]:
expected_df = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "c_cat_col_for_binary_0": [0, 0, 1, 0, 1],
        "c_cat_col_for_binary_1": [1, 1, 0, 1, 0],
        "c_cat_col_for_onehot_cat1": [1, 0, 0, 0, 0],
        "c_cat_col_for_onehot_cat2": [0, 1, 0, 0, 1],
        "c_cat_col_for_onehot_cat3": [0, 0, 1, 1, 0],
        "c_cat_col_for_ordinal": [1, 2, 3, 1, 2]
    }
)

In [15]:
df.equals(expected_df)

True

## Exclusion

In [21]:
import pandas as pd

In [22]:
exclusion_sample = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_col_to_use": [3.2, 2.1, 9.8, 0.3, 3.1],
        "c_col_to_use_0": [0, 1, 0, 0, 0],
        "c_col_to_use_1": [1, 0, 1, 1, 1],
        "c_first_col_to_exclude": [1, 3, 2, 1, 2],
        "c_second_col_to_exclude_cat1": [1, 1, 0, 0, 0],
        "c_second_col_to_exclude_cat2": [0, 0, 1, 1, 0],
        "c_second_col_to_exclude_cat2": [0, 0, 0, 0, 1],
    }
)

In [23]:
exclusion_sample

Unnamed: 0,i_id_col,n_col_to_use,c_col_to_use_0,c_col_to_use_1,c_first_col_to_exclude,c_second_col_to_exclude_cat1,c_second_col_to_exclude_cat2
0,1,3.2,0,1,1,1,0
1,2,2.1,1,0,3,1,0
2,3,9.8,0,1,2,0,0
3,4,0.3,0,1,1,0,0
4,5,3.1,0,1,2,0,1


In [27]:
features_to_exclude = ["c_first_col_to_exclude", "c_second_col_to_exclude"]

In [28]:
from ga4_mlops.pipelines.feature_engineering.nodes import exclude_features

In [31]:
df = exclude_features(exclusion_sample, features_to_exclude)

In [32]:
df

Unnamed: 0,i_id_col,n_col_to_use,c_col_to_use_0,c_col_to_use_1
0,1,3.2,0,1
1,2,2.1,1,0
2,3,9.8,0,1
3,4,0.3,0,1
4,5,3.1,0,1


In [33]:
expected_df = pd.DataFrame(
    {
        "i_id_col": [1, 2, 3, 4, 5],
        "n_col_to_use": [3.2, 2.1, 9.8, 0.3, 3.1],
        "c_col_to_use_0": [0, 1, 0, 0, 0],
        "c_col_to_use_1": [1, 0, 1, 1, 1],
    }
)

In [34]:
expected_df

Unnamed: 0,i_id_col,n_col_to_use,c_col_to_use_0,c_col_to_use_1
0,1,3.2,0,1
1,2,2.1,1,0
2,3,9.8,0,1
3,4,0.3,0,1
4,5,3.1,0,1


In [35]:
df.equals(expected_df)

True

## Prediction

In [36]:
import numpy as np
import pandas as pd

In [40]:
abt_sample = pd.DataFrame(
    {
        "i_id": [1, 2, 3, 4, 5],
        "i_info_col": ["2022", "2021", "2022", "2022", "2021"],
        "n_num_col": [1.3, 5.6, 2.3, 7.6, 9.2],
        "c_cat_col": [1, 3, 3, 1, 2],
        "y_target_col": [1, 0, 0, 1, 0],
    }
)
raw_scores_sample = np.array([0.99, 0.02, 0.01, 0.98, 0.01])
calibrated_scores_sample = np.array([0.88, 0.42, 0.23, 0.84, 0.22])

In [41]:
abt_sample

Unnamed: 0,i_id,i_info_col,n_num_col,c_cat_col,y_target_col
0,1,2022,1.3,1,1
1,2,2021,5.6,3,0
2,3,2022,2.3,3,0
3,4,2022,7.6,1,1
4,5,2021,9.2,2,0


In [42]:
raw_scores_sample, calibrated_scores_sample

(array([0.99, 0.02, 0.01, 0.98, 0.01]), array([0.88, 0.42, 0.23, 0.84, 0.22]))

In [43]:
from ga4_mlops.pipelines.prediction.nodes import create_predictions

In [44]:
df = create_predictions(
    abt_sample,
    raw_scores_sample,
    calibrated_scores_sample,
    threshold=0.3,
    classify_on_calibrated=False
)

In [45]:
df

Unnamed: 0,i_id,i_info_col,y_raw_score,y_calibrated_score,y_predicted_label
0,1,2022,0.99,0.88,1
1,2,2021,0.02,0.42,0
2,3,2022,0.01,0.23,0
3,4,2022,0.98,0.84,1
4,5,2021,0.01,0.22,0


In [46]:
df = create_predictions(
    abt_sample,
    raw_scores_sample,
    calibrated_scores_sample,
    threshold=0.3,
    classify_on_calibrated=True
)

In [47]:
df

Unnamed: 0,i_id,i_info_col,y_raw_score,y_calibrated_score,y_predicted_label
0,1,2022,0.99,0.88,1
1,2,2021,0.02,0.42,1
2,3,2022,0.01,0.23,0
3,4,2022,0.98,0.84,1
4,5,2021,0.01,0.22,0


In [51]:
df.to_dict(orient="list")

{'i_id': [1, 2, 3, 4, 5],
 'i_info_col': ['2022', '2021', '2022', '2022', '2021'],
 'y_raw_score': [0.99, 0.02, 0.01, 0.98, 0.01],
 'y_calibrated_score': [0.88, 0.42, 0.23, 0.84, 0.22],
 'y_predicted_label': [1, 1, 0, 1, 0]}

## Explanation - sampling

In [52]:
import pandas as pd

In [54]:
explanation_sample = pd.DataFrame(
    {
        "i_id": list(range(20)),
        "y_target": [1]*5 + [0]*15
    }
)

In [57]:
explanation_sample

Unnamed: 0,i_id,y_target
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [55]:
from ga4_mlops.pipelines.explanation.nodes import sample_data

In [69]:
df = sample_data(explanation_sample, n_obs=8, seed=42)

In [70]:
df

Unnamed: 0,i_id,y_target
0,14,0
1,16,0
2,5,0
3,18,0
4,10,0
5,13,0
6,1,1
7,4,1


In [71]:
df.to_dict(orient="list")

{'i_id': [14, 16, 5, 18, 10, 13, 1, 4], 'y_target': [0, 0, 0, 0, 0, 0, 1, 1]}