In [136]:
import pandas as pd
import os
import sys

In [137]:
# Put the project's pipelines directory on the import path so that its helper
# module (data_preparation_utils) can be imported in the next cell.
sys.path.append("../src/ga4_mlops/pipelines")

In [138]:
from data_preparation_utils import extract_column_names

In [139]:
# Read the model-input CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/05_model_input/abt_test.csv')

In [140]:
# extract_column_names returns four values; only the last one, the name of the
# target column, is needed for the class-balance checks below.
_, _, _, target_col = extract_column_names(df)
target_col

'y_added_to_cart'

In [141]:
# Number of rows per target value in the full data set.
df[target_col].value_counts()

0    1934
1      66
Name: y_added_to_cart, dtype: int64

In [142]:
# Total number of rows in the full data set.
df.shape[0]

2000

In [143]:
# Share of each target value in the full data set (class counts divided by the
# total row count); this is the baseline the sample should reproduce.
df[target_col].value_counts() / df.shape[0]

0    0.967
1    0.033
Name: y_added_to_cart, dtype: float64

In [144]:
# Sampling parameters used by the cells below.
n_obs = 500  # desired number of rows in the sample
seed = 42  # random_state passed to DataFrame.sample, for reproducibility

In [145]:
# Sampling fraction: desired sample size relative to the full row count.
# The same fraction is applied to every target class in the next cell.
frac = n_obs / df.shape[0]

In [146]:
# Stratified sample: draw the same fraction from every target class (each class
# seeded identically), then discard the group/row index left behind by apply.
df_sample = (
    df.groupby(target_col)
    .apply(lambda group: group.sample(frac=frac, random_state=seed))
    .reset_index(drop=True)
)
df_sample

Unnamed: 0,i_full_visitor_id,i_visit_start_time,i_date,c_device_category_tablet,c_device_category_desktop,c_device_category_mobile,c_is_mobile_0,c_is_mobile_1,c_operating_system_Web,c_operating_system_Windows,...,c_is_first_visit_0,c_is_first_visit_1,n_product_pages_viewed,n_total_hits,n_total_pageviews,n_total_visits,n_total_time_on_site,y_added_to_cart,c_weekday,c_visit_start_hour
0,3.088863e+07,1.612025e+15,20210130,0,0,1,1,0,1,0,...,1,0,0.0,10.0,4.0,1.0,986783.0,0,-1.0,9
1,7.998526e+07,1.611977e+15,20210130,0,1,0,0,1,0,1,...,1,0,0.0,6.0,2.0,1.0,22059.0,0,-1.0,25
2,8.169996e+07,1.612001e+15,20210130,0,0,1,1,0,1,0,...,1,0,0.0,6.0,2.0,1.0,9369.0,0,-1.0,4
3,8.528821e+07,1.612001e+15,20210130,0,0,1,1,0,0,0,...,1,0,0.0,5.0,2.0,1.0,11548.0,0,-1.0,4
4,3.214840e+06,1.611988e+15,20210130,0,1,0,0,1,1,0,...,1,0,0.0,8.0,3.0,1.0,8853.0,0,-1.0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,8.310014e+07,1.611970e+15,20210130,0,0,1,1,0,1,0,...,0,1,1.0,16.0,6.0,3.0,7013.0,1,-1.0,17
496,7.747805e+07,1.612003e+15,20210130,0,1,0,0,1,1,0,...,0,1,1.0,194.0,49.0,1.0,74605.0,1,-1.0,4
497,1.969834e+07,1.612027e+15,20210130,0,1,0,0,1,1,0,...,1,0,1.0,17.0,5.0,1.0,23742.0,1,-1.0,5
498,3.200577e+06,,20210130,0,0,1,1,0,0,0,...,0,1,1.0,62.0,17.0,2.0,37408.0,1,4.0,14


In [147]:
# Number of rows per target value in the stratified sample.
df_sample[target_col].value_counts()

0    484
1     16
Name: y_added_to_cart, dtype: int64

In [148]:
# Actual number of rows in the sample; per-class rounding means it is not
# guaranteed to equal n_obs exactly.
df_sample.shape[0]

500

In [150]:
# Share of each target value in the sample, to compare against the shares
# computed for the full data set above.
proportions = df_sample[target_col].value_counts() / df_sample.shape[0]
proportions

0    0.968
1    0.032
Name: y_added_to_cart, dtype: float64

In [153]:
# Plain-text rendering of the proportions without the name/dtype footer;
# presumably intended for embedding in a log message.
proportions.to_string()

'0    0.968\n1    0.032'

---

In [6]:
import logging
import pickle
import sys

import pandas as pd
import shap
import xgboost as xgb

# Make the project's pipeline helper modules importable from this notebook.
sys.path.append("../src/ga4_mlops/pipelines")
from data_preparation_utils import extract_column_names

In [8]:
def sample_data(abt: pd.DataFrame, n_obs: int, seed: int) -> pd.DataFrame:
    """Sample model input data preserving target proportions.

    Every target class is sampled with the same fraction
    (``n_obs / len(abt)``), so the class shares of the sample match those of
    ``abt`` up to per-class rounding. Because each class is rounded on its
    own, the returned frame can hold slightly more or fewer rows than
    ``n_obs``.

    Args:
        abt (pd.DataFrame): input data frame; its target column is looked up
            with ``extract_column_names``
        n_obs (int): requested number of observations in a sample; values
            outside ``[1, len(abt)]`` are clamped to that range
        seed (int): random state used when sampling each target class

    Returns:
        pd.DataFrame: data frame sample with a fresh 0..n-1 index; an empty
            copy of ``abt`` when ``abt`` has no rows
    """
    logger = logging.getLogger(__name__)

    original_n_obs = abt.shape[0]
    if original_n_obs == 0:
        # Nothing to sample from, and the fraction below would divide by zero.
        return abt.copy()

    n_obs = max(1, min(n_obs, original_n_obs))
    logger.info(
        f"Sampling data for SHAP explanations. Original size: {original_n_obs}; Sample size: {n_obs}"
    )

    _, _, _, target_col = extract_column_names(abt)
    logger.info(f"Target name: {target_col}")

    original_proportions = abt[target_col].value_counts() / original_n_obs
    logger.info(f"Original target proportions:\n{original_proportions.to_string()}")

    frac = n_obs / original_n_obs
    # Sample each class explicitly rather than through groupby().apply():
    # recent pandas releases deprecate handing the grouping column to apply,
    # which puts the target column of the result at risk. Seeding every class
    # with the same seed keeps the drawn rows identical to the apply version.
    sampled_parts = [
        group.sample(frac=frac, random_state=seed)
        for _, group in abt.groupby(target_col)
    ]
    if sampled_parts:
        abt_sample = pd.concat(sampled_parts).reset_index(drop=True)
    else:
        # groupby skips missing keys, so a target column consisting only of
        # missing values yields no classes at all.
        abt_sample = abt.iloc[:0].copy()

    # Normalise by the actual sample size, which per-class rounding can make
    # different from the requested n_obs.
    proportions = abt_sample[target_col].value_counts(normalize=True)
    logger.info(f"Sample target proportions:\n{proportions.to_string()}")

    return abt_sample

In [9]:
# Read the model-input CSV into a DataFrame; the path is relative to the
# notebook's working directory.
abt = pd.read_csv('../data/05_model_input/abt_test.csv')

In [10]:
# Stratified sample of about 100 rows (n_obs=100) drawn with seed 42.
abt_sample = sample_data(abt, 100, 42)

In [11]:
# Keep only the second and third return values of extract_column_names; going
# by the variable names these are the numeric and categorical feature columns
# -- TODO confirm against the helper's definition.
_, num_cols, cat_cols, _ = extract_column_names(abt_sample)

In [12]:
# Load the trained model from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load model files that come from a trusted source.
with open('../data/06_models/model.pkl', 'rb') as pickle_file:
    model = pickle.load(pickle_file)