In [7]:
import pandas as pd
import os
from pprint import PrettyPrinter

In [8]:
# Show the kernel's working directory; the relative data path used later resolves against it.
os.getcwd()

'/Users/pchaberski/projects/gid-ml-framework/ga4-mlops/notebooks'

In [9]:
# Pretty-printer instance reused below to print the sample dicts.
pp = PrettyPrinter()

# Primary data sample

In [10]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Apply manual feature engineering transformations.

    Derives two calendar features from ``i_visit_start_time``, which is
    parsed as a Unix timestamp expressed in microseconds (no timezone
    conversion is applied):

    * ``c_weekday`` - day of the week as an integer, Monday=0 ... Sunday=6
    * ``c_visit_start_hour`` - hour of the day, 0-23

    The input frame is not modified; the engineered columns are added to a
    new frame. This keeps the function idempotent and avoids pandas'
    ``SettingWithCopyWarning`` when it is handed a slice of another frame.

    Args:
        df (pd.DataFrame): data frame with raw features; must contain the
            ``i_visit_start_time`` column

    Returns:
        pd.DataFrame: new data frame with the original columns followed by
            ``c_weekday`` and ``c_visit_start_hour``

    Raises:
        KeyError: if ``i_visit_start_time`` is not a column of ``df``
    """
    # logger.info("Applying manual feature engineering transformations...")

    # Parse the timestamp once and reuse it for every derived feature.
    visit_start = pd.to_datetime(df["i_visit_start_time"], unit="us")

    # assign() returns a new frame, so the caller's data is left untouched.
    return df.assign(
        c_weekday=visit_start.dt.weekday,
        c_visit_start_hour=visit_start.dt.hour,
    )

In [11]:
# Read df_train.csv; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/03_primary/df_train.csv')

In [31]:
# Draw 20 rows; the fixed random_state keeps the draw repeatable across runs.
dfs = df.sample(n=20, random_state=22)
dfs

Unnamed: 0,i_full_visitor_id,i_visit_start_time,i_date,c_device_category,c_is_mobile,c_operating_system,c_browser,c_country,c_city,c_traffic_source,c_traffic_medium,c_traffic_campaign,c_is_first_visit,n_product_pages_viewed,n_total_hits,n_total_pageviews,n_total_visits,n_total_time_on_site,y_added_to_cart
2077,6357417.0,1611862000000000.0,20210128,mobile,1,iOS,Safari,Spain,Valencia,google,organic,,1,0,6,2,1,3427.0,0
9207,17443470.0,1611662000000000.0,20210126,desktop,0,Web,Chrome,Belgium,(not set),(direct),(none),,1,0,6,2,1,6878.0,0
6755,5524508.0,1611768000000000.0,20210127,desktop,0,Web,Chrome,United States,(not set),(data deleted),(data deleted),,0,1,72,28,2,110816.0,0
6704,70907670.0,1611673000000000.0,20210126,desktop,0,Web,Chrome,United States,(not set),<Other>,organic,,1,0,9,3,1,44856.0,0
9528,3972438.0,1611868000000000.0,20210128,mobile,1,Web,Safari,United States,Ashburn,shop.googlemerchandisestore.com,referral,,1,1,9,3,1,14028.0,0
7421,58937620.0,1611847000000000.0,20210128,desktop,0,Web,Chrome,India,Mysuru,<Other>,referral,,1,0,6,2,1,4527.0,0
2578,5729145.0,1611792000000000.0,20210128,mobile,1,iOS,Chrome,Poland,Krakow,<Other>,<Other>,,1,0,6,2,1,15600.0,0
9479,8483017.0,1611645000000000.0,20210126,mobile,1,iOS,Safari,United States,(not set),<Other>,<Other>,,1,0,10,4,1,19357.0,0
2176,61199210.0,1611667000000000.0,20210126,mobile,1,Android,Chrome,United States,Las Vegas,google,organic,,1,0,3,1,1,,0
8572,7802003000.0,1611785000000000.0,20210127,mobile,1,iOS,Safari,United States,Atlanta,<Other>,<Other>,,1,0,22,11,2,29476.0,0


## Engineer features

In [13]:
# Positional indices of the first three rows of the sample.
rows_to_select = [*range(3)]
rows_to_select

[0, 1, 2]

In [14]:
# Columns kept in the small sample frame.
columns_to_select = ["i_full_visitor_id", "i_visit_start_time"]
columns_to_select

['i_full_visitor_id', 'i_visit_start_time']

In [32]:
# Take the selected rows and columns of the sample.
# The explicit .copy() makes df_sample an independent frame, so assigning new
# columns to it later does not trigger SettingWithCopyWarning and does not
# depend on pandas' view-vs-copy rules for chained selections.
df_sample = dfs.iloc[rows_to_select][columns_to_select].copy()
df_sample

Unnamed: 0,i_full_visitor_id,i_visit_start_time
2077,6357417.0,1611862000000000.0
9207,17443470.0,1611662000000000.0
6755,5524508.0,1611768000000000.0


In [16]:
# Convert the sample to a {column name: list of values} dict.
dict_sample = df_sample.to_dict(orient="list")

In [17]:
# Pretty-print the dict form of the sample.
# NOTE(review): the saved output of this cell does not match the df_sample
# display saved earlier in the notebook - the outputs look like they come from
# different runs; confirm with Restart Kernel -> Run All.
pp.pprint(dict_sample)

{'i_full_visitor_id': [3033910.355860057, 81793309.0616803, 4293031.296243032],
 'i_visit_start_time': [1611619614341157.0,
                        1611820704267587.0,
                        1611661585573344.0]}


In [18]:
# Run the feature engineering step on the three-row sample and display the result.
df_processed = engineer_features(df_sample)
df_processed

Unnamed: 0,i_full_visitor_id,i_visit_start_time,c_weekday,c_visit_start_hour
0,3033910.0,1611620000000000.0,1,0
1,81793310.0,1611821000000000.0,3,7
2,4293031.0,1611662000000000.0,1,11


In [19]:
# Convert the processed frame to a {column name: list of values} dict.
dict_processed = df_processed.to_dict(orient="list")

In [20]:
# Pretty-print the dict form of the processed frame.
pp.pprint(dict_processed)

{'c_visit_start_hour': [0, 7, 11],
 'c_weekday': [1, 3, 1],
 'i_full_visitor_id': [3033910.355860057, 81793309.0616803, 4293031.296243032],
 'i_visit_start_time': [1611619614341157.0,
                        1611820704267587.0,
                        1611661585573344.0]}


In [22]:
# Hand-written expected frame; keys are listed alphabetically, so the column
# order follows the dict order rather than the order of the processed frame.
df_expected = pd.DataFrame(
    data={
        "c_visit_start_hour": [0, 7, 11],
        "c_weekday": [1, 3, 1],
        "i_full_visitor_id": [
            3033910.355860057,
            81793309.0616803,
            4293031.296243032,
        ],
        "i_visit_start_time": [
            1611619614341157.0,
            1611820704267587.0,
            1611661585573344.0,
        ],
    }
)

In [23]:
# Re-display the processed frame for comparison with df_expected.
df_processed

Unnamed: 0,i_full_visitor_id,i_visit_start_time,c_weekday,c_visit_start_hour
0,3033910.0,1611620000000000.0,1,0
1,81793310.0,1611821000000000.0,3,7
2,4293031.0,1611662000000000.0,1,11


In [24]:
# Display the hand-built expected frame.
df_expected

Unnamed: 0,c_visit_start_hour,c_weekday,i_full_visitor_id,i_visit_start_time
0,0,1,3033910.0,1611620000000000.0
1,7,3,81793310.0,1611821000000000.0
2,11,1,4293031.0,1611662000000000.0


In [25]:
# Returns False here: the two frames hold the same columns in a different order,
# and the comparison only succeeds once both are sorted by column name.
df_processed.equals(df_expected)

False

In [26]:
# Compare after putting both frames' columns in the same (alphabetical) order.
df_processed.sort_index(axis="columns").equals(
    df_expected.sort_index(axis="columns")
)

True