# Feature engineering

## Setup

In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os
import sys
from typing import Optional

import dill
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Put the parent directory on sys.path BEFORE importing the project-local `src`
# package, otherwise the `src.*` imports below fail on a fresh kernel.
sys.path.insert(0, "..")

from src.algo.ranker.utils import calc_sequence_timestamp_bucket
from src.algo.ranker.utils import pad_timestamp_sequence
from src.utils.embedding_id_mapper import IDMapper


In [27]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the run name, the column names of the interaction data and a few
    processing knobs. Call :meth:`init` after construction to resolve
    ``notebook_persist_dp``.
    """

    run_name: str = "000-prep-data"
    # When True, `init()` does not create the persist directory on disk.
    testing: bool = True
    # Absolute persist directory for this run; None until `init()` fills it in.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction data.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Number of rows handed to the transform pipeline per chunk.
    tfm_chunk_size: int = 5000

    sequence_length: int = 10

    def init(self):
        """Resolve the persist directory and create it unless in testing mode.

        Returns:
            The same ``Args`` instance, so the call can be chained
            (``Args().init()``).
        """
        # Resolved relative to the current working directory.
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the run configuration; init() resolves `notebook_persist_dp`.
args = Args().init()

# Echo the effective configuration as indented JSON.
print(args.model_dump_json(indent=2))

{
  "run_name": "000-prep-data",
  "testing": true,
  "notebook_persist_dp": "c:\\Users\\Trieu\\OneDrive\\Desktop\\recsys\\real_time_recsys\\notebooks\\data\\000-prep-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "tfm_chunk_size": 5000,
  "sequence_length": 10
}


## Load data

In [28]:
# Load the raw item metadata ("raw_meta_Electronics" config) from the Hugging Face Hub.
# NOTE(review): the recorded progress output reports a ~5.25 GB download, so this
# cell is expensive on a cold cache.
metadata_raw = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True
)
# Materialize the "full" split as a pandas DataFrame.
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Downloading builder script:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/30.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.25G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Show one metadata row (positional index 6) with column-width truncation
# disabled, so the full text of each selected column is visible.
with pd.option_context("display.max_colwidth", None):
    display(
        metadata_raw_df.iloc[[6]][
            [
                "title",
                "main_category",
                "categories",
                "features",
                "description",
                "store",
                "details",
            ]
        ]
    )

Unnamed: 0,title,main_category,categories,features,description,store,details
6,"QGHXO Band for Garmin Vivofit 4, Soft Silicone Replacement Watch Band Strap for Garmin Vivofit 4 Activity Tracker, Small, Large, Ten Colors (5PCS Bands-Girl, Large)",Cell Phones & Accessories,"[Electronics, Wearable Technology, Arm & Wristband Accessories]","[Personalized Your Garmin Vivofit 4 Activity Tracker with this refined replacement wrist band, Small fits wrists with a circumference of 122-188mm. Large fits wrists with a circumference of 148-215mm, Easy and direct installation and removal. Replacement Bands Only! Garmin device NOT included, Garmin Vivofit 4 Buckle Bracelet. Never lose your Garmin Vivofit 4. Fix the tracker fall off problem, Soft silicone with smooth finish for a sporty look, metal parts made with high quality stainless steel]","[Compatibility, Custom designed for your precious, Garmin Vivofit 4, Activity Tracker, this Garmin Watch Sport Band features a combination of functionality and style. Fit for, Garmin Vivofit 4, Activity Tracker ONLY. NOT for Garmin Vivofit 1/Garmin Vivofit 2/Garmin Vivofit 3., Feature, Material: Silicone. NOTE: Replacement Bands Only! Small fits wrists with a circumference of 122-188mm. Large fits wrists with a circumference of 148-215mm. Models for selection: For Garmin Vivofit 4 Activity Tracker Only. Contracted design style, with you life contracted and not simple., Package Included, Soft Silicone Replacement Watch Band Strap for Garmin Vivofit 4 Activity Tracker (No Tracker)]",QGHXO,"{""Package Dimensions"": ""6.85 x 4.37 x 1.1 inches"", ""Item Weight"": ""2.64 ounces"", ""Item model number"": ""GM-VF4-L14GIRL"", ""Best Sellers Rank"": {""Electronics"": 317736, ""Smart Arm & Wristband Accessories"": 12926}, ""Is Discontinued By Manufacturer"": ""No"", ""Special features"": ""activity tracker"", ""Other display features"": ""Sports"", ""Color"": ""5PCS Bands-Girl"", ""Manufacturer"": ""QGHXO"", ""Date First Available"": ""March 17, 2018""}"


In [None]:
# Load the train and validation interaction samples from parquet.
train_df = pd.read_parquet("../data_for_ai/interim/train_sample_interactions_16407u_neg_seq.parquet")
val_df = pd.read_parquet("../data_for_ai/interim/val_sample_interactions_16407u_neg_seq.parquet")
# Stack both splits so features can be fetched for all rows at once; the frame
# is split back into train / val further down.
# `timestamp_unix` is the int64 view of the timestamp integer-divided by 10**9,
# i.e. Unix seconds assuming the column is datetime64[ns] — TODO confirm dtype.
full_df = (
    pd.concat([train_df, val_df], axis=0)
    .assign(timestamp_unix=lambda df: df[args.timestamp_col].astype("int64") // 10**9)
)

In [None]:
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,1339432870
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,1343873053
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,1347726886
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,1357254525
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,1367803479
...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,1626455335
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,1610588889
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,1638664540
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,1645183966


In [None]:
# Load the ID mapper (project class IDMapper) from its JSON file.
# NOTE(review): presumably maps raw user/item IDs to integer indices and back — confirm in src.utils.embedding_id_mapper.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
# Smoke test: look up the raw user ID stored under index 1.
idm.get_user_id(1)

4817 items in the dataset


'AE227WAM4NWQPJI33OPN7ZARNNZQ'

## Load features from Feature Store

In [None]:
# Open the Feast feature repository (path is relative to the notebook's working directory).
store = FeatureStore(
    repo_path="../feature_pipeline/feature_store/feature_repo",
)



In [None]:
# Item-level feature references in "<feature_view>:<feature_name>" form.
# The part after ":" is reused further down as the column name (see `rating_agg_cols`).
item_features = [
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_365d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_365d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_90d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_90d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_30d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_30d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_7d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_7d",
]

In [None]:
# Fetch historical item features, using the unique (item, timestamp) pairs of
# the interactions as the entity rows.
features_df = store.get_historical_features(
    full_df[[args.item_col, args.timestamp_col]].drop_duplicates(), 
    item_features
).to_df()



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
CPU times: user 1.87 s, sys: 254 ms, total: 2.12 s
Wall time: 32.4 s


In [None]:
# Guard against fully duplicated feature rows. The assert "message" is a
# display(...) call: it is only evaluated when the assert fails, which renders
# the offending rows before the AssertionError is raised.
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])
# Show the fetched features, newest first.
features_df.sort_values(args.timestamp_col, ascending=False)

In [None]:
# Attach the item features to every interaction. The left join keeps
# interactions that have no matching feature row (their features become NaN).
full_features_df = pd.merge(
    full_df, features_df, on=[args.item_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d
0,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,1339432870,0.0,,0.0,,0.0,,0.0,
1,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,1343873053,3.0,5.000000,1.0,5.000000,1.0,5.00,1.0,5.0
2,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,1347726886,0.0,,0.0,,0.0,,0.0,
3,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,1357254525,4.0,4.250000,2.0,5.000000,2.0,5.00,2.0,5.0
4,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,1367803479,3.0,4.333333,3.0,4.333333,1.0,5.00,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130866,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,1626455335,3.0,5.000000,0.0,,0.0,,0.0,
130867,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,1610588889,15.0,3.533333,15.0,3.533333,8.0,3.25,2.0,3.0
130868,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,1638664540,2.0,5.000000,1.0,5.000000,0.0,,0.0,
130869,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,1645183966,,,,,,,,


In [None]:
# User-level feature references in "<feature_view>:<feature_name>" form.
user_features = [
    "user_rating_stats_fresh:user_rating_cnt_90d",
    "user_rating_stats_fresh:user_rating_avg_prev_rating_90d",
    "user_rating_stats_fresh:user_rating_list_10_recent_asin",
    "user_rating_stats_fresh:user_rating_list_10_recent_asin_timestamp",
]

# Fetch historical user features for the unique (user, timestamp) pairs.
# NOTE: this rebinds `features_df`, which previously held the item features.
features_df = store.get_historical_features(full_df[[args.user_col, args.timestamp_col]].drop_duplicates(), user_features).to_df()
# Same duplicate guard as for the item features: display(...) only runs on failure.
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])

CPU times: total: 0 ns
Wall time: 0 ns


In [None]:
# Attach the user features to every interaction (left join on user + timestamp).
# NOTE: this rebinds `full_features_df`, so re-running the cell alone merges the
# user columns a second time.
full_features_df = pd.merge(
    full_features_df, features_df, on=[args.user_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp
0,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,1339432870,0.0,,0.0,,0.0,,0.0,,1.0,,,
1,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,1343873053,3.0,5.000000,1.0,5.000000,1.0,5.00,1.0,5.0,2.0,2.0,B06XKCPK5W,1339432870
2,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,1347726886,0.0,,0.0,,0.0,,0.0,,2.0,3.0,"B06XKCPK5W,B000CKVOOY",13394328701343873053
3,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,1357254525,4.0,4.250000,2.0,5.000000,2.0,5.00,2.0,5.0,1.0,,"B06XKCPK5W,B000CKVOOY,B006GWO5WK",133943287013438730531347726886
4,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,1367803479,3.0,4.333333,3.0,4.333333,1.0,5.00,0.0,,1.0,,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76",1339432870134387305313477268861357254525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130866,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,1626455335,3.0,5.000000,0.0,,0.0,,0.0,,1.0,,"B074VMZ8JB,B00H8JVFCI,B00KO99GB6,B00CF4G7JC,B0...","1360730296,1432061059,1456243479,1456243487,14..."
130867,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,1610588889,15.0,3.533333,15.0,3.533333,8.0,3.25,2.0,3.0,1.0,,"B07C8KVP9F,B09SBP9P92,B075X8471B,B0791TX5P5,B0...","1462221728,1488063495,1495905967,1564232045,15..."
130868,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,1638664540,2.0,5.000000,1.0,5.000000,0.0,,0.0,,1.0,,"B000QKKAPE,B001DKO7R8,B0BMJ89V4P,B0043T7FXE,B0...","1232828952,1305485716,1355137628,1452428556,14..."
130869,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,1645183966,,,,,,,,,,,,


In [None]:
# Use the user of the row at positional index 3 as a spot-check example.
user_id = full_features_df[args.user_col].iloc[3]
logger.info(f"Eye-balling if the features are correct...")
# Show that user's first 10 interactions in timestamp order next to the
# recent-item lists, so the two can be compared by eye.
full_features_df.loc[lambda df: df[args.user_col].eq(user_id)].sort_values(
    args.timestamp_col
)[
    [
        args.user_col,
        args.timestamp_col,
        "timestamp_unix",
        args.item_col,
        "user_rating_list_10_recent_asin",
        "user_rating_list_10_recent_asin_timestamp",
    ]
].head(10)

[32m2025-06-23 12:41:54.269[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mEye-balling if the features are correct...[0m


Unnamed: 0,user_id,timestamp,timestamp_unix,parent_asin,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp
0,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2012-06-11 16:41:10,1339432870,B06XKCPK5W,,
1,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2012-08-02 02:04:13,1343873053,B000CKVOOY,B06XKCPK5W,1339432870
2,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2012-09-15 16:34:46,1347726886,B006GWO5WK,"B06XKCPK5W,B000CKVOOY",13394328701343873053
3,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2013-01-03 23:08:45,1357254525,B008LURQ76,"B06XKCPK5W,B000CKVOOY,B006GWO5WK",133943287013438730531347726886
4,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2013-05-06 01:24:39,1367803479,B00AQRUW4Q,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76",1339432870134387305313477268861357254525
5,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2013-11-21 19:35:40,1385062540,B00EXK14S0,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76,B0...","1339432870,1343873053,1347726886,1357254525,13..."
6,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2014-09-15 00:49:08,1410742148,B00LSYHLR2,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76,B0...","1339432870,1343873053,1347726886,1357254525,13..."
7,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2014-10-06 00:33:31,1412555611,B01AS57B0I,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76,B0...","1339432870,1343873053,1347726886,1357254525,13..."
8,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2015-01-19 19:19:17,1421695157,B00ON7AX3U,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76,B0...","1339432870,1343873053,1347726886,1357254525,13..."
9,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,2015-04-27 02:08:40,1430100520,B099RZY28N,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76,B0...","1339432870,1343873053,1347726886,1357254525,13..."


In [None]:
# Run the ID mapper over the frame.
# NOTE(review): judging by the output further down this adds `user_indice` /
# `item_indice` columns — confirm in IDMapper.map_indices.
full_features_df = full_features_df.pipe(idm.map_indices, args.user_col, args.item_col)

In [None]:
def convert_asin_to_idx(inp: str, sequence_length=10, padding_value=-1):
    """
    Convert a comma-separated string of item IDs into a fixed-length index sequence.

    Each ID is looked up with the notebook-level ``idm.get_item_index``. The
    result is left-padded with ``padding_value`` up to ``sequence_length``.

    Args:
        inp: Comma-separated item IDs. ``None``, ``NaN``, ``pd.NA`` and the
            empty string are all treated as "no history".
        sequence_length: Length of the returned sequence.
        padding_value: Value used for the padded (leading) positions.

    Returns:
        A list of ``padding_value`` when there is no history (kept as a list
        for backward compatibility), otherwise a 1-D ``numpy.ndarray`` of
        length ``sequence_length``.
    """
    # Missing values may arrive as NaN / pd.NA instead of None after a merge;
    # calling `.split` on those would raise, so handle them up front.
    is_missing = (
        inp is None
        or inp is pd.NA
        or (isinstance(inp, float) and np.isnan(inp))
        or inp == ""
    )
    if is_missing:
        return [padding_value] * sequence_length

    indices = [idm.get_item_index(item_id) for item_id in inp.split(",")]

    # A history longer than the window would make the pad width negative and
    # crash np.pad; keep only the trailing `sequence_length` entries instead.
    overflow = len(indices) - sequence_length
    if overflow > 0:
        indices = indices[overflow:]

    padding_needed = sequence_length - len(indices)
    return np.pad(
        indices,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )

In [None]:
    
# Derive the fixed-length sequence columns from the recent-item lists.
# NOTE(review): convert_asin_to_idx runs with its default sequence_length (10)
# here; `args.sequence_length` is not passed — confirm this is intended.
full_features_df = full_features_df.assign(
    # Recent item IDs -> left-padded sequence of item indices.
    item_sequence=lambda df: df["user_rating_list_10_recent_asin"].apply(
        convert_asin_to_idx
    ),
    # Recent timestamps -> sequence built by the project helper pad_timestamp_sequence.
    item_sequence_ts=lambda df: df["user_rating_list_10_recent_asin_timestamp"].apply(
        pad_timestamp_sequence
    ),
    # Row-wise project helper; it runs after the two columns above exist.
    # NOTE(review): presumably buckets the gap between each past timestamp and the row's own timestamp — confirm in src.algo.ranker.utils.
    item_sequence_ts_bucket=lambda df: df.apply(calc_sequence_timestamp_bucket, axis=1),
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,parent_asin_rating_avg_prev_rating_7d,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,user_indice,item_indice,item_sequence,item_sequence_ts,item_sequence_ts_bucket
0,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,1339432870,0.0,,0.0,,0.0,...,,1.0,,,,3931,2905,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,1343873053,3.0,5.000000,1.0,5.000000,1.0,...,5.0,2.0,2.0,B06XKCPK5W,1339432870,3931,89,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2905]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1339432870]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 5]"
2,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,1347726886,0.0,,0.0,,0.0,...,,2.0,3.0,"B06XKCPK5W,B000CKVOOY",13394328701343873053,3931,758,"[-1, -1, -1, -1, -1, -1, -1, -1, 2905, 89]","[-1, -1, -1, -1, -1, -1, -1, -1, 1339432870, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 5, 5]"
3,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,1357254525,4.0,4.250000,2.0,5.000000,2.0,...,5.0,1.0,,"B06XKCPK5W,B000CKVOOY,B006GWO5WK",133943287013438730531347726886,3931,959,"[-1, -1, -1, -1, -1, -1, -1, 2905, 89, 758]","[-1, -1, -1, -1, -1, -1, -1, 1339432870, 13438...","[-1, -1, -1, -1, -1, -1, -1, 5, 5, 5]"
4,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,1367803479,3.0,4.333333,3.0,4.333333,1.0,...,,1.0,,"B06XKCPK5W,B000CKVOOY,B006GWO5WK,B008LURQ76",1339432870134387305313477268861357254525,3931,1096,"[-1, -1, -1, -1, -1, -1, 2905, 89, 758, 959]","[-1, -1, -1, -1, -1, -1, 1339432870, 134387305...","[-1, -1, -1, -1, -1, -1, 5, 5, 5, 5]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130866,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,1626455335,3.0,5.000000,0.0,,0.0,...,,1.0,,"B074VMZ8JB,B00H8JVFCI,B00KO99GB6,B00CF4G7JC,B0...","1360730296,1432061059,1456243479,1456243487,14...",14144,4693,"[-1, -1, 3051, 1472, 1662, 1213, 819, 665, 261...","[-1, -1, 1360730296, 1432061059, 1456243479, 1...","[-1, -1, 8, 8, 8, 8, 8, 8, 7, 7]"
130867,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,1610588889,15.0,3.533333,15.0,3.533333,8.0,...,3.0,1.0,,"B07C8KVP9F,B09SBP9P92,B075X8471B,B0791TX5P5,B0...","1462221728,1488063495,1495905967,1564232045,15...",7343,3923,"[-1, -1, -1, -1, -1, 3290, 4295, 3089, 3188, 3...","[-1, -1, -1, -1, -1, 1462221728, 1488063495, 1...","[-1, -1, -1, -1, -1, 7, 7, 7, 6, 6]"
130868,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,1638664540,2.0,5.000000,1.0,5.000000,0.0,...,,1.0,,"B000QKKAPE,B001DKO7R8,B0BMJ89V4P,B0043T7FXE,B0...","1232828952,1305485716,1355137628,1452428556,14...",5497,4335,"[-1, -1, -1, 137, 209, 4564, 487, 1049, 1652, ...","[-1, -1, -1, 1232828952, 1305485716, 135513762...","[-1, -1, -1, 9, 9, 8, 8, 8, 7, 6]"
130869,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,1645183966,,,,,,...,,,,,,6427,4147,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [None]:
# Split back
# The earliest validation timestamp is the cut-off between the two splits.
# Take it directly from the datetime column: a datetime -> int -> datetime round
# trip adds nothing and breaks when the platform `int` or the datetime unit
# differs from int64 nanoseconds.
val_timestamp = val_df[args.timestamp_col].min()

# Rows strictly before the cut-off are train; the row count must match the
# original train split, otherwise the timestamp-based split is not faithful.
train_df_length = train_df.shape[0]
train_df = full_features_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
assert train_df.shape[0] == train_df_length, (
    f"Train split has {train_df.shape[0]} rows after the feature merge, "
    f"expected {train_df_length}"
)

# Rows at or after the cut-off are validation.
val_df_length = val_df.shape[0]
val_df = full_features_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]
assert val_df.shape[0] == val_df_length, (
    f"Validation split has {val_df.shape[0]} rows after the feature merge, "
    f"expected {val_df_length}"
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler


def reshape_2d_to_1d(X):
    """
    Flatten the input into a 1-D NumPy array.

    Used in the title pipeline so that the TF-IDF step receives a flat
    sequence of documents rather than a 2-D column.
    """
    as_array = np.array(X)
    return as_array.ravel()


def flatten_string_array_col(X):
    """
    Collapse a Series of string lists into one newline-joined string per row.

    Missing cells are replaced by the empty string before joining. The result
    is returned as a NumPy array with one entry per input row.
    """
    assert isinstance(X, pd.Series)
    n_rows = X.shape[0]
    joined = X.fillna("")
    joined = joined.str.join("\n")
    assert n_rows == joined.shape[0]
    return joined.values


def todense(X):
    """Return the dense form of ``X`` (via ``X.todense()``) as a plain ``numpy.ndarray``."""
    dense_matrix = X.todense()
    return np.asarray(dense_matrix)


def title_pipeline_steps():
    """
    Build the pipeline steps for the ``title`` column.

    Order: fill missing titles with "", flatten to 1-D, TF-IDF over uni- and
    bigrams (min_df=5, at most 1000 features), then convert to a dense array.
    """
    fill_missing = SimpleImputer(strategy="constant", fill_value="")
    flatten = FunctionTransformer(reshape_2d_to_1d, validate=False)
    vectorize = TfidfVectorizer(min_df=5, max_features=1000, ngram_range=(1, 2))
    densify = FunctionTransformer(todense, validate=False)
    return [
        ("impute", fill_missing),
        ("reshape", flatten),
        ("tfidf", vectorize),
        ("todense", densify),
    ]


def description_pipeline_steps():
    """
    Build the pipeline steps for the ``description`` column.

    Order: join each row's list of strings into one string, TF-IDF over uni-
    and bigrams (min_df=5, at most 1000 features), then convert to dense.
    """
    join_lists = FunctionTransformer(flatten_string_array_col, validate=False)
    vectorize = TfidfVectorizer(min_df=5, max_features=1000, ngram_range=(1, 2))
    densify = FunctionTransformer(todense, validate=False)
    return [
        ("flatten_string_array_col", join_lists),
        ("tfidf", vectorize),
        ("todense", densify),
    ]


def tokenizer(s):
    """Split a newline-joined string back into its individual entries."""
    entries = s.split("\n")
    return entries


def categories_pipeline_steps():
    """
    Build the pipeline steps for the multi-label ``categories`` column.

    Order: join each row's category list with newlines, count categories with
    a CountVectorizer that splits on newlines (so each category stays a single
    token), then convert to dense.
    """
    join_lists = FunctionTransformer(flatten_string_array_col, validate=False)
    count_categories = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
    densify = FunctionTransformer(todense, validate=False)
    return [
        ("flatten_string_array_col", join_lists),
        ("count_vect", count_categories),
        ("todense", densify),
    ]


def price_parse_dtype(series, pattern):
    """
    Extract the capture group(s) of ``pattern`` from a string Series as floats.

    Rows without a match become NaN. The result is what ``Series.str.extract``
    returns for the pattern, cast to float.
    """
    extracted = series.str.extract(pattern)
    return extracted.astype(float)


def price_pipeline_steps(price_pattern=None):
    """
    Build the pipeline steps for the ``price`` column.

    Order: pull the first number out of the price string, fill missing prices
    with 0, then min-max scale.

    Args:
        price_pattern: Regex with one capture group used to extract the price.
            Defaults to a pattern matching a decimal or integer number.
    """
    pattern = (
        r"\b((?:\d+\.\d*)|(?:\d+))\b" if price_pattern is None else price_pattern
    )
    extract = FunctionTransformer(
        price_parse_dtype, kw_args={"pattern": pattern}, validate=False
    )
    return [
        ("extract_price", extract),
        ("impute", SimpleImputer(strategy="constant", fill_value=0)),
        ("min_max_scale", MinMaxScaler()),
    ]


def rating_agg_pipeline_steps():
    """
    Build the pipeline steps for the aggregated rating columns.

    Order: fill missing values with 0, then standardize.
    """
    fill_missing = SimpleImputer(strategy="constant", fill_value=0)
    standardize = StandardScaler()
    return [("impute", fill_missing), ("normalize", standardize)]


In [None]:
# Define the transformations for the columns
# Column names of the item rating aggregates: the part after ":" in each feature reference.
rating_agg_cols = [feature.split(":")[1] for feature in item_features]

# (name, transformer, columns) triples for the ColumnTransformer.
# A list selector (["title"]) hands the transformer a 2-D frame, a plain string
# selector ("description") hands it a 1-D Series — flatten_string_array_col
# asserts that it receives a Series, so the distinction matters here.
tfm = [
    ("main_category", OneHotEncoder(handle_unknown="ignore"), ["main_category"]),
    ("title", Pipeline(title_pipeline_steps()), ["title"]),
    ("description", Pipeline(description_pipeline_steps()), "description"),
    (
        "categories",
        Pipeline(categories_pipeline_steps()),
        "categories",
    ),  # Count Vectorizer for multi-label categorical
    (
        "price",
        Pipeline(price_pipeline_steps()),
        "price",
    ),  # Normalizing price
    # The rating-aggregate transformer is currently disabled:
    # (
    #     "rating_agg",
    #     Pipeline(rating_agg_pipeline_steps()),
    #     rating_agg_cols,
    # ),
]
# Metadata columns pulled from `metadata_raw_df` when merging into the interactions.
meta_cols = ["main_category", "title", "description", "categories", "price"]
cols = meta_cols + rating_agg_cols
cols

['main_category',
 'title',
 'description',
 'categories',
 'price',
 'parent_asin_rating_cnt_365d',
 'parent_asin_rating_avg_prev_rating_365d',
 'parent_asin_rating_cnt_90d',
 'parent_asin_rating_avg_prev_rating_90d',
 'parent_asin_rating_cnt_30d',
 'parent_asin_rating_avg_prev_rating_30d',
 'parent_asin_rating_cnt_7d',
 'parent_asin_rating_avg_prev_rating_7d']

In [None]:
def check_dup(df):
    """Assert that no (user, item, timestamp) combination occurs more than once in ``df``."""
    key_cols = [args.user_col, args.item_col, args.timestamp_col]
    n_duplicates = df.duplicated(subset=key_cols).sum()
    assert n_duplicates == 0

In [None]:
# Merge the item features into the interaction data
# Left joins on the item column keep every interaction, even when the item has
# no metadata row.
train_features_df = pd.merge(
    train_df, metadata_raw_df[[args.item_col] + meta_cols], how="left", on=args.item_col
)
val_features_df = pd.merge(
    val_df, metadata_raw_df[[args.item_col] + meta_cols], how="left", on=args.item_col
)
# The merge must not have multiplied any interaction (which would happen if an
# item appeared more than once in the metadata).
check_dup(train_features_df)
check_dup(val_features_df)
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,user_indice,item_indice,item_sequence,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
0,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10,1339432870,0.0,,0.0,,0.0,...,3931,2905,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Computers,Targus CleanVu Cleaning Pads Touch Screen Devi...,[The Targus CleanVu Cleaning Pads for iPad off...,"[Electronics, Television & Video, Accessories,...",6.99
1,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13,1343873053,3.0,5.000000,1.0,5.000000,1.0,...,3931,89,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2905]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1339432870]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 5]",Computers,Arkon Folding Tablet Stand Compatible with App...,[IPM-TAB1 is a highly versatile folding tablet...,"[Electronics, Computers & Accessories, Tablet ...",14.95
2,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46,1347726886,0.0,,0.0,,0.0,...,3931,758,"[-1, -1, -1, -1, -1, -1, -1, -1, 2905, 89]","[-1, -1, -1, -1, -1, -1, -1, -1, 1339432870, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 5, 5]",Amazon Devices,Amazon Kindle 9W PowerFast Adapter for Acceler...,[],"[Electronics, Computers & Accessories, Tablet ...",
3,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45,1357254525,4.0,4.250000,2.0,5.000000,2.0,...,3931,959,"[-1, -1, -1, -1, -1, -1, -1, 2905, 89, 758]","[-1, -1, -1, -1, -1, -1, -1, 1339432870, 13438...","[-1, -1, -1, -1, -1, -1, -1, 5, 5, 5]",Computers,amFilm (TM) Premium Screen Protector Film Clea...,[Introducing amFilm® Premium Screen Protector ...,"[Electronics, Computers & Accessories, Tablet ...",
4,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39,1367803479,3.0,4.333333,3.0,4.333333,1.0,...,3931,1096,"[-1, -1, -1, -1, -1, -1, 2905, 89, 758, 959]","[-1, -1, -1, -1, -1, -1, 1339432870, 134387305...","[-1, -1, -1, -1, -1, -1, 5, 5, 5, 5]",Computers,Seagate Wireless Plus 1TB Portable Hard Drive ...,"[With Seagate Wireless Plus, you can enjoy you...","[Electronics, Computers & Accessories, Data St...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127387,AFB4DWWKZBQFS22FAWDEP37EL2FA,B00KAF5RQ2,5.0,2016-02-22 17:44:10,1456163050,3.0,2.333333,0.0,,0.0,...,5059,1634,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Camera & Photo,STK EN-EL14 EN-EL14a Battery Pack for Nikon D3...,[The STK Nikon EN-EL14 battery is 100% compati...,"[Electronics, Camera & Photo, Accessories, Bat...",14.99
127388,AFB4DWWKZBQFS22FAWDEP37EL2FA,B001F6TXME,5.0,2016-02-22 17:44:40,1456163080,3.0,5.000000,2.0,5.000000,1.0,...,5059,219,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1634]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1456163050]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 0]",Camera & Photo,Nikon 25395 MC-DC2 Remote Release Cord (1 Meter),[Nikon MC-DC2 Remote Release Cord for Nikon Di...,"[Electronics, Camera & Photo, Accessories, Dig...",26.95
127389,AFB4DWWKZBQFS22FAWDEP37EL2FA,B007VGGIB6,5.0,2016-02-22 17:45:10,1456163110,7.0,2.714286,3.0,2.666667,0.0,...,5059,864,"[-1, -1, -1, -1, -1, -1, -1, -1, 1634, 219]","[-1, -1, -1, -1, -1, -1, -1, -1, 1456163050, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, 0]",Camera & Photo,Nikon WU-1a Wireless Mobile Adapter 27081 for ...,[The WU-1a is compatible with the following Ni...,"[Electronics, Camera & Photo, Accessories, Bat...",98.89
127390,AFB4DWWKZBQFS22FAWDEP37EL2FA,B00WUID73W,5.0,2016-02-22 17:45:37,1456163137,7.0,4.000000,0.0,,0.0,...,5059,2113,"[-1, -1, -1, -1, -1, -1, -1, 1634, 219, 864]","[-1, -1, -1, -1, -1, -1, -1, 1456163050, 14561...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]",Camera & Photo,eForCity Leather Hand Grip Strap Compatible wi...,[Product Description Cushioned camera grips fi...,"[Electronics, Camera & Photo, Bags & Cases, Bi...",


In [None]:
# papermill_description=fit-tfm-pipeline
# Apply the per-column transformers defined in `tfm`.
preprocessing_pipeline = ColumnTransformer(
    transformers=tfm, remainder="drop"  # Drop any columns not specified in transformers
)

# Create a pipeline object
item_metadata_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing_pipeline),
        (
            "normalizer",
            StandardScaler(),
        ),  # Standardize the numerical outputs; this is an important precondition for deep learning models
    ]
)

# Fit the pipeline
# Drop duplicated items so that the pipeline is fitted on one row per item only
fit_df = train_features_df.drop_duplicates(subset=[args.item_col])
item_metadata_pipeline.fit(fit_df)

In [None]:
# Reclaim memory
import gc

# Drop frames that are not referenced again below, then force a collection pass.
del fit_df
del train_df
del val_df
gc.collect()

819

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from tqdm.auto import tqdm


def chunk_transform(df, pipeline, chunk_size=1000):
    """
    Run ``pipeline.transform`` over ``df`` in row chunks and stack the results.

    Args:
        df: DataFrame to transform.
        pipeline: Fitted object exposing ``transform``.
        chunk_size: Number of rows per chunk.

    Returns:
        A single NumPy array with the chunk outputs stacked vertically, in
        row order. Sparse chunk outputs are converted to dense first.
    """
    n_rows = df.shape[0]
    dense_parts = []

    # tqdm wraps the chunk offsets so progress is visible for long runs.
    for offset in tqdm(range(0, n_rows, chunk_size), desc="Transforming chunks"):
        # iloc clips a slice end beyond the last row, so the final (shorter)
        # chunk needs no special handling.
        part = pipeline.transform(df.iloc[offset : offset + chunk_size])
        dense_parts.append(part.toarray() if issparse(part) else part)

    return np.vstack(dense_parts)


def parse_dt(df, cols=("timestamp",)):
    """
    Convert integer epoch-millisecond columns to pandas datetimes.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to convert. Defaults to ``("timestamp",)``.

    Returns:
        A new DataFrame in which each listed column is replaced by its
        datetime version; ``df`` itself is not modified.
    """
    # Convert each column eagerly. Lambdas created in a comprehension all read
    # the loop variable when they are *called*, so every column would end up
    # with the values of the last entry in `cols`.
    # "int64" is explicit because the plain `int` dtype can be 32-bit on some
    # platforms, which cannot hold millisecond epochs.
    converted = {
        col: pd.to_datetime(df[col].astype("int64"), unit="ms") for col in cols
    }
    return df.assign(**converted)


def handle_dtypes(df):
    """Return a copy of ``df`` whose ``rating`` column is cast to float."""
    rating_as_float = df["rating"].astype(float)
    return df.assign(rating=rating_as_float)


In [None]:
# papermill_description=chunk-transform
# Transform the data in chunks to avoid OOM
# Only the training frame is transformed here; the chunk size comes from the run config.
transformed_item_metadata = chunk_transform(
    train_features_df, item_metadata_pipeline, chunk_size=args.tfm_chunk_size
)

# Log the (rows, features) shape of the dense result.
logger.info(f"Transformed Item Metadata Shape: {transformed_item_metadata.shape}")

Transforming chunks:   0%|          | 0/26 [00:00<?, ?it/s]

[32m2025-06-23 13:53:01.067[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTransformed Item Metadata Shape: (127392, 626)[0m


In [None]:
logger.info("Checking stats...")
# Estimate the per-feature mean / std on a row sample.
# - The sample is seeded with the run's random seed so the check is reproducible.
# - The sample size is capped at the number of rows so small inputs do not raise.
stats_sample_df = pd.DataFrame(transformed_item_metadata).pipe(
    lambda df: df.sample(n=min(10000, len(df)), random_state=args.random_seed)
)
# Compute both statistics straight from the sample. Chaining them through
# `.assign(mean=..., std=...)` would let the freshly added `mean` column leak
# into the std computation, because assign evaluates its arguments in order.
transformed_df_stats = pd.DataFrame(
    {
        "mean": stats_sample_df.mean(axis=0),
        "std": stats_sample_df.std(axis=0),
    }
)
transformed_df_stats

[32m2025-06-23 13:54:04.991[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking stats...[0m


Unnamed: 0,mean,std
0,-0.000914,0.981541
1,-0.007469,0.694083
2,-0.018925,0.987723
3,0.259204,1.462195
4,-0.007391,0.878011
...,...,...
621,0.251963,0.898281
622,0.964547,3.576236
623,0.362030,1.054229
624,0.727273,3.153834


In [None]:
# One point per transformed feature: its sampled mean against its sampled std.
px.scatter(transformed_df_stats, x="mean", y="std")

In [None]:
# Loose sanity bounds on the standardized output: the average feature mean
# should be near 0 and the average feature std near 1.
assert (
    -1 < transformed_df_stats["mean"].mean() < 1
), "Transformed mean is not centered at 0"
assert (
    0 < transformed_df_stats["std"].mean() < 2
), "Transformed std is not centered at 1"

In [None]:
# Persist the fitted pipeline with dill.
with open("../data_for_ai/interim/item_metadata_pipeline_wo_user_item_manipulate.dill", "wb") as f:
    dill.dump(item_metadata_pipeline, f)

In [None]:
# Reload the pipeline from the file written above (round-trip check).
# NOTE: like pickle, dill.load can execute arbitrary code — only load trusted files.
with open("../data_for_ai/interim/item_metadata_pipeline_wo_user_item_manipulate.dill", "rb") as f:
    item_metadata_pipeline = dill.load(f)

In [None]:
# Output locations for the feature-enriched interaction frames.
train_persist_fp = "../data_for_ai/interim/train_sample_interactions_16407u_features.parquet"
val_persist_fp = "../data_for_ai/interim/val_sample_interactions_16407u_features.parquet"

# Write both splits without the DataFrame index.
train_features_df.to_parquet(train_persist_fp, index=False)
val_features_df.to_parquet(val_persist_fp, index=False)