# Feature engineering

## Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from typing import Optional

sys.path.insert(0, "..")

import dill
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.algo.ranker.utils import calc_sequence_timestamp_bucket
from src.algo.ranker.utils import pad_timestamp_sequence
from src.utils.embedding_id_mapper import IDMapper


In [3]:
class Args(BaseModel):
    """
    Run configuration for this notebook.

    Holds the run name, the column names used to address the interaction data, and a
    few numeric settings. Call `init()` once after construction to resolve the output
    directory.
    """

    run_name: str = "000-prep-data"
    # When True, `init` only computes the persist path and does not create the directory.
    testing: bool = True
    # Absolute output directory for this run. Unset until `init` derives it from
    # `run_name`, hence Optional (a plain `str` annotation would reject an explicit None).
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction data.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    tfm_chunk_size: int = 5000

    sequence_length: int = 10

    def init(self):
        """
        Resolve `notebook_persist_dp` and, outside of testing mode, create it on disk.

        The path is `data/<run_name>` made absolute against the current working
        directory.

        Returns:
            Args: this same instance, so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "run_name": "000-prep-data",
  "testing": true,
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-prep-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "tfm_chunk_size": 5000,
  "sequence_length": 10
}


## Load data

In [4]:
# Load the `raw_meta_Electronics` configuration of McAuley-Lab/Amazon-Reviews-2023
# through `datasets.load_dataset`.
# NOTE(review): `trust_remote_code=True` allows the dataset's own loading script to run
# locally — confirm the source is trusted before re-running.
metadata_raw = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Electronics", trust_remote_code=True
)
# Convert the "full" split to a pandas DataFrame and preview it.
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Electronics,FS-1051 FATSHARK TELEPORTER V3 HEADSET,3.5,6,[],[Teleporter V3 The “Teleporter V3” kit sets a ...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Fat Shark,"[Electronics, Television & Video, Video Glasses]","{""Date First Available"": ""August 2, 2014"", ""Ma...",B00MCW7G9M,,,
1,All Electronics,Ce-H22B12-S1 4Kx2K Hdmi 4Port,5.0,1,"[UPC: 662774021904, Weight: 0.600 lbs]",[HDMI In - HDMI Out],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",SIIG,"[Electronics, Television & Video, Accessories,...","{""Product Dimensions"": ""0.83 x 4.17 x 2.05 inc...",B00YT6XQSE,,,
2,Computers,Digi-Tatoo Decal Skin Compatible With MacBook ...,4.5,246,[WARNING: Please IDENTIFY MODEL NUMBER on the ...,[],19.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['AL 2Sides Video', 'MacBook Protect...",Digi-Tatoo,"[Electronics, Computers & Accessories, Laptop ...","{""Brand"": ""Digi-Tatoo"", ""Color"": ""Fresh Marble...",B07SM135LS,,,
3,AMAZON FASHION,NotoCity Compatible with Vivoactive 4 band 22m...,4.5,233,[☛NotoCity 22mm band is designed for Vivoactiv...,[],9.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",NotoCity,"[Electronics, Wearable Technology, Clips, Arm ...","{""Date First Available"": ""May 29, 2020"", ""Manu...",B089CNGZCW,,,
4,Cell Phones & Accessories,Motorola Droid X Essentials Combo Pack,3.8,64,"[New Droid X Essentials Combo Pack, Exclusive ...",[all Genuine High Quality Motorola Made Access...,14.99,"{'hi_res': [None, None, None, None, None], 'la...","{'title': [], 'url': [], 'user_id': []}",Verizon,"[Electronics, Computers & Accessories, Compute...","{""Product Dimensions"": ""11.6 x 6.9 x 3.1 inche...",B004E2Z88O,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1610007,Computers,"Wintec FileMate Pro USB Flash Drive, 3FMUSB32G...",5.0,1,"[32GB / 32 GB file storage, USB mass storage d...",[--New in retail packaging --Fast USB 2.0 data...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Wintec Industries,"[Electronics, Computers & Accessories, Data St...","{""Product Dimensions"": ""0.78 x 0.31 x 2.75 inc...",B003NUIU9M,,,
1610008,,Tsugar Noise Reduction Wireless Headphones Blu...,1.0,2,[High Fidelity Sound: Intelligent noise reduct...,[Description: 100% brand new high quality 1.Hi...,,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Tsugar,"[Electronics, Headphones, Earbuds & Accessorie...","{""Best Sellers Rank"": {""Electronics"": 547760, ...",B0BHVY33TL,,,
1610009,,"Hardshell Case for MacBook Pro (16-inch, 2021)...",4.6,11,"[Compatible with MacBook Pro 16-inch (2021), I...",[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Incase Designs,"[Electronics, Computers & Accessories, Laptop ...","{""Product Dimensions"": ""9.88 x 0.94 x 14.13 in...",B09SQGRFFH,,,
1610010,Computers,"FYY 12-13.3"" Laptop Sleeve Case Bag, PU Leathe...",4.0,35,[【Compatibility】FYY laptop Bag sleeve perfect ...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",FYY,"[Electronics, Computers & Accessories, Laptop ...","{""Standing screen display size"": ""12.3 Inches""...",B091JWCSG5,,,


In [5]:
# Show the text-like fields of a single item (positional row 6) without column-width
# truncation, to get a feel for what the raw metadata looks like.
with pd.option_context("display.max_colwidth", None):
    display(
        metadata_raw_df.iloc[[6]][
            [
                "title",
                "main_category",
                "categories",
                "features",
                "description",
                "store",
                "details",
            ]
        ]
    )

Unnamed: 0,title,main_category,categories,features,description,store,details
6,"QGHXO Band for Garmin Vivofit 4, Soft Silicone Replacement Watch Band Strap for Garmin Vivofit 4 Activity Tracker, Small, Large, Ten Colors (5PCS Bands-Girl, Large)",Cell Phones & Accessories,"[Electronics, Wearable Technology, Arm & Wristband Accessories]","[Personalized Your Garmin Vivofit 4 Activity Tracker with this refined replacement wrist band, Small fits wrists with a circumference of 122-188mm. Large fits wrists with a circumference of 148-215mm, Easy and direct installation and removal. Replacement Bands Only! Garmin device NOT included, Garmin Vivofit 4 Buckle Bracelet. Never lose your Garmin Vivofit 4. Fix the tracker fall off problem, Soft silicone with smooth finish for a sporty look, metal parts made with high quality stainless steel]","[Compatibility, Custom designed for your precious, Garmin Vivofit 4, Activity Tracker, this Garmin Watch Sport Band features a combination of functionality and style. Fit for, Garmin Vivofit 4, Activity Tracker ONLY. NOT for Garmin Vivofit 1/Garmin Vivofit 2/Garmin Vivofit 3., Feature, Material: Silicone. NOTE: Replacement Bands Only! Small fits wrists with a circumference of 122-188mm. Large fits wrists with a circumference of 148-215mm. Models for selection: For Garmin Vivofit 4 Activity Tracker Only. Contracted design style, with you life contracted and not simple., Package Included, Soft Silicone Replacement Watch Band Strap for Garmin Vivofit 4 Activity Tracker (No Tracker)]",QGHXO,"{""Package Dimensions"": ""6.85 x 4.37 x 1.1 inches"", ""Item Weight"": ""2.64 ounces"", ""Item model number"": ""GM-VF4-L14GIRL"", ""Best Sellers Rank"": {""Electronics"": 317736, ""Smart Arm & Wristband Accessories"": 12926}, ""Is Discontinued By Manufacturer"": ""No"", ""Special features"": ""activity tracker"", ""Other display features"": ""Sports"", ""Color"": ""5PCS Bands-Girl"", ""Manufacturer"": ""QGHXO"", ""Date First Available"": ""March 17, 2018""}"


In [6]:
# Read the sampled train / validation interactions produced by an earlier step.
train_df = pd.read_parquet("../data_for_ai/interim/train_sample_interactions_16407u_seq.parquet")
val_df = pd.read_parquet("../data_for_ai/interim/val_sample_interactions_16407u_seq.parquet")
# Stack both splits (train rows first) so features are looked up once for all rows.
# `timestamp_unix` is the timestamp as whole seconds; the `// 10**9` assumes the int64
# view of the column is in nanoseconds — TODO confirm the parquet datetime unit.
# NOTE(review): the concat keeps each split's own index, so row labels repeat.
full_df = (
    pd.concat([train_df, val_df], axis=0)
    .assign(timestamp_unix=lambda df: df[args.timestamp_col].astype("int64") // 10**9)
)

In [7]:
# Preview the combined train + validation interactions.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,timestamp_unix
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1043292495
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1069783929
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1087524177
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1095106724
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1098455172
...,...,...,...,...,...,...,...,...
3474,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 9...",1645234371
3475,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625,...",1645289397
3476,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,"[-1, 99, 1265, 3514, 2271, 3451, 3827, 2504, 4...",1645289813
3477,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]",1645291735


In [8]:
# Load the persisted ID mapper (raw user/item IDs <-> integer indices) and spot-check it
# by looking up the raw user ID stored at index 1.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
idm.get_user_id(1)

4817 items in the dataset


'AE227WAM4NWQPJI33OPN7ZARNNZQ'

## Load features from Feature Store

In [9]:
# Open the Feast feature store described by the local feature repository.
store = FeatureStore(
    repo_path="../feature_pipeline/feature_store/feature_repo",
)

In [10]:
# Item-level features to request, written as "<feature view>:<feature name>".
# Judging by the names, these are rating counts and average previous ratings over
# 365 / 90 / 30 / 7 day windows — confirm against the feature repo definitions.
item_features = [
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_365d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_365d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_90d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_90d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_30d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_30d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_cnt_7d",
    "parent_asin_rating_stats_fresh:parent_asin_rating_avg_prev_rating_7d",
]

In [11]:
# Fetch the item features for every distinct (item, timestamp) pair in the interactions.
# The `timestamp` column serves as the event timestamp of the lookup (Feast reports
# this in the cell output).
features_df = store.get_historical_features(
    full_df[[args.item_col, args.timestamp_col]].drop_duplicates(), 
    item_features
).to_df()



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.


In [12]:
# The lookup must not return fully duplicated rows. The assertion "message" is a
# `display(...)` call: it only runs when the check fails and then shows the offending
# rows (the AssertionError message itself will be None, since display returns None).
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])
# Preview the fetched features, newest first.
features_df.sort_values(args.timestamp_col, ascending=False)

Unnamed: 0,parent_asin,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d
130870,B091K4WYD1,2022-02-19 22:08:53.253,,,,,,,,
130869,B0C682GZ5X,2022-02-19 17:28:55.519,,,,,,,,
32798,B00007KDX6,2022-02-19 16:56:53.030,,,,,,,,
97990,B07TMJ8S5Z,2022-02-19 16:49:57.966,,,,,,,,
32797,B07QN33986,2022-02-19 01:32:51.519,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
32800,B00WUI8JN0,2004-10-22 14:26:12.000,0.0,,0.0,,0.0,,0.0,
65222,B00008SCFL,2004-09-13 20:18:44.000,1.0,5.0,0.0,,0.0,,0.0,
65221,B07KQWX947,2004-06-18 02:02:57.000,0.0,,0.0,,0.0,,0.0,
32799,B00008SCFL,2003-11-25 18:12:09.000,0.0,,0.0,,0.0,,0.0,


In [13]:
# Attach the item features to each interaction by (item, timestamp). A left join keeps
# every interaction row even when no feature values came back for it.
full_features_df = pd.merge(
    full_df, features_df, on=[args.item_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1043292495,0.0,,0.0,,0.0,,0.0,
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1069783929,0.0,,0.0,,0.0,,0.0,
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1087524177,0.0,,0.0,,0.0,,0.0,
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1095106724,1.0,5.0,0.0,,0.0,,0.0,
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1098455172,0.0,,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130866,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 9...",1645234371,,,,,,,,
130867,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625,...",1645289397,,,,,,,,
130868,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,"[-1, 99, 1265, 3514, 2271, 3451, 3827, 2504, 4...",1645289813,,,,,,,,
130869,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]",1645291735,,,,,,,,


In [14]:
# User-level features to request, written as "<feature view>:<feature name>".
# The two `list_10_recent_*` features come back as comma-separated strings (see the
# eyeball check further down in the notebook output).
user_features = [
    "user_rating_stats_fresh:user_rating_cnt_90d",
    "user_rating_stats_fresh:user_rating_avg_prev_rating_90d",
    "user_rating_stats_fresh:user_rating_list_10_recent_asin",
    "user_rating_stats_fresh:user_rating_list_10_recent_asin_timestamp",
]

# NOTE: `features_df` is rebound here; from this point on it holds the user features,
# not the item features fetched earlier.
features_df = store.get_historical_features(full_df[[args.user_col, args.timestamp_col]].drop_duplicates(), user_features).to_df()
# On failure the `display(...)` in the message slot shows the duplicated rows.
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])

Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.


In [15]:
# Attach the user features to each interaction by (user, timestamp), again with a left
# join so no interaction row is dropped.
# NOTE(review): this cell rebinds `full_features_df` from itself, so re-running it alone
# merges the user feature columns a second time.
full_features_df = pd.merge(
    full_features_df, features_df, on=[args.user_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1043292495,0.0,,0.0,,0.0,,0.0,,1.0,,,
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1069783929,0.0,,0.0,,0.0,,0.0,,1.0,,,
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1087524177,0.0,,0.0,,0.0,,0.0,,1.0,,,
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1095106724,1.0,5.0,0.0,,0.0,,0.0,,1.0,,,
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1098455172,0.0,,0.0,,0.0,,0.0,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130866,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[-1, -1, 2627, 4216, 4743, 1945, 2355, 1831, 9...",1645234371,,,,,,,,,,,,
130867,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[-1, -1, -1, -1, 2260, 3517, 3609, 3495, 3625,...",1645289397,,,,,,,,,,,,
130868,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,"[-1, 99, 1265, 3514, 2271, 3451, 3827, 2504, 4...",1645289813,,,,,,,,,,,,
130869,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[-1, -1, -1, -1, -1, 1812, 4165, 4575, 4807, 374]",1645291735,,,,,,,,,,,,


In [16]:
# Manual sanity check on a single user (the one at positional row 3): walking through
# their interactions in time order, the "10 most recent" item list and its timestamp
# list should grow by exactly the previously rated item at each step.
user_id = full_features_df[args.user_col].iloc[3]
# Plain string literal: the message has no placeholders, so no f-string is needed.
logger.info("Eye-balling if the features are correct...")
full_features_df.loc[lambda df: df[args.user_col].eq(user_id)].sort_values(
    args.timestamp_col
)[
    [
        args.user_col,
        args.timestamp_col,
        "timestamp_unix",
        args.item_col,
        "user_rating_list_10_recent_asin",
        "user_rating_list_10_recent_asin_timestamp",
    ]
].head(10)

[32m2025-07-01 00:11:00.846[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mEye-balling if the features are correct...[0m


Unnamed: 0,user_id,timestamp,timestamp_unix,parent_asin,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2004-09-13 20:18:44,1095106724,B00008SCFL,,
199,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2008-07-11 23:57:54,1215820674,B000C1Z0HA,B00008SCFL,1095106724
278,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2008-12-27 21:30:54,1230413454,B00KR4XB64,"B00008SCFL,B000C1Z0HA",10951067241215820674
489,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2009-10-11 17:01:28,1255280488,B002FU5QMK,"B00008SCFL,B000C1Z0HA,B00KR4XB64",109510672412158206741230413454
1941,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2011-08-18 17:27:24,1313688444,B0054L8N7M,"B00008SCFL,B000C1Z0HA,B00KR4XB64,B002FU5QMK",1095106724121582067412304134541255280488
2171,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2011-10-14 18:51:10,1318618270,B005IMB5SG,"B00008SCFL,B000C1Z0HA,B00KR4XB64,B002FU5QMK,B0...","1095106724,1215820674,1230413454,1255280488,13..."
2548,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2011-12-15 06:18:29,1323929909,B01FU08V64,"B00008SCFL,B000C1Z0HA,B00KR4XB64,B002FU5QMK,B0...","1095106724,1215820674,1230413454,1255280488,13..."
3535,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2012-05-17 18:50:30,1337280630,B00WMM48Q6,"B00008SCFL,B000C1Z0HA,B00KR4XB64,B002FU5QMK,B0...","1095106724,1215820674,1230413454,1255280488,13..."
3756,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2012-06-18 08:01:41,1340006501,B0031GK3OI,"B00008SCFL,B000C1Z0HA,B00KR4XB64,B002FU5QMK,B0...","1095106724,1215820674,1230413454,1255280488,13..."
5414,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,2012-12-14 22:01:08,1355522468,B008J0Z9TA,"B00008SCFL,B000C1Z0HA,B00KR4XB64,B002FU5QMK,B0...","1095106724,1215820674,1230413454,1255280488,13..."


In [17]:
# Run the frame through the ID mapper's `map_indices` with the user and item column
# names — presumably this (re)derives the integer index columns; verify in IDMapper.
full_features_df = full_features_df.pipe(idm.map_indices, args.user_col, args.item_col)

In [18]:
def convert_asin_to_idx(inp: str, sequence_length=10, padding_value=-1):
    """
    Convert a comma-separated string of item IDs into a fixed-length index sequence.

    Each ID is mapped to its integer index through the notebook-level `idm` mapper.
    The result is left-padded with `padding_value`, so the last position holds the
    last ID of the input string.

    Args:
        inp: Comma-separated item IDs, e.g. "B00008SCFL,B000C1Z0HA". A missing value
            (None, NaN / pd.NA, or an empty string) means "no history".
        sequence_length: Length of the returned sequence.
        padding_value: Filler used for the leading, unused positions.

    Returns:
        For a missing input, a plain list of `sequence_length` padding values.
        Otherwise a NumPy array of length `sequence_length` holding the mapped
        indices, right-aligned. (The two return types differ on purpose: this mirrors
        the original behaviour that downstream code already handles.)
    """
    # Missing history can arrive as None, but after DataFrame merges it may equally be
    # NaN / pd.NA, and an empty string carries no IDs either. The isinstance guard keeps
    # pd.isna from being called on an actual string.
    if inp is None or (not isinstance(inp, str) and pd.isna(inp)) or inp == "":
        return [padding_value] * sequence_length

    indices = [idm.get_item_index(item_id) for item_id in inp.split(",")]

    # A history longer than the window would make the pad width negative, which np.pad
    # rejects. The input lists items oldest-first, so keep the tail (most recent items).
    overflow = len(indices) - sequence_length
    if overflow > 0:
        indices = indices[overflow:]

    padding_needed = sequence_length - len(indices)
    output = np.pad(
        indices,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )
    return output

In [19]:
    
# Derive the model's sequence inputs from the feature-store strings:
#   item_sequence           - recent item IDs mapped to indices, left-padded
#   item_sequence_ts        - the matching timestamps, padded by `pad_timestamp_sequence`
#   item_sequence_ts_bucket - per-row buckets from `calc_sequence_timestamp_bucket`
# `assign` evaluates the lambdas in order, so the row-wise bucket step can already see
# the two columns created before it.
# NOTE(review): this replaces the `item_sequence` column that came with the parquet
# files. In the rendered output the last rows had real sequences before this step and
# are all padding afterwards, because their user features came back empty — confirm
# that this is intended.
# NOTE(review): `convert_asin_to_idx` runs with its default length of 10 rather than
# `args.sequence_length`; the two agree today.
full_features_df = full_features_df.assign(
    item_sequence=lambda df: df["user_rating_list_10_recent_asin"].apply(
        convert_asin_to_idx
    ),
    item_sequence_ts=lambda df: df["user_rating_list_10_recent_asin_timestamp"].apply(
        pad_timestamp_sequence
    ),
    item_sequence_ts_bucket=lambda df: df.apply(calc_sequence_timestamp_bucket, axis=1),
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,...,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1043292495,0.0,,...,0.0,,0.0,,1.0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1069783929,0.0,,...,0.0,,0.0,,1.0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1087524177,0.0,,...,0.0,,0.0,,1.0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1095106724,1.0,5.0,...,0.0,,0.0,,1.0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1098455172,0.0,,...,0.0,,0.0,,1.0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130866,AEKUF6AOVWDWFYOKPWO2CV72PEDQ,B07QN33986,5.0,2022-02-19 01:32:51.519,2171,3626,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1645234371,,,...,,,,,,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
130867,AFBTD25HPE4BE4LUFV3DTI2E2N2A,B07TMJ8S5Z,5.0,2022-02-19 16:49:57.966,5159,3699,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1645289397,,,...,,,,,,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
130868,AEEQZRQBOFHFBFPYBX2BZ5WOI33A,B00007KDX6,5.0,2022-02-19 16:56:53.030,1396,32,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1645289813,,,...,,,,,,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
130869,AHLN6GKTKZE22AON34YAQXTGK63A,B0C682GZ5X,5.0,2022-02-19 17:28:55.519,14550,4772,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1645291735,,,...,,,,,,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"


In [20]:
# Split back

# Split the enriched frame back into train / validation. The original split was by time,
# so the earliest validation timestamp is the boundary: strictly earlier rows are train,
# the rest are validation. Taking `.min()` directly on the datetime column avoids a round
# trip through integers, which depends on the platform's default int width and on the
# column's datetime unit.
val_timestamp = val_df[args.timestamp_col].min()

# Remember the original sizes before the names are rebound to the enriched frames.
train_df_length = train_df.shape[0]
val_df_length = val_df.shape[0]

train_df = full_features_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
assert (
    train_df.shape[0] == train_df_length
), f"train split has {train_df.shape[0]} rows after enrichment, expected {train_df_length}"

val_df = full_features_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]
assert (
    val_df.shape[0] == val_df_length
), f"val split has {val_df.shape[0]} rows after enrichment, expected {val_df_length}"

In [21]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler


def reshape_2d_to_1d(X):
    """
    Flatten the input into a 1-D NumPy array.

    Used ahead of the TF-IDF step of the title pipeline, which needs a flat sequence of
    documents rather than a 2-D block.
    """
    as_array = np.array(X)
    return as_array.reshape(-1)


def flatten_string_array_col(X):
    """
    Collapse a Series of string lists into one newline-joined string per row.

    Missing entries are replaced by an empty string first, so they come out as "".
    The input must be a pandas Series; the result is returned as a NumPy array with
    one element per input row.
    """
    assert isinstance(X, pd.Series)
    without_missing = X.fillna("")
    joined = without_missing.str.join("\n")
    assert X.shape[0] == joined.shape[0]
    return joined.values


def todense(X):
    """Return the dense form of `X` (via its `todense()` method) as a plain NumPy array."""
    dense = X.todense()
    return np.asarray(dense)


def title_pipeline_steps():
    """
    Build the ordered `(name, transformer)` steps for the title column.

    Missing titles become empty strings, the block is flattened to 1-D, vectorised with
    TF-IDF over unigrams and bigrams, and finally converted to a dense array.
    """
    fill_missing = SimpleImputer(strategy="constant", fill_value="")
    to_flat = FunctionTransformer(reshape_2d_to_1d, validate=False)
    vectorizer = TfidfVectorizer(min_df=5, max_features=1000, ngram_range=(1, 2))
    densify = FunctionTransformer(todense, validate=False)
    return [
        ("impute", fill_missing),
        ("reshape", to_flat),
        ("tfidf", vectorizer),
        ("todense", densify),
    ]


def description_pipeline_steps():
    """
    Build the ordered `(name, transformer)` steps for the description column.

    Each row's list of sentences is joined into one string, vectorised with TF-IDF over
    unigrams and bigrams, and converted to a dense array.
    """
    join_sentences = FunctionTransformer(flatten_string_array_col, validate=False)
    vectorizer = TfidfVectorizer(min_df=5, max_features=1000, ngram_range=(1, 2))
    densify = FunctionTransformer(todense, validate=False)
    return [
        ("flatten_string_array_col", join_sentences),
        ("tfidf", vectorizer),
        ("todense", densify),
    ]


def tokenizer(s):
    """Split a newline-joined string back into its individual entries."""
    entries = s.split("\n")
    return entries


def categories_pipeline_steps():
    """
    Build the ordered `(name, transformer)` steps for the categories column.

    Each row's category list is joined with newlines, then counted by a vectoriser whose
    tokenizer splits on those newlines (so each whole category is one token), and the
    result is converted to a dense array.
    """
    join_categories = FunctionTransformer(flatten_string_array_col, validate=False)
    counter = CountVectorizer(tokenizer=tokenizer, token_pattern=None)
    densify = FunctionTransformer(todense, validate=False)
    return [
        ("flatten_string_array_col", join_categories),
        ("count_vect", counter),
        ("todense", densify),
    ]


def price_parse_dtype(series, pattern):
    """
    Extract the capture group(s) of `pattern` from a string Series and cast them to float.

    Returns a DataFrame (one column per capture group); rows without a match hold NaN.
    """
    extracted = series.str.extract(pattern)
    return extracted.astype(float)


def price_pipeline_steps(price_pattern=None):
    """
    Build the ordered `(name, transformer)` steps for the price column.

    The first number found in each price string is parsed to float, missing values are
    filled with 0, and the result is min-max scaled.

    Args:
        price_pattern: Regex with one capture group used to pull the number out of the
            price string. Defaults to a pattern matching a decimal or integer number.
    """
    pattern = (
        r"\b((?:\d+\.\d*)|(?:\d+))\b" if price_pattern is None else price_pattern
    )
    extractor = FunctionTransformer(
        price_parse_dtype, kw_args={"pattern": pattern}, validate=False
    )
    return [
        ("extract_price", extractor),
        ("impute", SimpleImputer(strategy="constant", fill_value=0)),
        ("min_max_scale", MinMaxScaler()),
    ]


def rating_agg_pipeline_steps():
    """
    Build the ordered `(name, transformer)` steps for the rating aggregate columns.

    Missing values are filled with 0, then every column is standardised.
    """
    fill_missing = SimpleImputer(strategy="constant", fill_value=0)
    standardize = StandardScaler()
    return [("impute", fill_missing), ("normalize", standardize)]


In [22]:
# Define the transformations for the columns
# Feature references look like "<view>:<name>"; the part after the colon is the column
# name as it appears in the fetched DataFrame.
rating_agg_cols = [feature.split(":")[1] for feature in item_features]

# (name, transformer, column selector) triples, in the shape a ColumnTransformer takes.
# "title" is selected with a list while "description"/"categories"/"price" use a bare
# column name: the pipelines for the latter call Series-only helpers (`.str`, an
# isinstance check), so they rely on receiving a 1-D Series rather than a 2-D frame.
tfm = [
    ("main_category", OneHotEncoder(handle_unknown="ignore"), ["main_category"]),
    ("title", Pipeline(title_pipeline_steps()), ["title"]),
    ("description", Pipeline(description_pipeline_steps()), "description"),
    (
        "categories",
        Pipeline(categories_pipeline_steps()),
        "categories",
    ),  # Count Vectorizer for multi-label categorical
    (
        "price",
        Pipeline(price_pipeline_steps()),
        "price",
    ),  # Normalizing price
    # NOTE(review): the rating aggregate transformer below is disabled, yet
    # `rating_agg_cols` is still included in `cols` further down — confirm how those
    # columns are meant to be handled downstream.
    # (
    #     "rating_agg",
    #     Pipeline(rating_agg_pipeline_steps()),
    #     rating_agg_cols,
    # ),
]
# Item metadata columns taken from the raw metadata frame, followed by the rating
# aggregate columns fetched from the feature store.
meta_cols = ["main_category", "title", "description", "categories", "price"]
cols = meta_cols + rating_agg_cols
cols

['main_category',
 'title',
 'description',
 'categories',
 'price',
 'parent_asin_rating_cnt_365d',
 'parent_asin_rating_avg_prev_rating_365d',
 'parent_asin_rating_cnt_90d',
 'parent_asin_rating_avg_prev_rating_90d',
 'parent_asin_rating_cnt_30d',
 'parent_asin_rating_avg_prev_rating_30d',
 'parent_asin_rating_cnt_7d',
 'parent_asin_rating_avg_prev_rating_7d']

In [23]:
def check_dup(df):
    """Assert that no (user, item, timestamp) combination is repeated in ``df``.

    The key column names are read from the notebook-level ``args`` object
    (``user_col``, ``item_col``, ``timestamp_col``).

    Raises:
        AssertionError: if at least one duplicated key row exists.
    """
    key_cols = [args.user_col, args.item_col, args.timestamp_col]
    n_duplicated = df[key_cols].duplicated().sum()
    assert n_duplicated == 0

In [24]:
# Left-join the item metadata columns onto every interaction row, keyed on the
# item id column.
item_meta_df = metadata_raw_df[[args.item_col] + meta_cols]
train_features_df = train_df.merge(item_meta_df, how="left", on=args.item_col)
val_features_df = val_df.merge(item_meta_df, how="left", on=args.item_col)

# The join must not have introduced duplicated (user, item, timestamp) rows.
check_dup(train_features_df)
check_dup(val_features_df)
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,...,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
0,AFZ4EK2LJ655XQKTEUELCARO6RYA,B00002EQCW,4.0,2003-01-23 03:28:15.000,8071,4,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1043292495,0.0,,...,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",All Electronics,NETGEAR FS105NA - Discontinued by Manufacturer,"[Product Description, The NETGEAR FS105NA Unma...","[Electronics, Computers & Accessories, Network...",57.89
1,AFY2C4YOUP2SSMM43HD2L3FIEFZA,B00008SCFL,5.0,2003-11-25 18:12:09.000,7935,36,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1069783929,0.0,,...,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",All Electronics,Netgear WGR614 Wireless-G Router,"[Product Description, NETGEAR's Cable/DSL 54 M...","[Electronics, Computers & Accessories, Network...",39.5
2,AHF3TGIOSTD2UCHF3MO4MIHFJ5NQ,B07KQWX947,5.0,2004-06-18 02:02:57.000,13705,3514,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1087524177,0.0,,...,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",All Electronics,Koss Porta Pro Black On Ear Headphones with Ca...,[New porta Pro active lifestyle headphone in B...,"[Electronics, Headphones, Earbuds & Accessorie...",39.99
3,AH5Z47PJ5RTSUL2RLCO2QITGIT4Q,B00008SCFL,4.0,2004-09-13 20:18:44.000,12730,36,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1095106724,1.0,5.000000,...,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",All Electronics,Netgear WGR614 Wireless-G Router,"[Product Description, NETGEAR's Cable/DSL 54 M...","[Electronics, Computers & Accessories, Network...",39.5
4,AEX3L4NKDESOCGWOFNF63GRFGXCA,B00WUI8JN0,5.0,2004-10-22 14:26:12.000,3735,2103,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",1098455172,0.0,,...,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Camera & Photo,Canon Advanced Two Lens Kit with 50mm f/1.4 an...,[Advance to a new realm of image quality with ...,"[Electronics, Camera & Photo, Lenses, Camera L...",1179.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127387,AFYJD6ZQUSCLITYNXRCUM3VNA5FA,B07H65KP63,5.0,2020-12-26 20:31:17.604,7998,3454,"[-1, -1, -1, -1, -1, -1, 3470, 3381, 3089, 3452]",1609014677,138.0,4.456522,...,,"B07HYK77H3,B07F4P3JH7,B075X8471B,B07H5S7X4X",1446574200154845854815484590471562273928,"[-1, -1, -1, -1, -1, -1, 1446574200, 154845854...","[-1, -1, -1, -1, -1, -1, 8, 6, 6, 6]",Amazon Devices,"Echo Dot (3rd Gen, 2018 release) - Smart speak...",[],[],
127388,AEI4EB4JE4EH4FXWDRU5RVUQ7QYA,B07MLY3JKV,3.0,2020-12-26 20:34:25.197,1828,3545,"[-1, -1, -1, -1, -1, 3809, 2694, 3171, 4710, 3...",1609014865,59.0,4.372881,...,,"B081C4XWXZ,B01K8B8YA8,B078H4YD2L,B0C1JGLV8T,B0...","1332075091,1481768860,1591394999,1591395198,15...","[-1, -1, -1, -1, -1, 1332075091, 1481768860, 1...","[-1, -1, -1, -1, -1, 8, 7, 5, 5, 5]",Amazon Devices,Echo Flex - Plug-in mini smart speaker with Alexa,[],[],
127389,AES2U6KIAORYLTBPENQWMDVALTDQ,B07ZZVX1F2,5.0,2020-12-26 21:37:58.968,3109,3800,"[-1, -1, -1, -1, -1, -1, 2237, 2694, 934, 3443]",1609018678,19.0,4.473684,...,5.0,"B010BWYDYA,B01K8B8YA8,B008GVOVK0,B07GZFM1ZM",1446944671151543956215265031001608037276,"[-1, -1, -1, -1, -1, -1, 1446944671, 151543956...","[-1, -1, -1, -1, -1, -1, 8, 6, 6, 4]",Amazon Devices,Fire TV Stick with Alexa Voice Remote (include...,[],[],
127390,AGU6SDEIMLBQZII2FVFJ6YIUZRKQ,B0BSF5LM3J,5.0,2020-12-26 22:29:54.459,11489,4622,"[4288, 58, 4531, 4616, 3293, 107, 2858, 4109, ...",1609021794,3.0,5.000000,...,,"B09RS2KZK4,B0002MQGK4,B0BJ13Q5JG,B0BS2ZMHCL,B0...","1439768754,1441664091,1441664113,1450455030,14...","[1439768754, 1441664091, 1441664113, 145045503...","[8, 8, 8, 8, 8, 7, 7, 6, 6, 5]",All Electronics,J-Tech Digital Premium Quality 1080P HDMI to H...,[],"[Electronics, Television & Video, Accessories,...",25.95


In [25]:
# papermill_description=fit-tfm-pipeline
# Route each configured column through its transformer; any column that is not
# named in `tfm` is dropped.
preprocessing_pipeline = ColumnTransformer(transformers=tfm, remainder="drop")

# Standardize the combined numerical output after preprocessing, since normalized
# inputs are an important precondition for Deep Learning models.
item_metadata_pipeline = Pipeline(
    [
        ("preprocessing", preprocessing_pipeline),
        ("normalizer", StandardScaler()),
    ]
)

# Fit on one row per item so the pipeline only learns from unique item features
# instead of weighting items by how often they were interacted with.
fit_df = train_features_df.drop_duplicates(subset=[args.item_col])
item_metadata_pipeline.fit(fit_df)

0,1,2
,steps,"[('preprocessing', ...), ('normalizer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('main_category', ...), ('title', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,''
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function res...x7e2e8d798cc0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,func,<function tod...x7e2e7a4f1d00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function fla...x7e2e8f7698a0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,func,<function tod...x7e2e7a4f1d00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function fla...x7e2e8f7698a0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,<function tok...x7e2e7aa24040>
,stop_words,
,token_pattern,
,ngram_range,"(1, ...)"

0,1,2
,func,<function tod...x7e2e7a4f1d00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function pri...x7e2e7aa24180>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,{'pattern': '\\b((?:\\d+\\.\\d*)|(?:\\d+))\\b'}
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [26]:
# Reclaim memory: these frames are no longer referenced by the cells below.
import gc

del fit_df, train_df, val_df
gc.collect()

47

In [27]:
import numpy as np
import pandas as pd
from scipy.sparse import issparse
from tqdm.auto import tqdm


def chunk_transform(df, pipeline, chunk_size=1000):
    """Apply ``pipeline.transform`` to ``df`` in row chunks and stack the results.

    Args:
        df: DataFrame to transform; sliced positionally with ``iloc``.
        pipeline: Object exposing ``transform(chunk_df)``.
        chunk_size: Maximum number of rows handed to ``transform`` at once.

    Returns:
        A single NumPy array built by vertically stacking the per-chunk
        outputs. Sparse chunk outputs are densified before stacking.
    """
    n_rows = df.shape[0]
    offsets = tqdm(range(0, n_rows, chunk_size), desc="Transforming chunks")

    def _as_dense(block):
        # Sparse outputs are converted so every chunk stacks as a plain array.
        return block.toarray() if issparse(block) else block

    # `iloc` clips a slice that runs past the last row, so the final (shorter)
    # chunk needs no special handling.
    dense_blocks = [
        _as_dense(pipeline.transform(df.iloc[offset : offset + chunk_size]))
        for offset in offsets
    ]

    return np.vstack(dense_blocks)


def parse_dt(df, cols=("timestamp",)):
    """Return a copy of ``df`` with the given columns parsed as datetimes.

    Each listed column is cast to ``int`` and interpreted as a number of
    milliseconds (``pd.to_datetime(..., unit="ms")``).

    Args:
        df: Input DataFrame; it is not modified.
        cols: Iterable of column names to convert. Names must be strings
            because they are passed to ``DataFrame.assign`` as keywords.

    Returns:
        A new DataFrame in which every column in ``cols`` holds datetimes.
    """
    # Convert each column eagerly from its OWN values. The earlier version built
    # one lambda per column inside a comprehension; all of them read the loop
    # variable only when `assign` called them, so with several columns every
    # one was parsed from the last column in `cols`.
    parsed_cols = {
        col: pd.to_datetime(df[col].astype(int), unit="ms") for col in cols
    }
    return df.assign(**parsed_cols)


def handle_dtypes(df):
    """Return a copy of ``df`` whose ``rating`` column is cast to float.

    The input frame is left unchanged; all other columns are carried over
    as they are.
    """
    rating_as_float = df["rating"].astype(float)
    return df.assign(rating=rating_as_float)


In [28]:
# papermill_description=chunk-transform
# Run the fitted pipeline over the training frame chunk by chunk to avoid OOM
transformed_item_metadata = chunk_transform(
    df=train_features_df,
    pipeline=item_metadata_pipeline,
    chunk_size=args.tfm_chunk_size,
)

logger.info(f"Transformed Item Metadata Shape: {transformed_item_metadata.shape}")

Transforming chunks:   0%|          | 0/26 [00:00<?, ?it/s]

[32m2025-07-01 00:11:21.695[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTransformed Item Metadata Shape: (127392, 2618)[0m


In [29]:
logger.info("Checking stats...")

# Sample rows reproducibly (seeded from the notebook config) and never ask for
# more rows than exist, so small runs do not fail on the fixed sample size.
stats_sample_size = min(10000, transformed_item_metadata.shape[0])
stats_sample_df = pd.DataFrame(transformed_item_metadata).sample(
    n=stats_sample_size, random_state=args.random_seed
)

# Per-feature mean and std, both computed from the sampled rows only.
# `DataFrame.assign` evaluates callables one after another on the frame it is
# building, so chaining `mean=...` then `std=...` on the transposed sample made
# the std include the freshly added `mean` column as an extra observation.
transformed_df_stats = pd.DataFrame(
    {
        "mean": stats_sample_df.mean(axis=0),
        "std": stats_sample_df.std(axis=0),
    }
)
transformed_df_stats

[32m2025-07-01 00:11:21.719[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mChecking stats...[0m


Unnamed: 0,mean,std
0,-0.016947,0.566806
1,-0.000527,0.981533
2,-0.024849,0.983774
3,0.270339,1.477800
4,-0.016707,0.694233
...,...,...
2613,-0.006063,0.931289
2614,-0.026962,0.694533
2615,-0.009272,0.870187
2616,0.003095,1.060132


In [30]:
# One point per transformed feature: sampled mean on x, sampled std on y.
px.scatter(data_frame=transformed_df_stats, x="mean", y="std")

In [31]:
# Sanity-check the standardization: averaged over all features, the sampled mean
# should sit near 0 and the sampled standard deviation near 1.
assert (
    -1 < transformed_df_stats["mean"].mean() < 1
), "Transformed mean is not centered at 0"
assert (
    0 < transformed_df_stats["std"].mean() < 2
), "Transformed std is not centered at 1"

In [32]:
# Persist the fitted item-metadata pipeline with dill.
with open(
    "../data_for_ai/interim/item_metadata_pipeline_wo_user_item_manipulate.dill", "wb"
) as pipeline_file:
    dill.dump(item_metadata_pipeline, pipeline_file)

In [33]:
# Read the pipeline back from disk and rebind the in-memory name to the
# deserialized object. dill deserialization can execute arbitrary code, so only
# load files produced by a trusted run.
with open(
    "../data_for_ai/interim/item_metadata_pipeline_wo_user_item_manipulate.dill", "rb"
) as pipeline_file:
    item_metadata_pipeline = dill.load(pipeline_file)

In [34]:
# Destination files for the interaction frames enriched with item metadata.
train_persist_fp = "../data_for_ai/interim/train_sample_interactions_16407u_features.parquet"
val_persist_fp = "../data_for_ai/interim/val_sample_interactions_16407u_features.parquet"

# Write train first, then validation; the index is not stored.
for features_df, persist_fp in (
    (train_features_df, train_persist_fp),
    (val_features_df, val_persist_fp),
):
    features_df.to_parquet(persist_fp, index=False)