# Feature engineering

## Setup

In [2]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
import os
import sys

import dill
import numpy as np
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Put the parent directory on the import path *before* importing project-local
# modules. With the insert placed after the import (as it was), `src` only
# resolves if it is already importable by other means (installed package,
# PYTHONPATH), so a fresh-kernel "Run All" from the notebook folder can fail.
sys.path.insert(0, "..")

from src.utils.embedding_id_mapper import IDMapper  # noqa: E402


In [None]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the run name, the column names used in the interaction data, and a
    few processing parameters. Call ``init()`` after construction to resolve
    (and, unless ``testing`` is set, create) the persist directory.
    """

    run_name: str = "000-prep-data"
    # When True, init() does not create the persist directory.
    testing: bool = False
    # Absolute path filled in by init(); stays None until init() is called.
    # NOTE(review): annotated as `str` but defaults to None — consider
    # Optional[str] once a typing import is available.
    notebook_persist_dp: str = None
    random_seed: int = 41

    # Column names of the interaction data.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    tfm_chunk_size: int = 5000

    sequence_length: int = 10

    def init(self):
        """Resolve the persist directory and create it when not testing.

        The directory is ``data/<run_name>`` resolved to an absolute path
        relative to the current working directory.

        Returns:
            The same ``Args`` instance, so the call can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration with its defaults, then resolve the persist
# directory (init() mutates the instance in place and returns it).
args = Args()
args.init()

# Echo the resolved configuration so it is recorded in the notebook output.
print(args.model_dump_json(indent=2))

{
  "run_name": "000-prep-data",
  "testing": false,
  "notebook_persist_dp": "/home/dinhln/Desktop/real_time_recsys/notebooks/data/000-prep-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "tfm_chunk_size": 5000,
  "sequence_length": 10
}


## Load data

In [5]:
# Fetch the "raw_meta_Electronics" configuration of the Amazon Reviews 2023
# dataset from the Hugging Face hub.
metadata_raw = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Electronics",
    trust_remote_code=True,
)

# Convert the "full" split to a pandas DataFrame and preview it.
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Electronics,FS-1051 FATSHARK TELEPORTER V3 HEADSET,3.5,6,[],[Teleporter V3 The “Teleporter V3” kit sets a ...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Fat Shark,"[Electronics, Television & Video, Video Glasses]","{""Date First Available"": ""August 2, 2014"", ""Ma...",B00MCW7G9M,,,
1,All Electronics,Ce-H22B12-S1 4Kx2K Hdmi 4Port,5.0,1,"[UPC: 662774021904, Weight: 0.600 lbs]",[HDMI In - HDMI Out],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",SIIG,"[Electronics, Television & Video, Accessories,...","{""Product Dimensions"": ""0.83 x 4.17 x 2.05 inc...",B00YT6XQSE,,,
2,Computers,Digi-Tatoo Decal Skin Compatible With MacBook ...,4.5,246,[WARNING: Please IDENTIFY MODEL NUMBER on the ...,[],19.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['AL 2Sides Video', 'MacBook Protect...",Digi-Tatoo,"[Electronics, Computers & Accessories, Laptop ...","{""Brand"": ""Digi-Tatoo"", ""Color"": ""Fresh Marble...",B07SM135LS,,,
3,AMAZON FASHION,NotoCity Compatible with Vivoactive 4 band 22m...,4.5,233,[☛NotoCity 22mm band is designed for Vivoactiv...,[],9.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",NotoCity,"[Electronics, Wearable Technology, Clips, Arm ...","{""Date First Available"": ""May 29, 2020"", ""Manu...",B089CNGZCW,,,
4,Cell Phones & Accessories,Motorola Droid X Essentials Combo Pack,3.8,64,"[New Droid X Essentials Combo Pack, Exclusive ...",[all Genuine High Quality Motorola Made Access...,14.99,"{'hi_res': [None, None, None, None, None], 'la...","{'title': [], 'url': [], 'user_id': []}",Verizon,"[Electronics, Computers & Accessories, Compute...","{""Product Dimensions"": ""11.6 x 6.9 x 3.1 inche...",B004E2Z88O,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1610007,Computers,"Wintec FileMate Pro USB Flash Drive, 3FMUSB32G...",5.0,1,"[32GB / 32 GB file storage, USB mass storage d...",[--New in retail packaging --Fast USB 2.0 data...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Wintec Industries,"[Electronics, Computers & Accessories, Data St...","{""Product Dimensions"": ""0.78 x 0.31 x 2.75 inc...",B003NUIU9M,,,
1610008,,Tsugar Noise Reduction Wireless Headphones Blu...,1.0,2,[High Fidelity Sound: Intelligent noise reduct...,[Description: 100% brand new high quality 1.Hi...,,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Tsugar,"[Electronics, Headphones, Earbuds & Accessorie...","{""Best Sellers Rank"": {""Electronics"": 547760, ...",B0BHVY33TL,,,
1610009,,"Hardshell Case for MacBook Pro (16-inch, 2021)...",4.6,11,"[Compatible with MacBook Pro 16-inch (2021), I...",[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Incase Designs,"[Electronics, Computers & Accessories, Laptop ...","{""Product Dimensions"": ""9.88 x 0.94 x 14.13 in...",B09SQGRFFH,,,
1610010,Computers,"FYY 12-13.3"" Laptop Sleeve Case Bag, PU Leathe...",4.0,35,[【Compatibility】FYY laptop Bag sleeve perfect ...,[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",FYY,"[Electronics, Computers & Accessories, Laptop ...","{""Standing screen display size"": ""12.3 Inches""...",B091JWCSG5,,,


In [6]:
# Show one example item (positional row 6) with the text-heavy columns
# untruncated, so the full content of each field can be read.
with pd.option_context("display.max_colwidth", None):
    display(
        metadata_raw_df.iloc[[6]].loc[
            :,
            [
                "title",
                "main_category",
                "categories",
                "features",
                "description",
                "store",
                "details",
            ],
        ]
    )

Unnamed: 0,title,main_category,categories,features,description,store,details
6,"QGHXO Band for Garmin Vivofit 4, Soft Silicone Replacement Watch Band Strap for Garmin Vivofit 4 Activity Tracker, Small, Large, Ten Colors (5PCS Bands-Girl, Large)",Cell Phones & Accessories,"[Electronics, Wearable Technology, Arm & Wristband Accessories]","[Personalized Your Garmin Vivofit 4 Activity Tracker with this refined replacement wrist band, Small fits wrists with a circumference of 122-188mm. Large fits wrists with a circumference of 148-215mm, Easy and direct installation and removal. Replacement Bands Only! Garmin device NOT included, Garmin Vivofit 4 Buckle Bracelet. Never lose your Garmin Vivofit 4. Fix the tracker fall off problem, Soft silicone with smooth finish for a sporty look, metal parts made with high quality stainless steel]","[Compatibility, Custom designed for your precious, Garmin Vivofit 4, Activity Tracker, this Garmin Watch Sport Band features a combination of functionality and style. Fit for, Garmin Vivofit 4, Activity Tracker ONLY. NOT for Garmin Vivofit 1/Garmin Vivofit 2/Garmin Vivofit 3., Feature, Material: Silicone. NOTE: Replacement Bands Only! Small fits wrists with a circumference of 122-188mm. Large fits wrists with a circumference of 148-215mm. Models for selection: For Garmin Vivofit 4 Activity Tracker Only. Contracted design style, with you life contracted and not simple., Package Included, Soft Silicone Replacement Watch Band Strap for Garmin Vivofit 4 Activity Tracker (No Tracker)]",QGHXO,"{""Package Dimensions"": ""6.85 x 4.37 x 1.1 inches"", ""Item Weight"": ""2.64 ounces"", ""Item model number"": ""GM-VF4-L14GIRL"", ""Best Sellers Rank"": {""Electronics"": 317736, ""Smart Arm & Wristband Accessories"": 12926}, ""Is Discontinued By Manufacturer"": ""No"", ""Special features"": ""activity tracker"", ""Other display features"": ""Sports"", ""Color"": ""5PCS Bands-Girl"", ""Manufacturer"": ""QGHXO"", ""Date First Available"": ""March 17, 2018""}"


In [8]:
# Load the sampled train/validation interaction sets.
train_df = pd.read_parquet("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
val_df = pd.read_parquet("../data_for_ai/interim/val_sample_interactions_16407u.parquet")

# Stack both splits and add the timestamp as whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second is independent of the
# column's datetime resolution; casting to int64 and dividing by 10**9 is only
# correct when the column is datetime64[ns].
# NOTE(review): assumes the timestamp column is timezone-naive, as the recorded
# output suggests — confirm against the parquet schema.
full_df = pd.concat([train_df, val_df], axis=0).assign(
    timestamp_unix=lambda df: (
        (df[args.timestamp_col] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    )
)

In [9]:
# Preview the combined interactions, including the derived timestamp_unix column.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10.000,1339432870
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13.000,1343873053
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46.000,1347726886
3204,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B008LURQ76,5.0,2013-01-03 23:08:45.000,1357254525
3208,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B00AQRUW4Q,4.0,2013-05-06 01:24:39.000,1367803479
...,...,...,...,...,...
33760091,AHIIISHZP6YAVVHMDEBLJ5CWZ7ZA,B0BZ62FQ13,3.0,2021-07-16 17:08:55.044,1626455335
34470392,AFTE3G43QHXWD3DJGDCI2DHEWQJQ,B08DMXDPW5,5.0,2021-01-14 01:48:09.423,1610588889
35019360,AFENZZDPVUYFVBS47YDOWJCDYBSQ,B09XBT6DS9,4.0,2021-12-05 00:35:40.874,1638664540
35323250,AFMBZYPDAXT5VO3ME67HW5Q5TAOQ,B097KBF8JK,5.0,2022-02-18 11:32:46.732,1645183966


In [11]:
# Load the ID mapper (idm) from its JSON file in the interim data folder.
# NOTE(review): presumably maps between raw ids and integer indices — confirm
# against the IDMapper implementation.
idm_path = os.path.abspath("../data_for_ai/interim/idm_16407u.json")
idm = IDMapper().load(idm_path)
# Sanity check: look up the user id stored for index 1.
idm.get_user_id(1)

'AE227WAM4NWQPJI33OPN7ZARNNZQ'