# Prepare features

Prepare the necessary features and transformations

# Set up

In [1]:
import os
import sys

import pandas as pd
from pydantic import BaseModel

sys.path.insert(0, "..")
from datasets import load_dataset

from src.id_mapper import IDMapper, map_indice
from src.sequence.utils import generate_item_sequences
from src.viz import custom_style_plotly

custom_style_plotly()

  from .autonotebook import tqdm as notebook_tqdm


# Controller

In [2]:
class Args(BaseModel):
    run_name: str = "000-sample-rich-data"
    testing: bool = True
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    sequence_length: int = 10

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "run_name": "000-sample-rich-data",
  "testing": true,
  "notebook_persist_dp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/000-sample-rich-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
metadata_raw = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Books", trust_remote_code=True
)
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Generating full split: 4448181 examples [04:16, 17335.12 examples/s]


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Books,Chaucer,4.5,29,[],[],8.23,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Peter Ackroyd (Author),"[Books, Literature & Fiction, History & Critic...","{""Publisher"": ""Chatto & Windus; First Edition ...",0701169850,,"Hardcover – Import, January 1, 2004",{'avatar': 'https://m.media-amazon.com/images/...
1,Books,Notes from a Kidwatcher,5.0,1,[Contains 23 selected articles by this influen...,"[About the Author, SANDRA WILDE, Ph.D., is wid...",3.52,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Sandra Wilde (Editor),"[Books, Reference, Words, Language & Grammar]","{""Publisher"": ""Heinemann; First Edition (May 2...",0435088688,,First Edition,{'avatar': 'https://m.media-amazon.com/images/...
2,Books,Service: A Navy SEAL at War,4.7,3421,"[Marcus Luttrell, author of the #1 bestseller,...","[Review, Praise for SERVICE""An action-packed.....",17.17,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}","Marcus Luttrell (Author), James D. Hornfischer","[Books, Biographies & Memoirs, Leaders & Notab...","{""Publisher"": ""Little, Brown and Company; 1st ...",0316185361,,"Hardcover – May 8, 2012",{'avatar': 'https://m.media-amazon.com/images/...
3,Books,Monstrous Stories #4: The Day the Mice Stood S...,4.4,40,"[Funny, light-hearted monster stories that are...",[],7.43,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Dr. Roach (Author),"[Books, Children's Books, Science Fiction & Fa...","{""Publisher"": ""Scholastic Paperbacks; Reprint ...",0545425573,,"Paperback – October 29, 2013",
4,Buy a Kindle,Parker & Knight,4.5,381,"[From REMINGTON KANE, the author of The Taken!...",[],0.0,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Remington Kane (Author) Format: Kindle Edition,"[Books, Mystery, Thriller & Suspense, Thriller...","{""Publication date"": ""May 18, 2014"", ""Language...",B00KFOP3RG,,Kindle Edition,{'avatar': 'https://m.media-amazon.com/images/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4448176,Books,Please Excuse My Daughter,4.3,69,"[Look out for Julie's new book,, The Almost Le...","[About the Author, Julie Klam grew up in Bedfo...",36.06,"{'hi_res': [], 'large': [], 'thumb': [], 'vari...","{'title': [], 'url': [], 'user_id': []}",Julie Klam (Author),"[Books, Biographies & Memoirs, Community & Cul...","{""Publisher"": ""Riverhead Books; Reprint editio...",1594483574,,"Paperback – April 7, 2009",{'avatar': 'https://m.media-amazon.com/images/...
4448177,Books,Inside the Southeast Asian Kitchen: Foodlore a...,5.0,1,[A sumptuous gastronomic tour of ten Southeast...,[],75.0,"{'hi_res': [], 'large': [], 'thumb': [], 'vari...","{'title': [], 'url': [], 'user_id': []}",Artpostasia Pte Ltd. (Author),"[Books, Cookbooks, Food & Wine, Regional & Int...","{""Publisher"": ""Art Post Asia (January 1, 2007)...",9719317051,,"Paperback – January 1, 2007",
4448178,Books,Origin of Negative Dialectics,4.9,16,[Susan Buck-Morss examines and stresses the si...,"[About the Author, Susan Buck-Morss is Disting...",18.39,"{'hi_res': [], 'large': [], 'thumb': [], 'vari...","{'title': [], 'url': [], 'user_id': []}",Susan Buck-Morss (Author),"[Books, Politics & Social Sciences, Philosophy]","{""Publisher"": ""Free Press; Trade edition (Dece...",0029051509,,"Paperback – December 1, 1979",{'avatar': 'https://m.media-amazon.com/images/...
4448179,Books,Trails Illustrated National Parks Guadalupe Mo...,4.8,121,[],[],4.99,"{'hi_res': [], 'large': [], 'thumb': [], 'vari...","{'title': [], 'url': [], 'user_id': []}",,"[Books, Reference, Atlases & Maps]","{""Language"": ""English"", ""ISBN 10"": ""0925873039...",0925873039,,Map,{'avatar': 'https://m.media-amazon.com/images/...


In [4]:
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

In [5]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
32,AH6CATODIVPVUOJEWHRSRCSKAOHA,0399240462,5.0,1398818354000
34,AH6CATODIVPVUOJEWHRSRCSKAOHA,0547248288,5.0,1415484437000
35,AH6CATODIVPVUOJEWHRSRCSKAOHA,141694737X,5.0,1416395330000
37,AH6CATODIVPVUOJEWHRSRCSKAOHA,0374360979,5.0,1420650726000
40,AH6CATODIVPVUOJEWHRSRCSKAOHA,0671493205,3.0,1420651291000
...,...,...,...,...
8721774,AHQXEZQVLUATXYKVH46342AIYRPA,B01D4VFDTO,5.0,1458315354000
8721777,AHQXEZQVLUATXYKVH46342AIYRPA,B01N0TBGJC,5.0,1482962333000
8721778,AHQXEZQVLUATXYKVH46342AIYRPA,B01N32NQTH,5.0,1486058822000
8721779,AHQXEZQVLUATXYKVH46342AIYRPA,B06VSBSMQV,5.0,1488318530000


IDMapper is the class responsible for mapping original string indice to integer indice since our model expect the integer indexing.

In [6]:
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [7]:
train_df = train_df.pipe(map_indice, idm, args.user_col, args.item_col)
val_df = val_df.pipe(map_indice, idm, args.user_col, args.item_col)

In [8]:
idm.save("../data/idm.json")
idm = IDMapper().load("../data/idm.json")

# Item sequence

Historical interactions is normally a very important feature in recommendations. Here we explicitly prepare the list of items a user interacted with regarding every row.

## Test implementation

In [9]:
# Sample DataFrame
data = {
    "user_indices": [0, 0, 1, 1, 1],
    "item_indices": [0, 1, 2, 3, 4],
    "timestamp": [0, 1, 2, 3, 4],
    "ratings": [1, 4, 5, 3, 2],
}

df = pd.DataFrame(data)

# Generate the item sequences
df_with_sequences = generate_item_sequences(
    df,
    user_col="user_indices",
    item_col="item_indices",
    timestamp_col="timestamp",
    sequence_length=3,
    padding=True,
    padding_value=-1,
)

df_with_sequences

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,1,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Run with real data

In [10]:
full_df = pd.concat(
    [train_df.assign(source="train"), val_df.assign(source="val")],
    axis=0,
)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source
32,AH6CATODIVPVUOJEWHRSRCSKAOHA,0399240462,5.0,1398818354000,4008,6910,train
34,AH6CATODIVPVUOJEWHRSRCSKAOHA,0547248288,5.0,1415484437000,4008,6610,train
35,AH6CATODIVPVUOJEWHRSRCSKAOHA,141694737X,5.0,1416395330000,4008,7151,train
37,AH6CATODIVPVUOJEWHRSRCSKAOHA,0374360979,5.0,1420650726000,4008,3367,train
40,AH6CATODIVPVUOJEWHRSRCSKAOHA,0671493205,3.0,1420651291000,4008,1166,train
...,...,...,...,...,...,...,...
414351,AHUJXB632AXB4QJF6Y25TYEEV2IA,B08L3B4VH4,5.0,1629146298574,18035,1257,val
414352,AHUJXB632AXB4QJF6Y25TYEEV2IA,B07SJRYDDH,5.0,1633143544897,18035,4566,val
414353,AHUJXB632AXB4QJF6Y25TYEEV2IA,B08H1TM3ZR,5.0,1643718266632,18035,5724,val
414354,AHUJXB632AXB4QJF6Y25TYEEV2IA,B01HZFB38U,5.0,1645724532943,18035,6163,val


In [11]:
df_with_sequences = generate_item_sequences(
    full_df,
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
)

In [12]:
with pd.option_context("display.max_colwidth", None):
    display(df_with_sequences[[args.user_col, "item_indice", "item_sequence"]])

Unnamed: 0,user_id,item_indice,item_sequence
4328616,AE224PFXAEAT66IXX43GRJSWHXCA,4732,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
4328618,AE224PFXAEAT66IXX43GRJSWHXCA,1581,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4732]"
4328624,AE224PFXAEAT66IXX43GRJSWHXCA,2712,"[-1, -1, -1, -1, -1, -1, -1, -1, 4732, 1581]"
4328627,AE224PFXAEAT66IXX43GRJSWHXCA,4217,"[-1, -1, -1, -1, -1, -1, -1, 4732, 1581, 2712]"
4328634,AE224PFXAEAT66IXX43GRJSWHXCA,6558,"[-1, -1, -1, -1, -1, -1, 4732, 1581, 2712, 4217]"
...,...,...,...
4627387,AHZZZ6UASY7CGOTGP5BH5637FMPA,6998,"[-1, -1, -1, -1, -1, 4088, 2670, 6456, 1416, 3853]"
4627389,AHZZZ6UASY7CGOTGP5BH5637FMPA,5013,"[-1, -1, -1, -1, 4088, 2670, 6456, 1416, 3853, 6998]"
4627390,AHZZZ6UASY7CGOTGP5BH5637FMPA,1099,"[-1, -1, -1, 4088, 2670, 6456, 1416, 3853, 6998, 5013]"
4627391,AHZZZ6UASY7CGOTGP5BH5637FMPA,685,"[-1, -1, 4088, 2670, 6456, 1416, 3853, 6998, 5013, 1099]"


In [18]:
# Check sample user
user_id = df_with_sequences.sample(n=1)[args.user_col].values[0]

with pd.option_context('display.max_colwidth', None):
    display(
        df_with_sequences.loc[lambda df: df[args.user_col].eq(user_id)]
        .sort_values(args.timestamp_col)[
            [args.user_col, args.timestamp_col, "item_indice", "item_sequence"]
        ]
        .tail(10)
    )

Unnamed: 0,user_id,timestamp,item_indice,item_sequence
742379,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1513733748231,1170,"[5793, 354, 2963, 2013, 5625, 5897, 3577, 7143, 6343, 5820]"
742381,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1516225101028,815,"[354, 2963, 2013, 5625, 5897, 3577, 7143, 6343, 5820, 1170]"
742382,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1520175484601,1606,"[2963, 2013, 5625, 5897, 3577, 7143, 6343, 5820, 1170, 815]"
742383,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1520881818838,4244,"[2013, 5625, 5897, 3577, 7143, 6343, 5820, 1170, 815, 1606]"
742384,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1520989902710,3863,"[5625, 5897, 3577, 7143, 6343, 5820, 1170, 815, 1606, 4244]"
742385,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1525568876925,2624,"[5897, 3577, 7143, 6343, 5820, 1170, 815, 1606, 4244, 3863]"
742386,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1551071858702,1576,"[3577, 7143, 6343, 5820, 1170, 815, 1606, 4244, 3863, 2624]"
742387,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1576458212941,6013,"[7143, 6343, 5820, 1170, 815, 1606, 4244, 3863, 2624, 1576]"
742389,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1625102886126,2089,"[6343, 5820, 1170, 815, 1606, 4244, 3863, 2624, 1576, 6013]"
36170,AGDXNXDIKPFAVOQYM7S4MNPZ66FA,1637819388045,1847,"[5820, 1170, 815, 1606, 4244, 3863, 2624, 1576, 6013, 2089]"


# Persist

In [14]:
train_features_df = df_with_sequences.loc[lambda df: df["source"].eq("train")].drop(
    columns=["source"]
)
val_features_df = df_with_sequences.loc[lambda df: df["source"].eq("val")].drop(
    columns=["source"]
)

In [15]:
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
4328616,AE224PFXAEAT66IXX43GRJSWHXCA,0399159312,2.0,1373291889000,6822,4732,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4328618,AE224PFXAEAT66IXX43GRJSWHXCA,B000FA5TTW,1.0,1382077065000,6822,1581,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4732]"
4328624,AE224PFXAEAT66IXX43GRJSWHXCA,030758836X,1.0,1424138603000,6822,2712,"[-1, -1, -1, -1, -1, -1, -1, -1, 4732, 1581]"
4328627,AE224PFXAEAT66IXX43GRJSWHXCA,B00MSRW6SM,4.0,1437924147000,6822,4217,"[-1, -1, -1, -1, -1, -1, -1, 4732, 1581, 2712]"
4328634,AE224PFXAEAT66IXX43GRJSWHXCA,B00A18VD7A,1.0,1464603674000,6822,6558,"[-1, -1, -1, -1, -1, -1, 4732, 1581, 2712, 4217]"
...,...,...,...,...,...,...,...
4627387,AHZZZ6UASY7CGOTGP5BH5637FMPA,B017GFRJZK,5.0,1508089337653,6916,6998,"[-1, -1, -1, -1, -1, 4088, 2670, 6456, 1416, 3..."
4627389,AHZZZ6UASY7CGOTGP5BH5637FMPA,B00RPM9MJ6,4.0,1521230143557,6916,5013,"[-1, -1, -1, -1, 4088, 2670, 6456, 1416, 3853,..."
4627390,AHZZZ6UASY7CGOTGP5BH5637FMPA,B00RU7SNP0,4.0,1534867184329,6916,1099,"[-1, -1, -1, 4088, 2670, 6456, 1416, 3853, 699..."
4627391,AHZZZ6UASY7CGOTGP5BH5637FMPA,B00HGSVGSY,4.0,1534867223318,6916,685,"[-1, -1, 4088, 2670, 6456, 1416, 3853, 6998, 5..."


In [16]:
val_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
307936,AE23RLRV25THT7OZM4T4ZJ4BMYCA,0062409212,5.0,1646772001708,7581,6271,"[-1, -1, -1, -1, -1, 7353, 4162, 2974, 6055, 3..."
363910,AE26ASIO5XEEZELXTU3UK5Z4TS6A,B089GSVYJW,5.0,1631481525785,12231,7272,"[-1, -1, -1, -1, 6248, 4242, 3322, 2022, 5042,..."
363911,AE26ASIO5XEEZELXTU3UK5Z4TS6A,B07WG8L7WC,4.0,1635636047615,12231,6366,"[-1, -1, -1, 6248, 4242, 3322, 2022, 5042, 830..."
173811,AE27AOVHHMVINOYGUM6QCG2SSE6A,B083J7QQ89,5.0,1644968355783,4200,5017,"[-1, -1, -1, -1, 3810, 5521, 6234, 7041, 7038,..."
242097,AE2CUFERDXJI4XNIGIGJOBDLCOTQ,B07CGCTSNW,5.0,1636754804021,18899,6360,"[-1, -1, 6358, 1630, 1012, 3903, 6683, 986, 71..."
...,...,...,...,...,...,...,...
137390,AHZLM4RDKSICEFEAYEQQRZW45BPA,B08CV9SPDQ,5.0,1657516186943,14008,1406,"[-1, -1, -1, -1, 6183, 7352, 3278, 4810, 2058,..."
180646,AHZLQPSPG675BABC5R5NJW6KG3WQ,B07D6PZ6P1,5.0,1635813519329,13829,6282,"[-1, -1, -1, -1, -1, 5323, 1630, 1844, 230, 5178]"
88385,AHZNQ34GWKKLJN53IDXLAX22OBJQ,B07ZJ2VHBB,5.0,1652978096579,17280,4442,"[-1, -1, -1, -1, -1, 249, 1297, 3552, 4954, 2316]"
88386,AHZNQ34GWKKLJN53IDXLAX22OBJQ,B00PG8UCGS,5.0,1654707732874,17280,5357,"[-1, -1, -1, -1, 249, 1297, 3552, 4954, 2316, ..."


In [17]:
train_features_df.to_parquet("../data/train_features.parquet", index=False)
val_features_df.to_parquet("../data/val_features.parquet", index=False)