# Store supporting features

# Set up

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import sys

import pandas as pd
import redis
from dotenv import load_dotenv
from pydantic import BaseModel
from tqdm.auto import tqdm

sys.path.insert(0, "..")

from src.id_mapper import IDMapper
from src.io_utils import init_s3_client

load_dotenv()

# Controller

In [3]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Field defaults describe a local run; call ``init()`` after construction to
    resolve the output directory and apply environment overrides for Redis.
    """

    testing: bool = False
    run_name: str = "000-first-attempt"
    # Populated by init() with the absolute path of data/<run_name>.
    notebook_persist_dp: str = None
    random_seed: int = 41

    # Number of most-interacted items to keep as the popular-items list.
    top_K: int = 100

    redis_host: str = "localhost"
    redis_port: int = 6379
    # Per-user keys are built as <prefix><user_id>.
    redis_recent_key_prefix: str = "feature:user:recent_items:"
    redis_popular_key: str = "output:popular"

    train_features_fp: str = "../data/train_features.parquet"
    val_features_fp: str = "../data/val_features.parquet"
    id_mapper_fp: str = "../data/idm.json"

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    timestamp_col: str = "timestamp"

    def init(self):
        """Resolve derived settings and return ``self`` for chaining.

        Creates the per-run output directory (``data/<run_name>`` relative to
        the current working directory). When the ``REDIS_HOST`` environment
        variable is set, it overrides ``redis_host`` and ``REDIS_PORT`` (if
        present) overrides ``redis_port``.

        Raises:
            ValueError: if ``REDIS_PORT`` is set but is not a valid integer.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        if redis_host := os.getenv("REDIS_HOST"):
            self.redis_host = redis_host
            # Environment values are strings, and pydantic does not validate
            # plain attribute assignment by default, so cast explicitly to
            # keep redis_port an int as declared.
            self.redis_port = int(os.getenv("REDIS_PORT", self.redis_port))

        return self


# Build the configuration, run its setup step, and echo the resolved values so
# the notebook records the settings used for this run.
args = Args()
args.init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-first-attempt",
  "notebook_persist_dp": "/Users/quy.dinh/frostmourne/recsys-mvp/notebooks/data/000-first-attempt",
  "random_seed": 41,
  "top_K": 100,
  "redis_host": "localhost",
  "redis_port": 6379,
  "redis_recent_key_prefix": "feature:user:recent_items:",
  "redis_popular_key": "output:popular",
  "train_features_fp": "../data/train_features.parquet",
  "val_features_fp": "../data/val_features.parquet",
  "id_mapper_fp": "../data/idm.json",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "timestamp_col": "timestamp"
}


# Load input data

In [None]:
# Download any input artifact that is missing locally. Each file is checked on
# its own so a partially populated data directory is completed instead of
# being skipped because the train file happens to exist.
bucket_name = "data"
s3_key_to_local_fp = {
    "train_features.parquet": args.train_features_fp,
    "val_features.parquet": args.val_features_fp,
    "idm.json": args.id_mapper_fp,
}
missing_artifacts = {
    s3_key: local_fp
    for s3_key, local_fp in s3_key_to_local_fp.items()
    if not os.path.exists(local_fp)
}

if missing_artifacts:
    # Only create the client when there is something to fetch.
    s3 = init_s3_client()
    for s3_key, local_fp in missing_artifacts.items():
        # Make sure the destination directory exists before writing into it.
        os.makedirs(os.path.dirname(os.path.abspath(local_fp)), exist_ok=True)
        s3.download_file(bucket_name, s3_key, local_fp)

In [4]:
# Read both feature splits, load the ID mapper, then stack the splits row-wise
# (original row labels are kept, so labels may repeat across splits).
train_features_df, val_features_df = (
    pd.read_parquet(features_fp)
    for features_fp in (args.train_features_fp, args.val_features_fp)
)
idm = IDMapper().load(args.id_mapper_fp)
full_df = pd.concat([train_features_df, val_features_df])
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSR5Q6AUWIXDCBJY3Z63SFP7PIQ,B00001KUII,5.0,948686983000,5891,1739,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AEZGYAZLTQUUBN6DHM7OPECPKUYA,B00002EPZ2,5.0,949551425000,14348,3996,Video Games,Planescape: Torment - PC,"[Amazon.com, Explore Sigil, the City of Doors....","[Video Games, PC, Games]",14.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEKK2OBHEI2MK3EERXMCWLWIU3NQ,B00002NDRY,5.0,949807161000,4467,1520,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,3700,4350,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AFSR5Q6AUWIXDCBJY3Z63SFP7PIQ,B001E91OQA,5.0,951269165000,5891,4350,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
957,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,6294,3694,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 4336, 4609, 3873, 2245, 2869, 320..."
958,AFIXV7CY3OC6WI5DXCS3JAGP5SQA,B0C37RBK2R,5.0,1657887021161,9028,4103,Video Games,Xbox Series S,"[Introducing the Xbox Series S, the smallest, ...",[],279.0,"[639, 1294, 627, 3644, 4468, 3016, 2610, 2932,..."
959,AFDL3ZQE4ARYEEBBH2KAPMP4NSHQ,B0795GHTBC,5.0,1657910674213,8973,3116,All Electronics,ivoler [3 Pack Screen Protector Tempered Glass...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",9.39,"[-1, -1, -1, 1259, 1366, 2240, 3784, 1073, 570..."
960,AEE72HLCWIZT2GKD7UZRXN36T27A,B0CB8LZT7K,5.0,1657928730786,18414,424,Video Games,Daydayup Switch Carrying Case Compatible with ...,[],"[Video Games, Legacy Systems, Nintendo Systems...",21.99,"[-1, 2370, 1236, 2881, 4266, 3449, 131, 2739, ..."


In [5]:
# Keep one row per user: the interaction with the largest timestamp. Ranking
# with method="first" breaks timestamp ties by row order, so exactly one row
# per user gets recency == 1.
latest_df = full_df.assign(
    recency=full_df.groupby(args.user_col)[args.timestamp_col].rank(
        method="first", ascending=False
    )
).pipe(lambda ranked: ranked[ranked["recency"] == 1])
latest_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,recency
9,AFSR5Q6AUWIXDCBJY3Z63SFP7PIQ,B001EYUPAQ,5.0,961869703000,5891,205,Video Games,Deus Ex: Game of the Year Edition - PC,"[Product description, Real Conspiracies...Seve...","[Video Games, PC, Games]",69.07,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1739.0, 4...",1.0
81,AGI2TRAJXLJFMCFCVRTE5TXJLZOA,B000038ABO,4.0,977439956000,5321,3300,Video Games,Parasite Eve,"[Product description, One of them is a police ...","[Video Games, Legacy Systems, PlayStation Syst...",144.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4355.0, 2879.0,...",1.0
110,AFHEODRO4ABX45Q62AGSEU5VR5SQ,B00004TCT3,5.0,979868084000,9549,480,Video Games,"Pokemon, Silver Version","[Product Description, Pokemon Gold and Silver ...","[Video Games, Legacy Systems, Nintendo Systems...",124.95,"[-1.0, -1.0, -1.0, 3969.0, 4350.0, 4578.0, 531...",1.0
128,AHNLE4FJOHIHA3HUOA5PG4LM4EAA,B00000K3X9,5.0,982539267000,203,2722,Video Games,Sonic Adventure - Sega Dreamcast,"[Product description, Sega's beloved blue masc...","[Video Games, Legacy Systems, Sega Systems, Se...",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3197.0, 1...",1.0
168,AGW5FRNMVFQGVQJWQYQGMDI6C2UQ,B00004U5VK,5.0,987548112000,7589,2231,Video Games,Onimusha Warlords,"[Product Description, Set during the medieval ...","[Video Games, Legacy Systems, PlayStation Syst...",7.49,"[-1.0, -1.0, -1.0, -1.0, 3197.0, 4088.0, 3656....",1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,6294,3694,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 4336, 4609, 3873, 2245, 2869, 320...",1.0
958,AFIXV7CY3OC6WI5DXCS3JAGP5SQA,B0C37RBK2R,5.0,1657887021161,9028,4103,Video Games,Xbox Series S,"[Introducing the Xbox Series S, the smallest, ...",[],279.0,"[639, 1294, 627, 3644, 4468, 3016, 2610, 2932,...",1.0
959,AFDL3ZQE4ARYEEBBH2KAPMP4NSHQ,B0795GHTBC,5.0,1657910674213,8973,3116,All Electronics,ivoler [3 Pack Screen Protector Tempered Glass...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",9.39,"[-1, -1, -1, 1259, 1366, 2240, 3784, 1073, 570...",1.0
960,AEE72HLCWIZT2GKD7UZRXN36T27A,B0CB8LZT7K,5.0,1657928730786,18414,424,Video Games,Daydayup Switch Carrying Case Compatible with ...,[],"[Video Games, Legacy Systems, Nintendo Systems...",21.99,"[-1, 2370, 1236, 2881, 4266, 3449, 131, 2739, ...",1.0


# Load recently interacted items into Redis

In [6]:
# Connect to Redis; decode_responses=True makes reads come back as str rather
# than bytes.
r = redis.Redis(host=args.redis_host, port=args.redis_port, db=0, decode_responses=True)
# Fail fast if the server does not answer the ping. The message must reference
# args.redis_port: Args has no `port` field, so the old message would itself
# raise AttributeError instead of being shown.
assert (
    r.ping()
), f"Redis at {args.redis_host}:{args.redis_port} is not running, please make sure you have started the Redis docker service"

In [7]:
# Preview only the columns the Redis load below reads.
latest_df.loc[:, [args.user_col, args.item_col, "item_sequence"]]

Unnamed: 0,user_id,parent_asin,item_sequence
9,AFSR5Q6AUWIXDCBJY3Z63SFP7PIQ,B001EYUPAQ,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1739.0, 4..."
81,AGI2TRAJXLJFMCFCVRTE5TXJLZOA,B000038ABO,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4355.0, 2879.0,..."
110,AFHEODRO4ABX45Q62AGSEU5VR5SQ,B00004TCT3,"[-1.0, -1.0, -1.0, 3969.0, 4350.0, 4578.0, 531..."
128,AHNLE4FJOHIHA3HUOA5PG4LM4EAA,B00000K3X9,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3197.0, 1..."
168,AGW5FRNMVFQGVQJWQYQGMDI6C2UQ,B00004U5VK,"[-1.0, -1.0, -1.0, -1.0, 3197.0, 4088.0, 3656...."
...,...,...,...
957,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,"[-1, -1, -1, 4336, 4609, 3873, 2245, 2869, 320..."
958,AFIXV7CY3OC6WI5DXCS3JAGP5SQA,B0C37RBK2R,"[639, 1294, 627, 3644, 4468, 3016, 2610, 2932,..."
959,AFDL3ZQE4ARYEEBBH2KAPMP4NSHQ,B0795GHTBC,"[-1, -1, -1, 1259, 1366, 2240, 3784, 1073, 570..."
960,AEE72HLCWIZT2GKD7UZRXN36T27A,B0CB8LZT7K,"[-1, 2370, 1236, 2881, 4266, 3449, 131, 2739, ..."


In [8]:
# Store, for every user, the items in their item_sequence followed by their
# latest item, as a "__"-joined string of item IDs under <prefix><user_id>.
REDIS_WRITE_BATCH_SIZE = 1000  # SET commands buffered per network round trip

recent_rows = zip(
    latest_df[args.user_col], latest_df[args.item_col], latest_df["item_sequence"]
)
# A non-transactional pipeline batches the writes instead of paying one round
# trip per user.
with r.pipeline(transaction=False) as pipe:
    for n_queued, (user_id, latest_item_id, item_sequence) in enumerate(
        tqdm(recent_rows, total=latest_df.shape[0]), start=1
    ):
        # -1 entries are skipped (they look like padding slots - TODO confirm);
        # the remaining values are item indices to map back to item IDs.
        prev_item_indices = [int(item) for item in item_sequence if item != -1]
        prev_item_ids = [idm.get_item_id(idx) for idx in prev_item_indices]
        updated_item_sequences = prev_item_ids + [latest_item_id]
        key = args.redis_recent_key_prefix + user_id
        value = "__".join(updated_item_sequences)
        pipe.set(key, value)
        if n_queued % REDIS_WRITE_BATCH_SIZE == 0:
            pipe.execute()
    # Flush the final, possibly partial, batch.
    pipe.execute()

  0%|          | 0/19578 [00:00<?, ?it/s]

In [9]:
# Spot-check one user. Seeding the sample with the configured random_seed makes
# the displayed result reproducible across re-runs.
test_user_id = latest_df.sample(1, random_state=args.random_seed)[args.user_col].values[0]
stored_recent_items = r.get(args.redis_recent_key_prefix + test_user_id)
# r.get returns None for a missing key; surface that instead of showing nothing.
assert (
    stored_recent_items is not None
), f"No recent items stored in Redis for user {test_user_id}"
stored_recent_items

'B0002RQ3ES__B001EYUW4A__B001G60638__B000BXKA38__B00J5C3Z10__B014R4KYMS__B07DKYN13M__B09B14PJCG__B087XRWHHL'

# Load popular items into Redis

In [10]:
# Count interactions per item and keep the top_K most frequent, ordered from
# most to least interacted.
popular_recs = (
    full_df.groupby(args.item_col)
    .size()
    .sort_values(ascending=False)
    .iloc[: args.top_K]
)
popular_recs

parent_asin
B01N3ASPNV    755
B07YBXFDYN    755
B0086VPUHI    720
B00BGA9WK2    652
B00BN5T30E    544
             ... 
B07YBX7Y3P    166
B00CMQTVK0    164
B004I1JTEK    164
B00HGLLRV2    164
B00CMQTVUA    163
Length: 100, dtype: int64

In [11]:
# Serialise the popular items as JSON with two parallel lists (item IDs and
# their interaction counts) and store it under the configured key.
key = args.redis_popular_key
value = json.dumps(
    {
        "rec_item_ids": list(popular_recs.index),
        "rec_scores": popular_recs.tolist(),
    }
)
r.set(key, value)

True

## Test reading data back from Redis

In [12]:
# Read back through the configured key rather than the notebook-global `key`,
# which the recent-items loop also rebinds; this keeps the check correct even
# when cells are run out of order.
raw_popular = r.get(args.redis_popular_key)
assert (
    raw_popular is not None
), f"No value stored at Redis key {args.redis_popular_key!r}"
redis_data = json.loads(raw_popular)
print(redis_data)
assert len(redis_data["rec_item_ids"]) == args.top_K
# IDs and scores are parallel lists and must stay aligned.
assert len(redis_data["rec_scores"]) == len(redis_data["rec_item_ids"])