In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from pydantic import BaseModel

# Put the parent directory first on the import path; the `from src...` imports
# further down presumably resolve through it (the notebook lives one level below).
sys.path.insert(0, '..')

# Controller

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Bundles the run identifier, the column names used to address the
    interaction frames, and the item-sequence length, so the cells below read
    them from a single place.
    """

    # Identifier of the run; also the leaf folder of the persist directory.
    run_name: str = '056-small-rich-dataset'
    # When True, `init` does not create the persist directory on disk.
    testing: bool = True
    # Absolute path of the persist directory. It is only known after `init`
    # has run, hence Optional with a None default (a plain `str` annotation
    # would make an explicit `notebook_persist_dp=None` fail validation).
    notebook_persist_dp: Optional[str] = None
    # Random seed for the run.
    random_seed: int = 41

    # Names of the columns holding the user id, item id, rating and event time.
    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    # Length of the item sequence built for every interaction.
    sequence_length: int = 10

    def init(self) -> "Args":
        """Resolve the persist directory and create it unless in testing mode.

        The path is `data/<run_name>` made absolute against the current
        working directory.

        Returns:
            This same instance, so `Args().init()` can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
# Build the configuration, resolve its derived fields in place, then echo the
# effective settings as JSON so they are recorded with the notebook output.
args = Args()
args = args.init()

effective_config_json = args.model_dump_json(indent=2)
print(effective_config_json)

{
  "run_name": "056-small-rich-dataset",
  "testing": true,
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/056-small-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
from src.id_mapper import IDMapper
from src.train_utils import map_indice

In [4]:
# Read the train and validation interaction splits from the shared data folder.
train_path = "../data/train_item_features.parquet"
val_path = "../data/val_item_features.parquet"

train_df = pd.read_parquet(train_path)
val_df = pd.read_parquet(val_path)

In [5]:
# Raw user / item identifiers of every training interaction.
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values

# Distinct identifiers, sorted before fitting. Iterating a bare `set` of
# strings follows hash order, which changes between interpreter sessions
# (string hash randomisation), so the order handed to `IDMapper.fit` would
# otherwise differ from one kernel restart to the next.
# NOTE(review): presumably IDMapper assigns indices by input order — confirm.
# Sorting assumes the ids are mutually comparable (e.g. all strings, no NaN).
unique_user_ids = sorted(set(user_ids))
unique_item_ids = sorted(set(item_ids))

# The mapper is fitted on the training split only.
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [6]:
# Run both splits through `map_indice` with the mapper fitted above.
# Presumably this is what adds the `user_indice` / `item_indice` columns — confirm in src.train_utils.
train_df = map_indice(train_df, idm, args.user_col, args.item_col)
val_df = map_indice(val_df, idm, args.user_col, args.item_col)

In [7]:
# Stack the two splits into one frame, tagging every row with the split it came
# from so the frame can be separated again after the sequences are built.
# NOTE(review): each split keeps its own row labels, so `full_df.index` may
# contain duplicates — confirm downstream code does not rely on a unique index.
labelled_splits = [
    train_df.assign(source='train'),
    val_df.assign(source='val'),
]
full_df = pd.concat(labelled_splits, axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,source
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,1321885664000,Video Games,Amazon Basics Carrying Case for Nintendo - New...,[],"[Video Games, Legacy Systems, Nintendo Systems...",,2837,774,train
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,1408233606000,Computers,Logitech G402 Hyperion Fury FPS Gaming Mouse,[Logitech G402 Hyperion Fury FPS Gaming Mouse],"[Video Games, PC, Accessories, Gaming Mice]",,2837,3235,train
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,1511753174174,Video Games,Razer Wolverine Ultimate Officially Licensed X...,[Play anywhere with the Razer Wolverine Ultima...,"[Video Games, PC, Accessories, Controllers]",64.98,2837,2348,train
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,1531092820696,Video Games,Turtle Beach Stealth 600 Wireless Surround Sou...,[The Turtle Beach Stealth 600 is the latest wi...,"[Video Games, PlayStation 4, Accessories, Head...",168.75,2837,489,train
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B07DHNX18W,4.0,1604348335046,Computers,Razer Huntsman Elite Gaming Keyboard: Fast Key...,[Introduces the new Razer Opto-Mechanical swit...,"[Video Games, PC, Accessories, Gaming Keyboards]",219.99,2837,447,train
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AELRDP5MCGSCANM6GWUXAMBN75LQ,B009AGXH64,5.0,1654280616536,Video Games,Nintendo Wii U Console - 32GB Black Deluxe Set,[Wii U is the next great gaming console from N...,"[Video Games, Legacy Systems, Nintendo Systems...",199.99,11120,763,val
945,AFF5MP52H46DQM63YYLULLCEYAVQ,B08DF248LD,5.0,1630110810552,Video Games,Xbox Core Wireless Controller – Carbon Black,[Experience the modernized design of the Xbox ...,[],45.5,155,1294,val
946,AG25CXR2DXZV62WNVA46GAF2BL2Q,B08LT6PT1X,5.0,1638647139059,Video Games,Xbox Elite Wireless Controller Series 2 – Black,[Experience the Xbox Elite Wireless Controller...,"[Video Games, Xbox One, Accessories, Controllers]",144.99,9161,2357,val
947,AEOY2365QPPEVDTOXL6N7ZA4NSAA,B00PDRZG9U,5.0,1628820275218,Video Games,Code Name: S.T.E.A.M.,"[Launch S.T.E.A.M., an elite team of steam-pow...","[Video Games, Legacy Systems, Nintendo Systems...",12.99,3982,679,val


In [8]:
# Write the fitted mapper to disk, then rebind `idm` to a copy loaded back from
# that same file.
idm_path = "../data/idm.json"
idm.save(idm_path)
idm = IDMapper().load(idm_path)

# Item sequence

## Test implementation

In [9]:
from src.sequence_utils import generate_item_sequences

# Toy interactions: user 0 has two events and user 1 has three, already in
# timestamp order, so the expected sequences are easy to check by eye.
data = dict(
    user_indices=[0, 0, 1, 1, 1],
    item_indices=[0, 1, 2, 3, 4],
    timestamp=[0, 1, 2, 3, 4],
    ratings=[1, 4, 5, 3, 2],
)
df = pd.DataFrame(data)

# Build length-3 sequences, padded with -1.
toy_sequence_kwargs = {
    'user_col': 'user_indices',
    'item_col': 'item_indices',
    'timestamp_col': 'timestamp',
    'sequence_length': 3,
    'padding': True,
    'padding_value': -1,
}
df_with_sequences = generate_item_sequences(df, **toy_sequence_kwargs)

df_with_sequences

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,1,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Run with real data

In [10]:
# Same call as the toy example, now over the combined train + val frame.
# Sequences are built from the mapped `item_indice` column, with the length
# taken from the run configuration and -1 as the padding value.
sequence_kwargs = dict(
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
)
df_with_sequences = generate_item_sequences(full_df, **sequence_kwargs)

In [11]:
# Show the generated sequences without truncating the list column.
sequence_view = df_with_sequences[[args.user_col, 'item_indice', 'item_sequence']]
with pd.option_context("display.max_colwidth", None):
    display(sequence_view)

Unnamed: 0,user_id,item_indice,item_sequence
31239,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,4186,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
103639,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,791,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
147159,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,4400,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
129526,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,2433,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
129527,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,3027,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2433]"
...,...,...,...
49,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,3714,"[1815, 4445, 4013, 1742, 4130, 1565, 1252, 945, 824, 2100]"
240,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,2099,"[-1, -1, -1, -1, -1, 3122, 4550, 2203, 1331, 1351]"
661,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,3726,"[-1, -1, -1, 3575, 3331, 2684, 4352, 3393, 2057, 1100]"
237,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,4298,"[4328, 4344, 3319, 2180, 2315, 3243, 1797, 979, 3241, 3989]"


In [12]:
# Pick one user to eyeball. The draw is seeded with the configured seed so the
# same user comes up on every Restart & Run All, not a different one each run.
user_id = (
    df_with_sequences
    .sample(n=1, random_state=args.random_seed)
    [args.user_col]
    .values[0]
)

# That user's interactions in chronological order, next to the sequence built
# for each of them.
(
    df_with_sequences
    .loc[lambda frame: frame[args.user_col].eq(user_id)]
    .sort_values(args.timestamp_col)
    [[args.user_col, 'item_indice', 'item_sequence']]
)

Unnamed: 0,user_id,item_indice,item_sequence


# Item features

## Persist

# Persist

In [13]:
# Undo the earlier concatenation: select each split by its `source` tag and
# drop the tag, which was only needed to tell the splits apart.
source_labels = df_with_sequences['source']
train_features_df = df_with_sequences[source_labels.eq('train')].drop(columns=['source'])
val_features_df = df_with_sequences[source_labels.eq('val')].drop(columns=['source'])

In [14]:
# Inspect the training rows, now carrying the `item_sequence` column.
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
31239,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,9784,4186,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
103639,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,14535,791,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147159,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,19437,4400,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
129526,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,11393,2433,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
129527,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,11393,3027,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2433]"
...,...,...,...,...,...,...,...,...,...,...,...,...
69639,AHBJMDNN464YPLDJI63F4AUHFTRA,B001UQ7042,5.0,1628618305674,Video Games,Madden NFL 10 - Playstation 3,"[Product Description, Every Sunday in the NFL,...","[Video Games, Legacy Systems, PlayStation Syst...",19.99,18453,4472,"[-1, -1, -1, -1, -1, -1, 552, 3537, 2684, 255]"
96825,AEIOF6FTMTA5EV5GFUPWIUFSXYQA,B08MBMWTYR,5.0,1628619099125,Video Games,Rune Factory 4 - Nintendo 3DS,[Rune Factory 4 marks the return of the popula...,"[Video Games, Legacy Systems, Nintendo Systems...",59.88,14006,324,"[-1, -1, -1, -1, 1658, 3337, 1973, 1323, 155, ..."
161827,AEIO4SZ4VYDYULGKNW2DBUJF67RA,B0C37RBK2R,4.0,1628629065982,Video Games,Xbox Series S,"[Introducing the Xbox Series S, the smallest, ...",[],279.0,17751,238,"[-1, -1, -1, -1, -1, 4035, 3485, 3718, 2939, 2..."
161963,AHYYZI32KE37AI3HVZL7XL4RFYPA,B0B1PB5L93,5.0,1628640617102,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,19237,3726,"[-1, -1, -1, -1, 1927, 3496, 3109, 1357, 1975,..."


In [15]:
# Inspect the validation rows, now carrying the `item_sequence` column.
val_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
620,AEN7JFLQCURF54WR5OHY7HOWWMSQ,B08FC5TTBF,5.0,1628644724721,Video Games,Demon's Souls - PlayStation 5,[From Bluepoint Games comes a remake of the Pl...,"[Video Games, PlayStation 5, Games]",29.99,7415,988,"[-1, -1, -1, -1, 1703, 183, 2681, 2488, 627, 3..."
843,AELH2ZF5QSSIFBF6WXAZLCF7JIWA,B0C6DH316S,2.0,1628653733506,Computers,Logitech G PRO X Wireless Lightspeed Gaming He...,[],"[Video Games, PC, Accessories, Headsets]",253.82,1676,1546,"[-1, -1, -1, -1, 849, 2073, 2331, 1790, 3011, ..."
210,AGD4QHNPSC45XTUPSUE6TYQOF3WQ,B0BN5DC36N,5.0,1628679010802,Computers,Seagate Horizon Forbidden West Limited Edition...,[Discover new worlds with the officially-licen...,"[Video Games, Legacy Systems, PlayStation Syst...",89.99,14105,1600,"[2555, 2792, 1193, 1155, 3427, 2042, 2455, 332..."
514,AFMOSTKHH2HFLI35E3YMI7GLYDCQ,B07KRWJCQW,5.0,1628687441776,Video Games,$40 Xbox Gift Card [Digital Code],[Buy an Xbox Gift Card for yourself or a frien...,"[Video Games, Online Game Services, Xbox Live,...",40.0,3315,2133,"[-1, -1, 1992, 3960, 2081, 1913, 3989, 591, 45..."
938,AGK34QNFABMBLRESDKG2VRC3VIIQ,B0BL65X86R,5.0,1628702768435,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,15589,1826,"[3130, 1987, 793, 1346, 3087, 320, 4158, 2475,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
49,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,9944,3714,"[1815, 4445, 4013, 1742, 4130, 1565, 1252, 945..."
240,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,11860,2099,"[-1, -1, -1, -1, -1, 3122, 4550, 2203, 1331, 1..."
661,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,18592,3726,"[-1, -1, -1, 3575, 3331, 2684, 4352, 3393, 205..."
237,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,2133,4298,"[4328, 4344, 3319, 2180, 2315, 3243, 1797, 979..."


In [16]:
# Write both feature frames to the shared data folder, leaving the row index
# out of the files.
feature_outputs = (
    (train_features_df, "../data/train_features.parquet"),
    (val_features_df, "../data/val_features.parquet"),
)
for features_df, output_path in feature_outputs:
    features_df.to_parquet(output_path, index=False)