In [1]:
import os
import sys
from pydantic import BaseModel
import pandas as pd

sys.path.insert(0, '..')

# Controller

In [2]:
class Args(BaseModel):
    run_name: str = '056-small-rich-dataset'
    testing: bool = True
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'
    
    sequence_length: int = 10

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "run_name": "056-small-rich-dataset",
  "testing": true,
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/056-small-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
from src.id_mapper import IDMapper
from src.train_utils import map_indice

In [4]:
train_df = pd.read_parquet("../data/train_item_features.parquet")
val_df = pd.read_parquet("../data/val_item_features.parquet")

In [5]:
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [6]:
train_df = train_df.pipe(map_indice, idm, args.user_col, args.item_col)
val_df = val_df.pipe(map_indice, idm, args.user_col, args.item_col)

In [7]:
full_df = pd.concat([train_df.assign(source='train'), val_df.assign(source='val')], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,source
0,AHV6QCNBJNSGLATP56JAWJ3C4G2A,B019WRM1IA,5.0,1451860309000,All Electronics,Microsoft Xbox 360 Wired Controller for Window...,"[Product Description, Precisely what you need ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",67.83,4091,222,train
1,AHV6QCNBJNSGLATP56JAWJ3C4G2A,B08DHTZNNF,4.0,1538135312132,Computers,"Razer Mamba Elite Wired Gaming Mouse: 16,000 D...",[The Razer Mamba Elite features our acclaimed ...,"[Video Games, Legacy Systems, PlayStation Syst...",36.95,4091,628,train
2,AHV6QCNBJNSGLATP56JAWJ3C4G2A,B0C3MZ128V,5.0,1554755712398,Computers,Razer BlackWidow TE Chroma v2 TKL Tenkeyless M...,[The Razer BlackWidow Tournament Edition Chrom...,"[Video Games, PC, Accessories, Gaming Keyboards]",109.99,4091,1590,train
3,AHV6QCNBJNSGLATP56JAWJ3C4G2A,B0BL3CW73P,4.0,1556830735236,Computers,"Logitech G332 Wired Gaming Headset, Rotating L...",[logitech G332 gaming headset lets you hear mo...,"[Video Games, PC, Accessories, Headsets]",29.99,4091,816,train
4,AHV6QCNBJNSGLATP56JAWJ3C4G2A,B07SBX48TY,4.0,1569958593706,Video Games,PowerA Spectra Enhanced Illuminated Wired Cont...,[Spectra has all the great features of our pop...,"[Video Games, Xbox One, Accessories, Controlle...",60.36,4091,1773,train
...,...,...,...,...,...,...,...,...,...,...,...,...
392,AGK34QNFABMBLRESDKG2VRC3VIIQ,B0BL65X86R,5.0,1628702768435,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,7213,1220,val
393,AE3NRCMFIBBA2XVODR47YYNLKRDA,B001EYUQC8,5.0,1636797586634,Video Games,007 Quantum Of Solace - Playstation 3,[James Bond is back to settle the score in Qua...,"[Video Games, Legacy Systems, PlayStation Syst...",44.49,2804,1382,val
394,AGL65D42J6ROKDETDZD45MR43CNQ,B09M6YFK7Y,5.0,1649894828982,Computers,Razer Basilisk X HyperSpeed Wireless Gaming Mo...,[Up To 450 Hr Battery Life: Lasts 450 Hours On...,"[Video Games, PC, Accessories, Gaming Mice]",,6979,2213,val
395,AFUYEENYZ5C7PRAX7HJ5QNJ5LOCA,B07CRX5X9T,2.0,1639466517299,Computers,UtechSmart Venus Pro RGB Wireless MMO Gaming M...,[],"[Video Games, PC, Accessories, Gaming Mice]",47.99,5755,1041,val


In [8]:
idm.save("../data/idm.json")
idm = IDMapper().load("../data/idm.json")

# Item sequence

## Test implementation

In [9]:
from src.sequence_utils import generate_item_sequences

# Sample DataFrame
data = {
    'user_indices': [0, 0, 1, 1, 1],
    'item_indices': [0, 1, 2, 3, 4],
    'timestamp': [0, 1, 2, 3, 4],
    'ratings': [1, 4, 5, 3, 2]
}

df = pd.DataFrame(data)

# Generate the item sequences
df_with_sequences = generate_item_sequences(
    df,
    user_col='user_indices',
    item_col='item_indices',
    timestamp_col='timestamp',
    sequence_length=3,
    padding=True,
    padding_value=-1
)

df_with_sequences

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,1,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Run with real data

In [10]:
df_with_sequences = generate_item_sequences(
    full_df,
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1
)

In [11]:
with pd.option_context("display.max_colwidth", None):
    display(df_with_sequences[[args.user_col, 'item_indice', 'item_sequence']])

Unnamed: 0,user_id,item_indice,item_sequence
39247,AHBOFFMSOTEBSVHOOD232ZHFCEWQ,2587,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
24537,AGSZX3YVKUDHRIBKZUMYWM5KNXLQ,2083,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
76748,AGAHLNZNWZMCGFM552XFS6XRAJ5Q,188,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
76749,AGAHLNZNWZMCGFM552XFS6XRAJ5Q,8,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 188]"
81333,AEUJQYSQIJJ2RVKNTC7J5NLR2JPA,118,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
...,...,...,...
154,AG6MENO5OO7LRCH27J47ZODEZN6Q,2203,"[-1, -1, 2569, 1380, 1344, 1342, 2224, 1328, 568, 688]"
55,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,1347,"[691, 1944, 2013, 456, 395, 863, 2152, 2439, 2630, 1701]"
56,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,2469,"[1944, 2013, 456, 395, 863, 2152, 2439, 2630, 1701, 1347]"
32,AFDL3ZQE4ARYEEBBH2KAPMP4NSHQ,1580,"[-1, -1, -1, -1, -1, 1778, 745, 598, 1358, 684]"


In [12]:
(
    df_with_sequences
    .loc[lambda df: df['user_id'].eq('AFGMAHIIZCOPLZRHYSCSHNBYHKLA')].sort_values('timestamp')
    [[args.user_col, 'item_indice', 'item_sequence']]
)

Unnamed: 0,user_id,item_indice,item_sequence


# Persist

In [13]:
train_features_df = df_with_sequences.loc[lambda df: df['source'].eq('train')].drop(columns=['source'])
val_features_df = df_with_sequences.loc[lambda df: df['source'].eq('val')].drop(columns=['source'])

In [14]:
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
39247,AHBOFFMSOTEBSVHOOD232ZHFCEWQ,B00001IVRD,4.0,961125766000,Video Games,StarCraft Battle Chest - PC/Mac,"[Product description, StarCraft Battle Chest, ...","[Video Games, PC, Games]",81.88,5347,2587,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
24537,AGSZX3YVKUDHRIBKZUMYWM5KNXLQ,B001EYUPLK,2.0,963407426000,Video Games,Diablo 2: Collector's Edition,"[Amazon.com, In, Diablo II,, players return to...","[Video Games, PC, Games]",,4249,2083,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
76748,AGAHLNZNWZMCGFM552XFS6XRAJ5Q,B00000J9J9,5.0,963523537000,Video Games,Super Mario Bros. Deluxe,"[Product Description, Recapture the magic of t...","[Video Games, Legacy Systems, Nintendo Systems...",51.49,6257,188,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
76749,AGAHLNZNWZMCGFM552XFS6XRAJ5Q,B00002STEZ,4.0,963524433000,Video Games,Donkey Kong 64 - Nintendo 64,"[Product Description, Every so often a game en...","[Video Games, Legacy Systems, Nintendo Systems...",,6257,8,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 188]"
81333,AEUJQYSQIJJ2RVKNTC7J5NLR2JPA,B00000DMAX,4.0,965368820000,Video Games,Mario Kart 64,"[Product description, Three... Two... One... G...","[Video Games, Legacy Systems, Nintendo Systems...",68.99,8921,118,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
23650,AGIDOYPQBICZKEB4JXOHBAWTVLZQ,B095WDDH4N,1.0,1628360926517,Video Games,PowerA FUSION Pro Wired Controller for Xbox On...,[Get the competitive edge with the FUSION Pro ...,"[Video Games, Xbox One, Accessories, Controlle...",,8729,1302,"[-1, -1, -1, -1, -1, -1, 1154, 1343, 1026, 2611]"
58668,AHXGULVDCWQKTANFRWLCOPOKOBMQ,B001EYUNVC,4.0,1628461594076,Video Games,The Legend of Zelda: Twilight Princess,"[Product description, The Legend Of Zelda: Twi...","[Video Games, Legacy Systems, Nintendo Systems...",49.49,10254,361,"[-1, -1, -1, -1, -1, -1, 224, 2442, 1831, 1733]"
8460,AHRRITGXKEWV2XTSIG3OA2EO66AA,B07MWB5YJW,5.0,1628483465852,Video Games,HORI Game Card Case 24 for Nintendo Switch Off...,[Officially Licensed by Nintendo. Store & orga...,"[Video Games, Nintendo Switch, Accessories, Ca...",9.99,3552,1443,"[-1, -1, -1, 803, 324, 284, 1724, 63, 710, 1419]"
15572,AECVYUJXHQ4IEJ6M22QGVBULSZJQ,B0BVVTQ5JP,2.0,1628538995441,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,4502,1202,"[-1, -1, -1, -1, -1, -1, 2530, 1092, 2140, 1841]"


In [15]:
val_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
365,AELH2ZF5QSSIFBF6WXAZLCF7JIWA,B0C6DH316S,2.0,1628653733506,Computers,Logitech G PRO X Wireless Lightspeed Gaming He...,[],"[Video Games, PC, Accessories, Headsets]",253.82,2998,752,"[-1, -1, -1, -1, 1321, 2747, 1527, 1253, 1089,..."
392,AGK34QNFABMBLRESDKG2VRC3VIIQ,B0BL65X86R,5.0,1628702768435,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,7213,1220,"[1263, 687, 1106, 279, 1722, 773, 2657, 1949, ..."
325,AF5WIG3GZRQ7L6NLPY2WBXD4TGSQ,B07Z9Z39ZW,5.0,1628746463751,Video Games,Witcher 3: Wild Hunt Complete Edition - Ninten...,[],"[Video Games, Nintendo Switch, Games]",41.54,9243,1141,"[576, 428, 367, 1088, 1002, 1013, 2319, 2589, ..."
358,AFVZNKZMULXWCFCOLJGM7O6LT63A,B00VPW0DYU,5.0,1628802786230,Video Games,Call of Duty: Advanced Warfare Atlas Pro Editi...,[],"[Video Games, Xbox One, Games]",144.59,41,164,"[-1, -1, 660, 1795, 2558, 305, 1915, 2222, 216..."
76,AHAIICWIZT6PYSS5QJNFYP6ZXLCA,B0B92WG8LS,1.0,1628811542081,Video Games,HORI Nintendo Switch HORIPAD Wired Controller ...,[Officially licensed by Nintendo. After three ...,"[Video Games, Nintendo Switch, Accessories, Co...",19.99,690,140,"[-1, -1, -1, 2515, 1219, 2490, 1764, 1552, 200..."
...,...,...,...,...,...,...,...,...,...,...,...,...
154,AG6MENO5OO7LRCH27J47ZODEZN6Q,B0728H6KPV,5.0,1657815635870,Cell Phones & Accessories,"Charger for Nintendo Switch, YCCTEAM Charger A...",[],"[Video Games, Legacy Systems, Nintendo Systems...",13.99,50,2203,"[-1, -1, 2569, 1380, 1344, 1342, 2224, 1328, 5..."
55,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B0716CXJ1R,5.0,1657835528863,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,4352,1347,"[691, 1944, 2013, 456, 395, 863, 2152, 2439, 2..."
56,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B07D13QGXM,5.0,1657836034795,Video Games,Minecraft - Nintendo Switch,[Minecraft is a game about placing blocks and ...,"[Video Games, Nintendo Switch, Games]",29.99,4352,2469,"[1944, 2013, 456, 395, 863, 2152, 2439, 2630, ..."
32,AFDL3ZQE4ARYEEBBH2KAPMP4NSHQ,B0795GHTBC,5.0,1657910674213,All Electronics,ivoler [3 Pack Screen Protector Tempered Glass...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",9.39,4522,1580,"[-1, -1, -1, -1, -1, 1778, 745, 598, 1358, 684]"


In [16]:
train_features_df.to_parquet("../data/train_features.parquet", index=False)
val_features_df.to_parquet("../data/val_features.parquet", index=False)