In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from pydantic import BaseModel

sys.path.insert(0, '..')

# Controller

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook.

    The fields hold plain defaults. Call :meth:`init` once after construction
    to resolve the derived ``notebook_persist_dp`` path.
    """

    # When True, init() does not create the persist directory on disk.
    testing: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '050-sequence'
    # Absolute output directory for this run. It is None until init() fills it
    # in, hence Optional rather than a bare ``str``.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Names of the columns in the interaction frames.
    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    # Forwarded as ``sequence_length`` to generate_item_sequences for the real-data run.
    sequence_length: int = 10

    def init(self):
        """Resolve ``notebook_persist_dp`` and create it unless in testing mode.

        The path is ``data/<run_name>`` made absolute against the current
        working directory.

        Returns:
            The same instance, so construction and initialisation can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
# Build the configuration, resolve its derived path, then echo it as JSON so
# the exact settings are recorded alongside the notebook output.
args = Args()
args.init()  # fills notebook_persist_dp in place and returns the same object
print(args.model_dump_json(indent=2))

{
  "testing": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "050-sequence",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/050-sequence",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
from src.id_mapper import IDMapper
from src.train_utils import map_indice

In [4]:
# Read the train and validation interaction splits from ../data.
train_df, val_df = (
    pd.read_parquet("../data/train.parquet"),
    pd.read_parquet("../data/val.parquet"),
)

In [5]:
# Collect the raw user / item identifiers that occur in the training split.
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values
# Sort the de-duplicated IDs before fitting. A plain set of strings iterates in
# an order that changes between interpreter runs (hash randomization), so
# list(set(...)) would hand the mapper a different ID order on every re-run.
# NOTE(review): presumably IDMapper assigns indices in the order it receives
# the IDs, which is why a stable order matters - confirm against src.id_mapper.
unique_user_ids = sorted(set(user_ids))
unique_item_ids = sorted(set(item_ids))
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [6]:
# Run both splits through map_indice with the fitted mapper. Judging by the
# frame displayed afterwards, this adds the user_indice / item_indice columns.
train_df = map_indice(train_df, idm, args.user_col, args.item_col)
val_df = map_indice(val_df, idm, args.user_col, args.item_col)

In [7]:
# Stack both splits into one frame, tagging every row with the split it came
# from so the two can be separated again after the sequences are built.
full_df = pd.concat(
    [
        split_df.assign(source=split_name)
        for split_name, split_df in (('train', train_df), ('val', val_df))
    ],
    axis=0,
)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source
23,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0920668372,5.0,1430056169000,9178,27486,train
24,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1589255208,5.0,1443926150000,9178,46447,train
25,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764322836,5.0,1463967052000,9178,23657,train
26,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764330898,5.0,1489085694000,9178,68285,train
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0062380761,5.0,1526591330983,9178,24241,train
...,...,...,...,...,...,...,...
424800,AHXZ66ATLSPVIW5HC5OTNLYGBDTQ,1416542744,4.0,1645198331443,465,92391,val
424884,AHWBSG5WTNDC47SPUMJTWPIDZ7HQ,B08MQLJ99B,5.0,1629558239986,6416,91343,val
425194,AE5AXNZSQK6R5J2EXFUCFPDPSA6A,1643260448,2.0,1637475668742,9809,66053,val
425440,AFM4K7CAFB2KE6BHWQSS7KEHTWLA,0452282314,5.0,1643339582810,3923,61832,val


In [8]:
# Write the fitted mapper to disk, then rebind `idm` to whatever load() returns
# for that same file, so later cells use the persisted version.
idm_path = "../data/idm.json"
idm.save(idm_path)
idm = IDMapper().load(idm_path)

# Item sequence

## Test implementation

In [9]:
from src.sequence_utils import generate_item_sequences

# Sample DataFrame: user 0 has two interactions and user 1 has three, already
# in timestamp order, so the expected sequences can be written out by hand.
data = {
    'user_indices': [0, 0, 1, 1, 1],
    'item_indices': [0, 1, 2, 3, 4],
    'timestamp': [0, 1, 2, 3, 4],
    'ratings': [1, 4, 5, 3, 2]
}

df = pd.DataFrame(data)

# Generate the item sequences
df_with_sequences = generate_item_sequences(
    df,
    user_col='user_indices',
    item_col='item_indices',
    timestamp_col='timestamp',
    sequence_length=3,
    padding=True,
    padding_value=-1
)

# Pin the expected result so a regression in generate_item_sequences fails
# loudly instead of only changing the displayed table. Each row's sequence
# holds that user's earlier items, left-padded with -1 up to sequence_length.
expected_sequences = [
    [-1, -1, -1],
    [-1, -1, 0],
    [-1, -1, -1],
    [-1, -1, 2],
    [-1, 2, 3],
]
# Compare in original row order, casting to int because the helper emits
# floats for all-padding rows (-1.0) and ints otherwise.
actual_sequences = [
    [int(item) for item in sequence]
    for sequence in df_with_sequences.sort_index()['item_sequence']
]
assert actual_sequences == expected_sequences, actual_sequences

df_with_sequences

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,1,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Run with real data

In [10]:
# Build the item sequences over the combined train+val frame. As the displayed
# output shows, each row receives the padded list of that user's earlier items.
df_with_sequences = generate_item_sequences(
    full_df,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1,
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
)

In [11]:
# Lift the column-width limit for this display only, so every item_sequence
# list is printed in full rather than truncated with "...".
untruncated_cols = pd.option_context("display.max_colwidth", None)
with untruncated_cols:
    display(df_with_sequences)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source,item_sequence
4328100,AFQFGIC62CA6X7B5WNYQJC3DQS6A,037376099X,5.0,878061365000,2021,18576,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
522498,AFJFQKVLBLJLGKHZYUHIDZLGVBDQ,1565922573,5.0,878680832000,2921,30312,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
522499,AFJFQKVLBLJLGKHZYUHIDZLGVBDQ,0449909433,4.0,879712608000,2921,6651,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 30312]"
694545,AFQGSL2NLM3XYV4VU5YCHQZEMFRA,0553571818,4.0,887759677000,6551,42520,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
694546,AFQGSL2NLM3XYV4VU5YCHQZEMFRA,014018869X,5.0,888091095000,6551,17257,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 42520]"
...,...,...,...,...,...,...,...,...
398518,AE7CC33RBTGEOQ2MBIAZDHXEAG7A,B08XQWFMK4,2.0,1657994280406,5841,82836,val,"[-1, -1, -1, -1, -1, -1, -1, -1, 21304, 54333]"
72467,AHRDEE3ZO5VMRWUK7CUILRWSTB7A,1629798266,5.0,1657996230659,9471,53402,val,"[-1, 77791, 13305, 34579, 58946, 40435, 28041, 62180, 34583, 2773]"
294,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B07R3QYGHY,4.0,1657998389024,7843,86266,val,"[-1, -1, -1, -1, 66104, 12441, 57040, 4640, 33429, 51703]"
118253,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,6013,7431,val,"[-1, -1, -1, -1, -1, -1, 42180, 71861, 73082, 44827]"


In [20]:
# Spot-check one user: in time order, each row's sequence should extend the
# previous row's sequence by the item from that previous row. Column names come
# from `args`, as in the other cells, rather than being hard-coded here.
df_with_sequences.loc[
    lambda df: df[args.user_col].eq('AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA')
].sort_values(args.timestamp_col)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source,item_sequence
2394173,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,0609604724,5.0,1422632235000,6013,42180,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2394174,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,152350076X,5.0,1523466928907,6013,71861,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 42180]"
2394175,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B08FJ2L33W,5.0,1624302398231,6013,73082,train,"[-1, -1, -1, -1, -1, -1, -1, -1, 42180, 71861]"
2394176,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B07CL2HDYD,5.0,1626054150855,6013,44827,train,"[-1, -1, -1, -1, -1, -1, -1, 42180, 71861, 73082]"
118253,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,6013,7431,val,"[-1, -1, -1, -1, -1, -1, 42180, 71861, 73082, ..."


# Persist

In [13]:
# Separate the combined frame back into its two splits using the `source` tag,
# then remove that helper column again.
train_features_df = df_with_sequences[df_with_sequences['source'] == 'train'].drop(columns='source')
val_features_df = df_with_sequences[df_with_sequences['source'] == 'val'].drop(columns='source')

In [14]:
# Preview the training rows (source == 'train') with the helper `source` column dropped.
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
4328100,AFQFGIC62CA6X7B5WNYQJC3DQS6A,037376099X,5.0,878061365000,2021,18576,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
522498,AFJFQKVLBLJLGKHZYUHIDZLGVBDQ,1565922573,5.0,878680832000,2921,30312,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
522499,AFJFQKVLBLJLGKHZYUHIDZLGVBDQ,0449909433,4.0,879712608000,2921,6651,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 30312]"
694545,AFQGSL2NLM3XYV4VU5YCHQZEMFRA,0553571818,4.0,887759677000,6551,42520,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
694546,AFQGSL2NLM3XYV4VU5YCHQZEMFRA,014018869X,5.0,888091095000,6551,17257,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 42520]"
...,...,...,...,...,...,...,...
5124146,AHFIKKCH7I4HDAFJOAH2U2TSVI6A,B006E9027M,5.0,1628641294816,1901,87457,"[73008, 2871, 32357, 21279, 68350, 81281, 4868..."
7807503,AE6MFCCWBCTEV7X2PGKSA5373RIQ,B015D4N7RA,3.0,1628641365295,7951,47559,"[35306, 23701, 57268, 85731, 86686, 75233, 905..."
4780096,AHUI4EHZ2F55LQQTKNSHAM3L4UKA,B07Y8HQ783,5.0,1628641444282,7313,53615,"[-1, -1, -1, 24575, 65556, 62132, 69675, 44991..."
4491500,AHYODSBIR26FFB4YCNO7Q5JT765A,0743246985,5.0,1628641576511,3926,40277,"[81343, 1958, 781, 91844, 66693, 87628, 24549,..."


In [15]:
# Preview the validation rows (source == 'val') with the helper `source` column dropped.
val_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
388347,AGYX6U2NCON4S7PODNWB2UCDMEKQ,B08LVSC93V,5.0,1628645430352,7848,5071,"[27607, 70968, 11859, 27274, 76327, 2315, 1189..."
395362,AEFV2MQ4QLBPMIRC5C2KFNTWYCSQ,B095GJ4Q2Z,5.0,1628646638911,1131,87739,"[-1, -1, -1, -1, -1, 52219, 475, 38855, 65766,..."
389710,AELCPC75C7ICIRUUZH7S3Y2DHFOA,0062984934,4.0,1628649596144,728,53924,"[49738, 68294, 19226, 5531, 68471, 56322, 4089..."
330495,AFJGZX2FEYC4DNAVRIBKX73VYNEQ,B07WQPHD3M,5.0,1628650238248,6905,54209,"[49614, 40079, 80395, 54154, 33800, 4357, 4869..."
10383,AEUPP3NZDHQL5XUCIIK763EG6AWA,0152062688,4.0,1628652038773,7246,77975,"[-1, 65326, 9400, 70214, 91230, 15771, 76650, ..."
...,...,...,...,...,...,...,...
398518,AE7CC33RBTGEOQ2MBIAZDHXEAG7A,B08XQWFMK4,2.0,1657994280406,5841,82836,"[-1, -1, -1, -1, -1, -1, -1, -1, 21304, 54333]"
72467,AHRDEE3ZO5VMRWUK7CUILRWSTB7A,1629798266,5.0,1657996230659,9471,53402,"[-1, 77791, 13305, 34579, 58946, 40435, 28041,..."
294,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B07R3QYGHY,4.0,1657998389024,7843,86266,"[-1, -1, -1, -1, 66104, 12441, 57040, 4640, 33..."
118253,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,6013,7431,"[-1, -1, -1, -1, -1, -1, 42180, 71861, 73082, ..."


In [16]:
# Write both feature frames next to the input splits. index=False leaves the
# original row labels out of the files.
# NOTE(review): this runs even when args.testing is True - confirm that is intended.
train_features_df.to_parquet(path="../data/train_features.parquet", index=False)
val_features_df.to_parquet(path="../data/val_features.parquet", index=False)