In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from pydantic import BaseModel

# Put the parent directory first on the module search path; the `src.*`
# imports used by later cells presumably resolve through it -- TODO confirm
# the notebook is always launched from its own directory.
sys.path.insert(0, '..')

# Controller

In [2]:
class Args(BaseModel):
    """Run-level configuration for this notebook.

    All fields have defaults, so ``Args()`` is a complete configuration.
    Call :meth:`init` once after construction to fill in the derived
    ``notebook_persist_dp`` path.
    """

    # When True, `init` skips creating the persist directory on disk.
    testing: bool = True
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '055-increase-sample-users-to-30K'
    # Derived in `init`; it is None until then, so the annotation must be
    # Optional[str] rather than a bare str with a None default.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names used to address the interaction frames.
    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    # Forwarded as `sequence_length` when item sequences are generated.
    sequence_length: int = 10

    def init(self):
        """Resolve ``notebook_persist_dp`` and, outside testing, create it.

        The path is ``data/<run_name>`` made absolute against the current
        working directory. The directory is only created when ``testing``
        is False.

        Returns:
            Args: ``self``, so construction and initialisation can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
# Instantiate the configuration, then resolve its derived path in a second
# step (init mutates the instance and hands the same object back).
args = Args()
args.init()

print(args.model_dump_json(indent=2))

{
  "testing": true,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "055-increase-sample-users-to-30K",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/055-increase-sample-users-to-30K",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
from src.id_mapper import IDMapper
from src.train_utils import map_indice

In [4]:
# Read the train and validation splits, in that order, from the data directory.
train_df, val_df = map(
    pd.read_parquet,
    ("../data/train.parquet", "../data/val.parquet"),
)

In [5]:
# Collect the distinct raw user / item identifiers seen in the training split.
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values
# pd.unique keeps first-appearance order, so both lists are identical on every
# run over the same parquet file. Building them with list(set(...)) instead
# follows set iteration order, which for string ids changes between
# interpreter sessions (hash randomisation). If IDMapper.fit assigns indices
# in list order, that would make the id -> index mapping differ from run to
# run -- TODO confirm how fit orders its inputs.
unique_user_ids = pd.unique(user_ids).tolist()
unique_item_ids = pd.unique(item_ids).tolist()
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [6]:
# Run both splits through map_indice with the fitted mapper and the raw
# user / item column names (the frame displayed afterwards carries
# user_indice / item_indice columns).
id_columns = (args.user_col, args.item_col)
train_df = map_indice(train_df, idm, *id_columns)
val_df = map_indice(val_df, idm, *id_columns)

In [7]:
# Tag every row with the split it came from, then stack train on top of val.
labelled_splits = [
    split_df.assign(source=split_name)
    for split_name, split_df in (('train', train_df), ('val', val_df))
]
full_df = pd.concat(labelled_splits, axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source
23,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0920668372,5.0,1430056169000,13605,107078,train
24,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1589255208,5.0,1443926150000,13605,67200,train
25,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764322836,5.0,1463967052000,13605,51036,train
26,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764330898,5.0,1489085694000,13605,135590,train
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0062380761,5.0,1526591330983,13605,172958,train
...,...,...,...,...,...,...,...
425449,AGVLANGEASFXMR6VIQL5A7E7FARA,1401270727,1.0,1632619642935,29397,155780,val
425450,AGVLANGEASFXMR6VIQL5A7E7FARA,1779509510,5.0,1641951824343,29397,3916,val
425486,AFNXBSDR2QYHGIQ2GZK2WE7BZQIQ,1626347123,5.0,1654723852802,12109,185023,val
425660,AG2OONX4BPYUIAF4BY4LRVN4VCAA,1777488206,2.0,1646404539903,10333,145180,val


In [8]:
# Save the fitted mapper, then rebind `idm` to whatever `load` returns for
# the same path.
idm_path = "../data/idm.json"
idm.save(idm_path)
idm = IDMapper().load(idm_path)

# Item sequence

## Test implementation

In [9]:
from src.sequence_utils import generate_item_sequences

# Toy interactions: user 0 has two events, user 1 has three.
data = dict(
    user_indices=[0, 0, 1, 1, 1],
    item_indices=[0, 1, 2, 3, 4],
    timestamp=[0, 1, 2, 3, 4],
    ratings=[1, 4, 5, 3, 2],
)
df = pd.DataFrame(data)

# Build length-3 item sequences on the toy frame, padding with -1.
df_with_sequences = generate_item_sequences(
    df,
    sequence_length=3,
    padding=True, padding_value=-1,
    user_col='user_indices',
    item_col='item_indices',
    timestamp_col='timestamp',
)

df_with_sequences

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,1,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Run with real data

In [10]:
# Same call as the toy check, now over the combined train + val frame.
# Sequences are keyed by the raw user column and the mapped item index.
sequence_options = {
    "user_col": args.user_col,
    "item_col": "item_indice",
    "timestamp_col": args.timestamp_col,
    "sequence_length": args.sequence_length,
    "padding": True,
    "padding_value": -1,
}
df_with_sequences = generate_item_sequences(full_df, **sequence_options)

In [11]:
# Temporarily lift the column-width limit so item_sequence cells are not
# truncated in the rendered table.
untruncated_cells = pd.option_context("display.max_colwidth", None)
with untruncated_cells:
    display(df_with_sequences)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source,item_sequence
3974366,AH7OMXSRNKMM3GF6PQGHQEU4XYAQ,0449208281,5.0,854697682000,4627,31034,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
4012198,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,038097505X,5.0,871307181000,22546,193195,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
4012199,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,0345311809,5.0,873311379000,22546,1640,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 193195]"
4012200,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,014014739X,5.0,876331947000,22546,20797,train,"[-1, -1, -1, -1, -1, -1, -1, -1, 193195, 1640]"
1081177,AE6FP5GZNTBK6QKAGXFEDIJYO6MA,0446343552,5.0,876423124000,25431,183951,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
...,...,...,...,...,...,...,...,...
100808,AGYS63VGRKADXYMC57GEP6MG6W7Q,164845044X,5.0,1657997738446,13970,59763,val,"[51819, 123868, 37386, 85668, 146722, 3008, 166998, 18359, 56815, 19678]"
260949,AGNG7EAGWLOJPLH3AXKSJYFQIMJA,B002HMJZAA,5.0,1657997929759,23545,152753,val,"[88586, 79531, 190768, 92009, 78222, 52838, 139285, 162565, 87805, 31627]"
294,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B07R3QYGHY,4.0,1657998389024,20880,152734,val,"[-1, -1, -1, -1, 24006, 136264, 49218, 6292, 82726, 150950]"
118253,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,14334,162235,val,"[-1, -1, -1, -1, -1, -1, 76258, 18569, 643, 171112]"


In [12]:
# Spot-check one user's rows in chronological order.
df_with_sequences.loc[
    df_with_sequences['user_id'] == 'AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA'
].sort_values('timestamp')

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,source,item_sequence
2394173,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,0609604724,5.0,1422632235000,14334,76258,train,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2394174,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,152350076X,5.0,1523466928907,14334,18569,train,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 76258]"
2394175,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B08FJ2L33W,5.0,1624302398231,14334,643,train,"[-1, -1, -1, -1, -1, -1, -1, -1, 76258, 18569]"
2394176,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B07CL2HDYD,5.0,1626054150855,14334,171112,train,"[-1, -1, -1, -1, -1, -1, -1, 76258, 18569, 643]"
118253,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,14334,162235,val,"[-1, -1, -1, -1, -1, -1, 76258, 18569, 643, 17..."


# Persist

In [13]:
# Split the combined frame back by its `source` tag and drop the tag column.
source_labels = df_with_sequences['source']
train_features_df = df_with_sequences.loc[source_labels == 'train'].drop(columns=['source'])
val_features_df = df_with_sequences.loc[source_labels == 'val'].drop(columns=['source'])

In [14]:
# Preview the training feature frame (rows tagged 'train', tag column dropped).
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
3974366,AH7OMXSRNKMM3GF6PQGHQEU4XYAQ,0449208281,5.0,854697682000,4627,31034,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4012198,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,038097505X,5.0,871307181000,22546,193195,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4012199,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,0345311809,5.0,873311379000,22546,1640,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 193195]"
4012200,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,014014739X,5.0,876331947000,22546,20797,"[-1, -1, -1, -1, -1, -1, -1, -1, 193195, 1640]"
1081177,AE6FP5GZNTBK6QKAGXFEDIJYO6MA,0446343552,5.0,876423124000,25431,183951,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...
1493715,AG4S63VXGYJ2K4GK72KWRDAMOYLQ,0778389448,5.0,1628642557237,23742,156653,"[35686, 70117, 167206, 173005, 84788, 62675, 6..."
2867061,AEVGOD43QHFLLYQDJ3ULHCR7SRHA,B07QYY1NN5,5.0,1628643017117,7139,147175,"[187865, 71294, 46250, 175190, 61367, 59506, 1..."
4179132,AEJLY6U3ZNLMLEKL335XMYQHZJ3A,0228821509,5.0,1628643125407,8028,57185,"[-1, -1, 181296, 67681, 113756, 114701, 98870,..."
2854133,AHNO3QRJHUSFVW3Z5FGLCPTNPNYQ,0995507546,5.0,1628643153253,610,51360,"[-1, -1, -1, -1, -1, -1, -1, 21532, 176154, 18..."


In [15]:
# Preview the validation feature frame (rows tagged 'val', tag column dropped).
val_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
7124,AE6JMAGKKXB3XWFTMMIPPHHJJCOA,B083RYQX1G,5.0,1628643785278,23411,13652,"[105365, 31426, 138458, 55459, 166024, 129077,..."
254264,AHNJTA332IXU4EBKXODY6TTXJ7YQ,1771682108,4.0,1628644118159,21898,98341,"[9052, 32987, 16332, 108422, 132834, 190618, 8..."
36393,AH25F2IC4B4SJ3ECHVTUYBXM77PQ,0679763880,5.0,1628645112126,20680,154391,"[-1, -1, -1, -1, -1, 50050, 17849, 155616, 175..."
166111,AFTNOHMVDU5HTTDLH6FYV4ZTAAEA,0316554766,5.0,1628645263328,22524,120683,"[-1, -1, -1, -1, 123385, 19770, 184551, 8875, ..."
388347,AGYX6U2NCON4S7PODNWB2UCDMEKQ,B08LVSC93V,5.0,1628645430352,28296,113079,"[71094, 182571, 36393, 91658, 126858, 123390, ..."
...,...,...,...,...,...,...,...
100808,AGYS63VGRKADXYMC57GEP6MG6W7Q,164845044X,5.0,1657997738446,13970,59763,"[51819, 123868, 37386, 85668, 146722, 3008, 16..."
260949,AGNG7EAGWLOJPLH3AXKSJYFQIMJA,B002HMJZAA,5.0,1657997929759,23545,152753,"[88586, 79531, 190768, 92009, 78222, 52838, 13..."
294,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B07R3QYGHY,4.0,1657998389024,20880,152734,"[-1, -1, -1, -1, 24006, 136264, 49218, 6292, 8..."
118253,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,14334,162235,"[-1, -1, -1, -1, -1, -1, 76258, 18569, 643, 17..."


In [16]:
# Write both feature frames to parquet without the DataFrame index.
for features_df, features_path in (
    (train_features_df, "../data/train_features.parquet"),
    (val_features_df, "../data/val_features.parquet"),
):
    features_df.to_parquet(features_path, index=False)