In [1]:
import os
import sys
from pydantic import BaseModel
import pandas as pd

sys.path.insert(0, '..')

# Controller

In [2]:
class Args(BaseModel):
    run_name: str = '062-medium-rich-dataset'
    testing: bool = True
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'
    
    sequence_length: int = 10

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "run_name": "062-medium-rich-dataset",
  "testing": true,
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/062-medium-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
from src.id_mapper import IDMapper
from src.train_utils import map_indice
from datasets import load_dataset

In [4]:
metadata_raw = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Video_Games", trust_remote_code=True)
metadata_raw_df = metadata_raw['full'].to_pandas()
metadata_raw_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Video Games,Dash 8-300 Professional Add-On,5.0,1,[Features Dash 8-300 and 8-Q300 ('Q' rollout l...,[The Dash 8-300 Professional Add-On lets you p...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Aerosoft,"[Video Games, PC, Games]","{""Pricing"": ""The strikethrough price is the Li...",B000FH0MHO,,,
1,Video Games,Phantasmagoria: A Puzzle of Flesh,4.1,18,[Windows 95],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Sierra,"[Video Games, PC, Games]","{""Best Sellers Rank"": {""Video Games"": 137612, ...",B00069EVOG,,,
2,Video Games,NBA 2K17 - Early Tip Off Edition - PlayStation 4,4.3,223,[The #1 rated NBA video game simulation series...,[Following the record-breaking launch of NBA 2...,58.0,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['NBA 2K17 - Kobe: Haters vs Players...,2K,"[Video Games, PlayStation 4, Games]","{""Release date"": ""September 16, 2016"", ""Best S...",B00Z9TLVK0,,,
3,Video Games,Nintendo Selects: The Legend of Zelda Ocarina ...,4.9,22,[Authentic Nintendo Selects: The Legend of Zel...,[],37.42,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Amazon Renewed,"[Video Games, Legacy Systems, Nintendo Systems...","{""Best Sellers Rank"": {""Video Games"": 51019, ""...",B07SZJZV88,,,
4,Video Games,Thrustmaster Elite Fitness Pack for Nintendo Wii,3.0,3,"[Includes (9) Total Accessories, Pedometer, Wi...",[The Thrustmaster Motion Plus Elite Fitness Pa...,,"{'hi_res': [None, None, None, None, None, None...","{'title': [], 'url': [], 'user_id': []}",THRUSTMASTER,"[Video Games, Legacy Systems, Nintendo Systems...","{""Release date"": ""November 1, 2009"", ""Pricing""...",B002WH4ZJG,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137264,,Story of Seasons: Pioneers Of Olive Town (Nint...,4.5,397,[A wild world of discovery - tame the wilderne...,"[Product Description, Inspired by Tales of you...",31.04,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Marvelous Europe,"[Video Games, Nintendo Switch, Games]","{""Release date"": ""March 26, 2021"", ""Best Selle...",B09XQJS4CZ,,,
137265,Video Games,MotoGP 18 (PC DVD) UK IMPORT REGION FREE,4.0,1,[Brand new game engine - MotoGP18 has been reb...,[Become the champion of the 2018 MotoGP Season...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Milestone,"[Video Games, Game Genre of the Month]","{""Pricing"": ""The strikethrough price is the Li...",B07DGPTGNV,,,
137266,Cell Phones & Accessories,Century Accessory Soft Silicone Protective Ski...,2.9,19,"[Easy access to all buttons, controls and port...",[This soft case cover will add a splash of col...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Century Accessory,"[Video Games, Legacy Systems, Xbox Systems, Xb...","{""Package Dimensions"": ""2.76 x 2.76 x 0.2 inch...",B00HUWCQBW,,,
137267,,Hasbro Interactive Mr. Potato Head Activity Pa...,3.9,5,[],"[Amazon.com, Everyone's favorite master-of-dis...",,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Hasbro,"[Video Games, PC, Games]","{""Release date"": ""July 24, 1999"", ""Best Seller...",B00002S9MH,,,


In [5]:
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

In [6]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
54,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,1321885664000
55,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,1408233606000
62,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,1511753174174
63,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,1531092820696
69,AHATA6X6MYTC3VNBFJ3WIYVK257A,B07DHNX18W,4.0,1604348335046
...,...,...,...,...
736577,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B001T0HVGQ,4.0,1197406399000
736578,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B001EBBGTS,5.0,1204415647000
736579,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B000OCXK6A,4.0,1208216992000
736580,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B001G6062E,4.0,1214532340000


In [7]:
user_ids = train_df[args.user_col].values
item_ids = train_df[args.item_col].values
unique_user_ids = list(set(user_ids))
unique_item_ids = list(set(item_ids))
idm = IDMapper()
idm.fit(unique_user_ids, unique_item_ids)

In [8]:
train_df = train_df.pipe(map_indice, idm, args.user_col, args.item_col)
val_df = val_df.pipe(map_indice, idm, args.user_col, args.item_col)

In [9]:
idm.save("../data/idm.json")
idm = IDMapper().load("../data/idm.json")

# Item features

In [10]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline

In [11]:
# Custom function to reshape data from 1D to 2D
def reshape_2d_to_1d(X):
    return np.array(X).reshape(-1)

def flatten_string_array_col(X):
    return np.array(['\n'.join(x) for x in X])

def tfidf_pipeline_steps():
    steps = [
        ('impute', SimpleImputer(strategy='constant', fill_value='')),
        ('reshape', FunctionTransformer(reshape_2d_to_1d, validate=False)),
        ('tfidf', TfidfVectorizer(min_df=5, max_features=500, ngram_range=(1, 2)))
    ]
    return steps

def description_pipeline_steps():
    steps = [
        ('flatten_string_array_col', FunctionTransformer(flatten_string_array_col, validate=False)),
        ('tfidf', TfidfVectorizer(min_df=5, max_features=500, ngram_range=(1, 2)))
    ]
    return steps

def tokenizer(s):
    return s.split('\n')

def categories_pipeline_steps():
    steps = [
        ('flatten_string_array_col', FunctionTransformer(flatten_string_array_col, validate=False)),
        ('tfidf', CountVectorizer(tokenizer=tokenizer, token_pattern=None))
    ]
    return steps

data = pd.Series(["from 14.99", "14.99", "price: 9.99", "20 dollars", "none"])
price_pattern = r'\b((?:\d+\.\d*)|(?:\d+))\b'
display(data.str.extract(price_pattern))

def price_parse_dtype(series, pattern):
    return series.str.extract(pattern).astype(float)

def price_pipeline_steps(price_pattern):
    steps = [
        ('extract_price', FunctionTransformer(price_parse_dtype, kw_args=dict(pattern=price_pattern), validate=False)),
        ('impute', SimpleImputer(strategy='constant', fill_value=0)),
        ('min_max_scale', MinMaxScaler())
    ]
    return steps

Unnamed: 0,0
0,14.99
1,14.99
2,9.99
3,20.0
4,


In [12]:
tfm = [
    ('main_category', OneHotEncoder(handle_unknown='ignore'), ['main_category']),  # One-hot encoding for categorical field
    ('title', Pipeline(tfidf_pipeline_steps()), ['title']),  # TF-IDF vectorizer for text field
    ('description', Pipeline(description_pipeline_steps()), 'description'),  # TF-IDF vectorizer for another text field
    ('categories', Pipeline(categories_pipeline_steps()), 'categories'),  # Count Vectorizer for multi-label categorical
    ('price', Pipeline(price_pipeline_steps(price_pattern)), 'price')  # Normalizing price
]
cols = [x[0] for x in tfm]
cols

['main_category', 'title', 'description', 'categories', 'price']

In [13]:
train_features_df = pd.merge(train_df, metadata_raw_df[['parent_asin'] + cols], how='left', on='parent_asin')
val_features_df = pd.merge(val_df, metadata_raw_df[['parent_asin'] + cols], how='left', on='parent_asin')
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,1321885664000,14992,3272,Video Games,Amazon Basics Carrying Case for Nintendo - New...,[],"[Video Games, Legacy Systems, Nintendo Systems...",
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,1408233606000,14992,645,Computers,Logitech G402 Hyperion Fury FPS Gaming Mouse,[Logitech G402 Hyperion Fury FPS Gaming Mouse],"[Video Games, PC, Accessories, Gaming Mice]",
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,1511753174174,14992,4240,Video Games,Razer Wolverine Ultimate Officially Licensed X...,[Play anywhere with the Razer Wolverine Ultima...,"[Video Games, PC, Accessories, Controllers]",64.98
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,1531092820696,14992,459,Video Games,Turtle Beach Stealth 600 Wireless Surround Sou...,[The Turtle Beach Stealth 600 is the latest wi...,"[Video Games, PlayStation 4, Accessories, Head...",168.75
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B07DHNX18W,4.0,1604348335046,14992,4564,Computers,Razer Huntsman Elite Gaming Keyboard: Fast Key...,[Introduces the new Razer Opto-Mechanical swit...,"[Video Games, PC, Accessories, Gaming Keyboards]",219.99
...,...,...,...,...,...,...,...,...,...,...,...
170040,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B001T0HVGQ,4.0,1197406399000,2262,3471,Video Games,Xbox 360 Elite System Console Includes 120GB H...,"[Product Description, Xbox 360 sets a new pace...","[Video Games, Legacy Systems, Xbox Systems, Xb...",
170041,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B001EBBGTS,5.0,1204415647000,2262,1417,Video Games,Quake 4 - PC,"[Product Description, id Software’s QUAKE 4, d...","[Video Games, PC, Games]",32.99
170042,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B000OCXK6A,4.0,1208216992000,2262,1902,Video Games,Frontlines: Fuel of War - Xbox 360,[Frontlines: Fuel of War is an open-world Firs...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",20.8
170043,AG6SXILM23AQQDRKFDI3D3IPGXPQ,B001G6062E,4.0,1214532340000,2262,1221,Video Games,Prey - Xbox 360,"[From the Manufacturer, Tommy is a Cherokee ga...","[Video Games, Legacy Systems, Xbox Systems, Xb...",31.9


In [14]:
# Define preprocessing steps for each column
preprocessing_pipeline = ColumnTransformer(
    transformers=tfm,
    remainder='drop'  # Drop any columns not specified in transformers
)

# Create a pipeline object
item_metadata_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing_pipeline)
])

# Fit the pipeline
# Drop duplicated item so that the Pipeline only fit the unique item features
fit_df = train_features_df.drop_duplicates(subset=[args.item_col])
item_metadata_pipeline.fit(fit_df)

# Transform the data (useful for training)
transformed_item_metadata = item_metadata_pipeline.transform(train_features_df)

# For demonstration, print the shape of the transformed features and a few rows
print(f"Transformed Item Metadata Shape: {transformed_item_metadata.shape}")

Transformed Item Metadata Shape: (170045, 1155)


## Persist

In [30]:
import dill
import pickle

In [31]:
with open('../data/item_metadata_pipeline.dill', 'wb') as f:
    dill.dump(item_metadata_pipeline, f)

In [32]:
with open('../data/item_metadata_pipeline.dill', 'rb') as f:
    item_metadata_pipeline = dill.load(f)

# Item sequence

## Test implementation

In [17]:
from src.sequence_utils import generate_item_sequences

# Sample DataFrame
data = {
    'user_indices': [0, 0, 1, 1, 1],
    'item_indices': [0, 1, 2, 3, 4],
    'timestamp': [0, 1, 2, 3, 4],
    'ratings': [1, 4, 5, 3, 2]
}

df = pd.DataFrame(data)

# Generate the item sequences
df_with_sequences = generate_item_sequences(
    df,
    user_col='user_indices',
    item_col='item_indices',
    timestamp_col='timestamp',
    sequence_length=3,
    padding=True,
    padding_value=-1
)

df_with_sequences

Unnamed: 0,user_indices,item_indices,timestamp,ratings,item_sequence
0,0,0,0,1,"[-1.0, -1.0, -1.0]"
1,0,1,1,4,"[-1, -1, 0]"
2,1,2,2,5,"[-1.0, -1.0, -1.0]"
3,1,3,3,3,"[-1, -1, 2]"
4,1,4,4,2,"[-1, 2, 3]"


## Run with real data

In [18]:
full_df = pd.concat(
    [
        train_features_df.assign(source='train'),
        val_features_df.assign(source='val')
    ], axis=0
)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,source
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,1321885664000,14992,3272,Video Games,Amazon Basics Carrying Case for Nintendo - New...,[],"[Video Games, Legacy Systems, Nintendo Systems...",,train
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,1408233606000,14992,645,Computers,Logitech G402 Hyperion Fury FPS Gaming Mouse,[Logitech G402 Hyperion Fury FPS Gaming Mouse],"[Video Games, PC, Accessories, Gaming Mice]",,train
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,1511753174174,14992,4240,Video Games,Razer Wolverine Ultimate Officially Licensed X...,[Play anywhere with the Razer Wolverine Ultima...,"[Video Games, PC, Accessories, Controllers]",64.98,train
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,1531092820696,14992,459,Video Games,Turtle Beach Stealth 600 Wireless Surround Sou...,[The Turtle Beach Stealth 600 is the latest wi...,"[Video Games, PlayStation 4, Accessories, Head...",168.75,train
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B07DHNX18W,4.0,1604348335046,14992,4564,Computers,Razer Huntsman Elite Gaming Keyboard: Fast Key...,[Introduces the new Razer Opto-Mechanical swit...,"[Video Games, PC, Accessories, Gaming Keyboards]",219.99,train
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AELRDP5MCGSCANM6GWUXAMBN75LQ,B009AGXH64,5.0,1654280616536,18778,1711,Video Games,Nintendo Wii U Console - 32GB Black Deluxe Set,[Wii U is the next great gaming console from N...,"[Video Games, Legacy Systems, Nintendo Systems...",199.99,val
945,AFF5MP52H46DQM63YYLULLCEYAVQ,B08DF248LD,5.0,1630110810552,12516,647,Video Games,Xbox Core Wireless Controller – Carbon Black,[Experience the modernized design of the Xbox ...,[],45.5,val
946,AG25CXR2DXZV62WNVA46GAF2BL2Q,B08LT6PT1X,5.0,1638647139059,4409,946,Video Games,Xbox Elite Wireless Controller Series 2 – Black,[Experience the Xbox Elite Wireless Controller...,"[Video Games, Xbox One, Accessories, Controllers]",144.99,val
947,AEOY2365QPPEVDTOXL6N7ZA4NSAA,B00PDRZG9U,5.0,1628820275218,18863,2839,Video Games,Code Name: S.T.E.A.M.,"[Launch S.T.E.A.M., an elite team of steam-pow...","[Video Games, Legacy Systems, Nintendo Systems...",12.99,val


In [19]:
df_with_sequences = generate_item_sequences(
    full_df,
    user_col=args.user_col,
    item_col="item_indice",
    timestamp_col=args.timestamp_col,
    sequence_length=args.sequence_length,
    padding=True,
    padding_value=-1
)

In [20]:
with pd.option_context("display.max_colwidth", None):
    display(df_with_sequences[[args.user_col, 'item_indice', 'item_sequence']])

Unnamed: 0,user_id,item_indice,item_sequence
31239,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,1627,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
103639,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,4574,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
147159,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,2452,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
129526,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,1214,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]"
129527,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,847,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1214]"
...,...,...,...
49,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,1618,"[1137, 2285, 1696, 329, 3287, 1434, 3235, 3649, 450, 4560]"
240,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,116,"[-1, -1, -1, -1, -1, 320, 3342, 4354, 3523, 1377]"
661,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,1750,"[-1, -1, -1, 836, 1252, 364, 566, 483, 1590, 371]"
237,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,689,"[1623, 4019, 1097, 1481, 4197, 4292, 1602, 1304, 3268, 4345]"


In [24]:
user_id = df_with_sequences.sample(n=1)[args.user_col].values[0]

(
    df_with_sequences
    .loc[lambda df: df[args.user_col].eq(user_id)]
    .sort_values(args.timestamp_col)
    [[args.user_col, args.timestamp_col, 'item_indice', 'item_sequence']]
    .tail(10)
)

Unnamed: 0,user_id,timestamp,item_indice,item_sequence
131124,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1134162372000,3549,"[-1, -1, -1, -1, -1, -1, -1, -1, 3005, 652]"
131125,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1134163239000,4584,"[-1, -1, -1, -1, -1, -1, -1, 3005, 652, 3549]"
131126,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1134172561000,1782,"[-1, -1, -1, -1, -1, -1, 3005, 652, 3549, 4584]"
131127,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1134173444000,464,"[-1, -1, -1, -1, -1, 3005, 652, 3549, 4584, 1782]"
131128,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1134307550000,3098,"[-1, -1, -1, -1, 3005, 652, 3549, 4584, 1782, ..."
131129,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1136684319000,960,"[-1, -1, -1, 3005, 652, 3549, 4584, 1782, 464,..."
131130,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1136684878000,4243,"[-1, -1, 3005, 652, 3549, 4584, 1782, 464, 309..."
131131,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1136685475000,4079,"[-1, 3005, 652, 3549, 4584, 1782, 464, 3098, 9..."
131132,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1136686380000,2150,"[3005, 652, 3549, 4584, 1782, 464, 3098, 960, ..."
131133,AHLQFTLALRCJOOAFTH5GBMDJLAKA,1143909764000,1567,"[652, 3549, 4584, 1782, 464, 3098, 960, 4243, ..."


# Persist

In [25]:
train_features_df = df_with_sequences.loc[lambda df: df['source'].eq('train')].drop(columns=['source'])
val_features_df = df_with_sequences.loc[lambda df: df['source'].eq('val')].drop(columns=['source'])

In [26]:
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
31239,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,9372,1627,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
103639,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,10047,4574,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147159,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,18020,2452,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
129526,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,10065,1214,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
129527,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,10065,847,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1214]"
...,...,...,...,...,...,...,...,...,...,...,...,...
69639,AHBJMDNN464YPLDJI63F4AUHFTRA,B001UQ7042,5.0,1628618305674,10059,4102,Video Games,Madden NFL 10 - Playstation 3,"[Product Description, Every Sunday in the NFL,...","[Video Games, Legacy Systems, PlayStation Syst...",19.99,"[-1, -1, -1, -1, -1, -1, 188, 3601, 364, 3576]"
96825,AEIOF6FTMTA5EV5GFUPWIUFSXYQA,B08MBMWTYR,5.0,1628619099125,14713,908,Video Games,Rune Factory 4 - Nintendo 3DS,[Rune Factory 4 marks the return of the popula...,"[Video Games, Legacy Systems, Nintendo Systems...",59.88,"[-1, -1, -1, -1, 2480, 2000, 1125, 338, 4483, ..."
161827,AEIO4SZ4VYDYULGKNW2DBUJF67RA,B0C37RBK2R,4.0,1628629065982,6800,2187,Video Games,Xbox Series S,"[Introducing the Xbox Series S, the smallest, ...",[],279.0,"[-1, -1, -1, -1, -1, 4299, 312, 586, 2941, 755]"
161963,AHYYZI32KE37AI3HVZL7XL4RFYPA,B0B1PB5L93,5.0,1628640617102,4220,1750,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, -1, 3208, 2664, 858, 2158, 3610, ..."


In [27]:
val_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
620,AEN7JFLQCURF54WR5OHY7HOWWMSQ,B08FC5TTBF,5.0,1628644724721,20335,1588,Video Games,Demon's Souls - PlayStation 5,[From Bluepoint Games comes a remake of the Pl...,"[Video Games, PlayStation 5, Games]",29.99,"[-1, -1, -1, -1, 4081, 1204, 2257, 4664, 1769,..."
843,AELH2ZF5QSSIFBF6WXAZLCF7JIWA,B0C6DH316S,2.0,1628653733506,17960,2156,Computers,Logitech G PRO X Wireless Lightspeed Gaming He...,[],"[Video Games, PC, Accessories, Headsets]",253.82,"[-1, -1, -1, -1, 2898, 671, 4036, 3391, 1374, ..."
210,AGD4QHNPSC45XTUPSUE6TYQOF3WQ,B0BN5DC36N,5.0,1628679010802,4370,926,Computers,Seagate Horizon Forbidden West Limited Edition...,[Discover new worlds with the officially-licen...,"[Video Games, Legacy Systems, PlayStation Syst...",89.99,"[4367, 2349, 4045, 873, 214, 1157, 862, 4059, ..."
514,AFMOSTKHH2HFLI35E3YMI7GLYDCQ,B07KRWJCQW,5.0,1628687441776,6631,3642,Video Games,$40 Xbox Gift Card [Digital Code],[Buy an Xbox Gift Card for yourself or a frien...,"[Video Games, Online Game Services, Xbox Live,...",40.0,"[-1, -1, 1227, 218, 2777, 2286, 4345, 2093, 33..."
938,AGK34QNFABMBLRESDKG2VRC3VIIQ,B0BL65X86R,5.0,1628702768435,19461,4205,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,"[1026, 1445, 1600, 4195, 2094, 2927, 4619, 186..."
...,...,...,...,...,...,...,...,...,...,...,...,...
49,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,14453,1618,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[1137, 2285, 1696, 329, 3287, 1434, 3235, 3649..."
240,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,19653,116,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 320, 3342, 4354, 3523, 1377]"
661,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,2970,1750,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 836, 1252, 364, 566, 483, 1590, 371]"
237,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,852,689,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[1623, 4019, 1097, 1481, 4197, 4292, 1602, 130..."


In [28]:
train_features_df.to_parquet("../data/train_features.parquet", index=False)
val_features_df.to_parquet("../data/val_features.parquet", index=False)