# Negative sampling

More label data can tremendously help model to learn. In this notebook we would prepare negative samples for each user based on the unseen items. The unseen items are sampled based on how frequent they appear in the training dataset. This is an intentional choice to make the negative sample harder, hence potentially more useful. This would also force the model learn relevant patterns about user behaviors rather than biased by popularity.

# Set up

In [1]:
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.data_prep_utils import handle_dtypes, parse_dt
from src.id_mapper import IDMapper
from src.sequence.negative_sampling import (
    add_features_to_neg_df,
    generate_negative_samples,
)

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "000-sequence-modeling"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sequence-modeling",
  "notebook_persist_dp": "/Users/dvq/frostmourne/recsys-mvp/notebooks/data/000-sequence-modeling",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

In [4]:
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
neg_df = generate_negative_samples(df)
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", "timestamp")

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
neg_df.sort_values(["user_indice", args.rating_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,106,0,2
2,1,105,0,4
3,2,103,0,1
4,2,106,0,2
5,3,103,0,1
6,3,101,0,5


# Load data

In [7]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")
idm = IDMapper().load("../data/idm.json")

In [8]:
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-10-20 00:33:17.478[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=1628630904765[0m


In [9]:
full_df = pd.concat([train_df, val_df], axis=0).pipe(parse_dt).pipe(handle_dtypes)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,468,394,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,468,3760,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,468,2164,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,468,723,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394..."
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,468,3011,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394.0, 37..."
...,...,...,...,...,...,...,...,...,...,...,...,...
957,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,9989,853,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,"[741, 4427, 664, 2712, 2086, 2299, 949, 611, 4..."
958,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,9989,4431,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,"[4427, 664, 2712, 2086, 2299, 949, 611, 4352, ..."
959,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,9989,3222,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,"[664, 2712, 2086, 2299, 949, 611, 4352, 1900, ..."
960,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,4179,1645,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,"[4317, 2926, 4329, 4188, 1203, 4197, 4447, 154..."


# Load item sequence from Feature Store

## Test load

In [10]:
user_id = full_df[args.user_col].iloc[0]
logger.info(f"Test load with {user_id=}")

entity_df = pd.DataFrame.from_dict(
    {
        "user_id": [user_id],
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
        ],
    }
)

features = [
    "user_rating_stats:user_rating_cnt_90d",
    "user_rating_stats:user_rating_avg_prev_rating_90d",
    "user_rating_stats:user_rating_list_10_recent_asin",
]

store = FeatureStore(
    repo_path="..", fs_yaml_file="../feature_store_offline_server.yaml"
)
training_df = store.get_historical_features(entity_df, features).to_df()
training_df

[32m2024-10-20 00:33:17.534[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest load with user_id='AE225O22SA7DLBOGOEIFL7FT5VYQ'[0m


Unnamed: 0,user_id,event_timestamp,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2021-04-12 10:59:42,2,1.0,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."


In [11]:
entity_df

Unnamed: 0,user_id,event_timestamp
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2021-04-12 10:59:42


## Real load

In [12]:
%%time
features_df = store.get_historical_features(full_df[[args.user_col, args.timestamp_col]].drop_duplicates(), features).to_df()
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
CPU times: user 75.7 ms, sys: 48.2 ms, total: 124 ms
Wall time: 11.9 s


In [13]:
full_features_df = pd.merge(
    full_df, features_df, on=[args.user_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,468,394,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,468,3760,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",2,5.0,B0006B7DXA
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,468,2164,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,"B0006B7DXA,B001LETH2Q"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,468,723,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,468,3011,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394.0, 37...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,9989,853,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,"[741, 4427, 664, 2712, 2086, 2299, 949, 611, 4...",1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,9989,4431,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,"[4427, 664, 2712, 2086, 2299, 949, 611, 4352, ...",2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,9989,3222,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,"[664, 2712, 2086, 2299, 949, 611, 4352, 1900, ...",3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,4179,1645,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,"[4317, 2926, 4329, 4188, 1203, 4197, 4447, 154...",1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0..."


In [14]:
logger.info(f"Eye-balling if the features are correct...")
full_features_df.loc[lambda df: df[args.user_col].eq(user_id)].sort_values(
    args.timestamp_col
)[
    [
        args.user_col,
        args.timestamp_col,
        args.item_col,
        "item_sequence",
        "user_rating_list_10_recent_asin",
    ]
]

[32m2024-10-20 00:33:30.069[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mEye-balling if the features are correct...[0m


Unnamed: 0,user_id,timestamp,parent_asin,item_sequence,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2004-09-14 03:38:36,B0006B7DXA,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,2004-10-13 23:01:27,B001LETH2Q,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",B0006B7DXA
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,2005-08-26 21:05:52,B0009XEC02,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","B0006B7DXA,B001LETH2Q"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,2007-04-13 21:47:03,B000NNDN1M,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394...","B0006B7DXA,B001LETH2Q,B0009XEC02"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,2008-08-13 18:33:22,B00136MBHA,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394.0, 37...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M"
5,AE225O22SA7DLBOGOEIFL7FT5VYQ,2013-03-17 23:51:07,B007VTVRFA,"[-1.0, -1.0, -1.0, -1.0, -1.0, 394.0, 3760.0, ...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."
6,AE225O22SA7DLBOGOEIFL7FT5VYQ,2013-06-09 21:07:26,B0053BCML6,"[-1.0, -1.0, -1.0, -1.0, 394.0, 3760.0, 2164.0...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."


In [15]:
def convert_asin_to_idx(inp: str, sequence_length=10, padding_value=-1):
    if inp is None:
        return [padding_value] * sequence_length
    asins = inp.split(",")
    indices = [idm.get_item_index(item_id) for item_id in asins]
    padding_needed = sequence_length - len(indices)
    output = np.pad(
        indices,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )
    return output

In [16]:
full_features_df = full_features_df.assign(
    item_sequence_fs=lambda df: df["user_rating_list_10_recent_asin"].apply(
        convert_asin_to_idx
    )
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence_fs
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,468,394,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,468,3760,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",2,5.0,B0006B7DXA,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 394]"
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,468,2164,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,"B0006B7DXA,B001LETH2Q","[-1, -1, -1, -1, -1, -1, -1, -1, 394, 3760]"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,468,723,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02","[-1, -1, -1, -1, -1, -1, -1, 394, 3760, 2164]"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,468,3011,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394.0, 37...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M","[-1, -1, -1, -1, -1, -1, 394, 3760, 2164, 723]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,9989,853,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,"[741, 4427, 664, 2712, 2086, 2299, 949, 611, 4...",1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...","[741, 4427, 664, 2712, 2086, 2299, 949, 611, 4..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,9989,4431,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,"[4427, 664, 2712, 2086, 2299, 949, 611, 4352, ...",2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...","[4427, 664, 2712, 2086, 2299, 949, 611, 4352, ..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,9989,3222,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,"[664, 2712, 2086, 2299, 949, 611, 4352, 1900, ...",3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...","[664, 2712, 2086, 2299, 949, 611, 4352, 1900, ..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,4179,1645,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,"[4317, 2926, 4329, 4188, 1203, 4197, 4447, 154...",1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...","[4317, 2926, 4329, 4188, 1203, 4197, 4447, 154..."


In [17]:
# There are cases that item_sequence transformed by Python is different than implemented SQL due to a bit of randomness when it comes to duplicated timestamp sampling
# Anyway we can ignore them if there are not many for now, the effort invested to solve this problem is deemed unworthy at the moment.
MISMATCH_THRESHOLD = 0.01


def check_item_sequence_wrong():
    return full_features_df["item_sequence"].apply(sum) != full_features_df[
        "item_sequence_fs"
    ].apply(sum)


assert (
    check_item_sequence_wrong().sum() <= len(full_features_df) * MISMATCH_THRESHOLD
), display(full_features_df.loc[check_item_sequence_wrong()].sample(1))

In [18]:
# This snipppet examines the cases where there are mismatch (look for match column = False)
if check_item_sequence_wrong().sum() > 0:
    user_id = full_features_df.loc[check_item_sequence_wrong()].sample(1)[args.user_col]
    with pd.option_context("display.max_colwidth", None):
        display(
            full_features_df.loc[lambda df: df[args.user_col].eq(user_id)]
            .sort_values(args.timestamp_col)[
                [
                    args.timestamp_col,
                    args.item_col,
                    "item_indice",
                    "item_sequence",
                    "item_sequence_fs",
                ]
            ]
            .assign(
                match=lambda df: df["item_sequence"].apply(sum)
                == df["item_sequence_fs"].apply(sum)
            )
        )

Unnamed: 0,timestamp,parent_asin,item_indice,item_sequence,item_sequence_fs,match
16804,2017-04-11 17:47:02,B00DJRLDMU,2839,"[3091.0, 2339.0, 2960.0, 3250.0, 81.0, 3104.0, 2797.0, 2442.0, 3826.0, 2607.0]","[591, 2339, 2960, 3250, 81, 3104, 2797, 2442, 3826, 2607]",False


In [19]:
# Drop the Python impl. of item_sequence to use Feature Store instead
full_features_df = full_features_df.drop(columns=["item_sequence"]).rename(
    columns={"item_sequence_fs": "item_sequence"}
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,468,394,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,468,3760,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,2,5.0,B0006B7DXA,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 394]"
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,468,2164,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,1,,"B0006B7DXA,B001LETH2Q","[-1, -1, -1, -1, -1, -1, -1, -1, 394, 3760]"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,468,723,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02","[-1, -1, -1, -1, -1, -1, -1, 394, 3760, 2164]"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,468,3011,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M","[-1, -1, -1, -1, -1, -1, 394, 3760, 2164, 723]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,9989,853,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...","[741, 4427, 664, 2712, 2086, 2299, 949, 611, 4..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,9989,4431,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...","[4427, 664, 2712, 2086, 2299, 949, 611, 4352, ..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,9989,3222,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...","[664, 2712, 2086, 2299, 949, 611, 4352, 1900, ..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,4179,1645,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...","[4317, 2926, 4329, 4188, 1203, 4197, 4447, 154..."


# Generate negative samples

In [20]:
features = [
    "item_sequence",
    "user_id",
    "user_rating_cnt_90d",
    "user_rating_avg_prev_rating_90d",
    "user_rating_list_10_recent_asin",
]

neg_df = generate_negative_samples(
    full_features_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
neg_ts_df = add_features_to_neg_df(
    full_features_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/19578 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,0,1088,0,2010-10-07 09:52:14.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AEZW74DBPW64H5IYA22DX7U7YRIQ,1,,
1,0,176,0,2016-05-18 01:03:39.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2410]",AEZW74DBPW64H5IYA22DX7U7YRIQ,1,,B001QCWRZC
2,0,3117,0,2016-07-24 13:50:13.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 2410, 3549]",AEZW74DBPW64H5IYA22DX7U7YRIQ,2,5.0,"B001QCWRZC,B00O65I2VY"
3,0,165,0,2017-05-12 18:25:01.000,"[-1, -1, -1, -1, -1, -1, -1, 2410, 3549, 4622]",AEZW74DBPW64H5IYA22DX7U7YRIQ,1,,"B001QCWRZC,B00O65I2VY,B017W1771Y"
4,0,467,0,2017-07-29 02:24:43.038,"[-1, -1, -1, -1, -1, -1, 2410, 3549, 4622, 3887]",AEZW74DBPW64H5IYA22DX7U7YRIQ,2,5.0,"B001QCWRZC,B00O65I2VY,B017W1771Y,B01J4K0IW6"
...,...,...,...,...,...,...,...,...,...
165255,19577,4562,0,2003-03-20 20:35:05.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,1,,
165256,19577,3342,0,2003-03-30 21:53:59.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 716]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,2,5.0,B000035Y6N
165257,19577,4163,0,2003-04-06 18:55:50.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 716, 4137]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,3,5.0,"B000035Y6N,B00005Q8LY"
165258,19577,1588,0,2003-04-27 23:33:03.000,"[-1, -1, -1, -1, -1, -1, -1, 716, 4137, 673]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,4,5.0,"B000035Y6N,B00005Q8LY,B00000JRSB"


# Join with features

Populate the feature values for the newly created negative samples

In [21]:
# Item features

not_item_feature_cols = (
    args.user_col,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    *features,
)
item_features = [
    col for col in full_features_df.columns if col not in not_item_feature_cols
]
item_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [22]:
neg_ts_df = pd.merge(
    neg_ts_df,
    full_features_df[["item_indice", *item_features]].drop_duplicates(
        subset=["item_indice"]
    ),
    how="left",
    on=["item_indice"],
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,parent_asin,main_category,title,description,categories,price
0,0,1088,0,2010-10-07 09:52:14.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AEZW74DBPW64H5IYA22DX7U7YRIQ,1,,,B001EYUQVE,,Call of Duty: World at War Final Fronts - Play...,"[Product Description, Developed exclusively fo...","[Video Games, Legacy Systems, PlayStation Syst...",26.86
1,0,176,0,2016-05-18 01:03:39.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2410]",AEZW74DBPW64H5IYA22DX7U7YRIQ,1,,B001QCWRZC,B00Z9TMH1W,Video Games,Madden NFL 17 - Deluxe Edition - Xbox One,[30 Ranked Tickets/10 All Pro Packs. Delivery ...,"[Video Games, Xbox One]",33.98
2,0,3117,0,2016-07-24 13:50:13.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 2410, 3549]",AEZW74DBPW64H5IYA22DX7U7YRIQ,2,5.0,"B001QCWRZC,B00O65I2VY",B00K0NV5J2,Video Games,Dragon Age: Inquisition - Jaws Of Hakkon (Cros...,"[The first story-based expansion to, Dragon Ag...","[Video Games, Game Genre of the Month]",
3,0,165,0,2017-05-12 18:25:01.000,"[-1, -1, -1, -1, -1, -1, -1, 2410, 3549, 4622]",AEZW74DBPW64H5IYA22DX7U7YRIQ,1,,"B001QCWRZC,B00O65I2VY,B017W1771Y",B00F27JGVA,Computers,PlayStation Vita Memory Card 64GB (PCH-Z641J),[This is a memory card for Play Station Vita t...,"[Video Games, Legacy Systems, PlayStation Syst...",251.7
4,0,467,0,2017-07-29 02:24:43.038,"[-1, -1, -1, -1, -1, -1, 2410, 3549, 4622, 3887]",AEZW74DBPW64H5IYA22DX7U7YRIQ,2,5.0,"B001QCWRZC,B00O65I2VY,B017W1771Y,B01J4K0IW6",B00Z9TM72Q,Video Games,Call of Duty: Infinite Warfare - PS4 Legacy Ed...,"[*The DLC (Downloadable Content), Trials/Subsc...","[Video Games, PlayStation 4, Games]",23.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,19577,4562,0,2003-03-20 20:35:05.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,1,,,B016XBGWAQ,Video Games,Steam Link,[Ooma Butterfleye smart Security Camera with b...,"[Video Games, PC]",145.0
165256,19577,3342,0,2003-03-30 21:53:59.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 716]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,2,5.0,B000035Y6N,B00HVBPRUO,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",
165257,19577,4163,0,2003-04-06 18:55:50.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 716, 4137]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,3,5.0,"B000035Y6N,B00005Q8LY",B087NNPYP3,Video Games,The Legend of Zelda: Breath of the Wild Master...,"[Woken from a 100-year slumber, Link must expl...","[Video Games, Nintendo Switch, Consoles]",
165258,19577,1588,0,2003-04-27 23:33:03.000,"[-1, -1, -1, -1, -1, -1, -1, 716, 4137, 673]",AFUWCAWB4SBVQWQ2MPVKVPPNHGBA,4,5.0,"B000035Y6N,B00005Q8LY,B00000JRSB",B004Q8L46G,Video Games,Dirt 3 - Playstation 3,"[Product Description, Dirt 3 (PS3), Amazon.com...","[Video Games, Legacy Systems, PlayStation Syst...",47.53


In [23]:
item = neg_ts_df.sample(n=1)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda df: df[args.item_col].eq(item)]["item_indice"]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_features_df.loc[lambda df: df[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"
assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

[32m2024-10-20 00:33:43.589[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B0B99ZJCNC...[0m


# Concating positive data with negative samples

In [24]:
full_features_df = pd.concat([full_features_df, neg_ts_df], axis=0).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [25]:
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
86292,AFDBCNCWNMRNRTMZLCJYAH25ZI2A,B08N7QBVBJ,0.0,2010-12-14 18:13:27.000,10157,681,Video Games,PowerA Charging Stand for Xbox One - White,[An Xbox wireless Controller looks great on th...,"[Video Games, Xbox One, Accessories, Batteries...",13.57,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
159781,AERBGYRVU2NO24B5CNSMGDSCLD3Q,B004JLO65Q,0.0,2016-11-26 05:51:29.000,18922,3515,Video Games,Nintendo Official Executive Case for 3DS,[Carry your Nintendo 3DS system in style with ...,"[Video Games, Legacy Systems, Nintendo Systems...",,1,,"B0044R8X9U,B000P297JS","[-1, -1, -1, -1, -1, -1, -1, -1, 4206, 4417]"
127826,AEXNTHZMDXE4GL2ZKHLMOZWEHUNA,B01B62OSTE,0.0,2018-10-17 22:02:48.892,15120,3587,Video Games,Turtle Beach - Ear Force Elite 800 - Premium F...,[Turtle Beach’s Elite 800 isn’t your ordinary ...,"[Video Games, PlayStation 4, Accessories, Head...",123.24,1,,"B0036F0V4G,B003HGGN82,B0088MVPFQ,B007CM0K86,B0...","[-1, 921, 1011, 4330, 2417, 3362, 1859, 1614, ..."
52027,AFC5XTCF5D7J3NSDITB2Z26XWWYA,B001E8WQUY,5.0,2019-05-01 21:22:39.265,4316,229,Video Games,Rock Band 2 - Nintendo Wii (Game only),"[Product description, Rock Band 2 lets you and...","[Video Games, Legacy Systems, Nintendo Systems...",28.49,1,,"B006HZA6VK,B0BN2FNKLM,B0086VPUHI,B0040UAYI4,B0...","[550, 3643, 464, 400, 1177, 997, 4585, 440, 90..."
48913,AF7LJQOIWF3Y3YD7SGOJ34MA5JPA,B001E8WQKY,5.0,2015-01-09 12:53:25.000,13887,2028,Video Games,Resident Evil 5 - Xbox 360,[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",29.88,3,5.0,"B00A2ML6XG,B003VUO6LU","[-1, -1, -1, -1, -1, -1, -1, -1, 1378, 303]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85700,AF2UEE65LG6WPCRSIP3UBGZLN7EQ,B00XBLQCLQ,0.0,2012-10-01 06:46:28.000,10087,3752,Video Games,Assassin’s Creed Syndicate - Gold Edition | PC...,"[London, 1868. The Industrial Revolution unlea...","[Video Games, PC, Games]",66.35,2,3.0,"B001EYUQVE,B001ELJFGO","[-1, -1, -1, -1, -1, -1, -1, -1, 1088, 309]"
51798,AH2RSPTE3H6XPONAC7XHIXFHE4IA,B002BSA298,0.0,2017-10-20 12:55:58.546,6088,2775,Video Games,Kinect Sensor with Kinect Adventures!,"[Product Description, Kinect for Xbox 360 brin...","[Video Games, Legacy Systems, Xbox Systems, Xb...",88.0,2,5.0,"B0118GJKIW,B00X3EDHZU,B004HO6CQG,B073CFJG46","[-1, -1, -1, -1, -1, -1, 3679, 3668, 2576, 3818]"
158208,AFKEXMJWTFZMBO7QF6OFI4AD2B5A,B087LSSNG1,0.0,2020-02-16 21:01:48.099,18733,611,Video Games,Xenoblade Chronicles: Definitive Edition - Nin...,[Discover the origins of Shulk as he and his c...,"[Video Games, Nintendo Switch, Games]",54.98,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
132003,AHAVA5VKMJ3OMOLGDZ3W45CKXEWA,B00KTORA0K,5.0,2019-05-25 04:03:51.505,17876,1189,Video Games,Just Dance 2015 - Wii,[With more than 50 million copies of Just Danc...,"[Video Games, Legacy Systems, Nintendo Systems...",33.0,2,5.0,"B004AYCNR0,B007NUQICE,B000TYQL1O,B000SEU92W,B0...","[-1, -1, -1, 2229, 4467, 4566, 262, 3157, 1682..."


In [26]:
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
assert (
    full_features_df[key_cols].isna().sum().sum() == 0
), "Null values found at key colums"

In [27]:
val_timestamp

1628630904765

# Split back train test

In [28]:
to_unix_ts = lambda s: s.astype("int64") // 10**6
train_neg_df = full_features_df.loc[
    lambda df: df[args.timestamp_col].pipe(to_unix_ts).lt(val_timestamp)
]
val_neg_df = full_features_df.loc[
    lambda df: df[args.timestamp_col].pipe(to_unix_ts).ge(val_timestamp)
]

In [29]:
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
86292,AFDBCNCWNMRNRTMZLCJYAH25ZI2A,B08N7QBVBJ,0.0,2010-12-14 18:13:27.000,10157,681,Video Games,PowerA Charging Stand for Xbox One - White,[An Xbox wireless Controller looks great on th...,"[Video Games, Xbox One, Accessories, Batteries...",13.57,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
159781,AERBGYRVU2NO24B5CNSMGDSCLD3Q,B004JLO65Q,0.0,2016-11-26 05:51:29.000,18922,3515,Video Games,Nintendo Official Executive Case for 3DS,[Carry your Nintendo 3DS system in style with ...,"[Video Games, Legacy Systems, Nintendo Systems...",,1,,"B0044R8X9U,B000P297JS","[-1, -1, -1, -1, -1, -1, -1, -1, 4206, 4417]"
127826,AEXNTHZMDXE4GL2ZKHLMOZWEHUNA,B01B62OSTE,0.0,2018-10-17 22:02:48.892,15120,3587,Video Games,Turtle Beach - Ear Force Elite 800 - Premium F...,[Turtle Beach’s Elite 800 isn’t your ordinary ...,"[Video Games, PlayStation 4, Accessories, Head...",123.24,1,,"B0036F0V4G,B003HGGN82,B0088MVPFQ,B007CM0K86,B0...","[-1, 921, 1011, 4330, 2417, 3362, 1859, 1614, ..."
52027,AFC5XTCF5D7J3NSDITB2Z26XWWYA,B001E8WQUY,5.0,2019-05-01 21:22:39.265,4316,229,Video Games,Rock Band 2 - Nintendo Wii (Game only),"[Product description, Rock Band 2 lets you and...","[Video Games, Legacy Systems, Nintendo Systems...",28.49,1,,"B006HZA6VK,B0BN2FNKLM,B0086VPUHI,B0040UAYI4,B0...","[550, 3643, 464, 400, 1177, 997, 4585, 440, 90..."
48913,AF7LJQOIWF3Y3YD7SGOJ34MA5JPA,B001E8WQKY,5.0,2015-01-09 12:53:25.000,13887,2028,Video Games,Resident Evil 5 - Xbox 360,[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",29.88,3,5.0,"B00A2ML6XG,B003VUO6LU","[-1, -1, -1, -1, -1, -1, -1, -1, 1378, 303]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85700,AF2UEE65LG6WPCRSIP3UBGZLN7EQ,B00XBLQCLQ,0.0,2012-10-01 06:46:28.000,10087,3752,Video Games,Assassin’s Creed Syndicate - Gold Edition | PC...,"[London, 1868. The Industrial Revolution unlea...","[Video Games, PC, Games]",66.35,2,3.0,"B001EYUQVE,B001ELJFGO","[-1, -1, -1, -1, -1, -1, -1, -1, 1088, 309]"
51798,AH2RSPTE3H6XPONAC7XHIXFHE4IA,B002BSA298,0.0,2017-10-20 12:55:58.546,6088,2775,Video Games,Kinect Sensor with Kinect Adventures!,"[Product Description, Kinect for Xbox 360 brin...","[Video Games, Legacy Systems, Xbox Systems, Xb...",88.0,2,5.0,"B0118GJKIW,B00X3EDHZU,B004HO6CQG,B073CFJG46","[-1, -1, -1, -1, -1, -1, 3679, 3668, 2576, 3818]"
158208,AFKEXMJWTFZMBO7QF6OFI4AD2B5A,B087LSSNG1,0.0,2020-02-16 21:01:48.099,18733,611,Video Games,Xenoblade Chronicles: Definitive Edition - Nin...,[Discover the origins of Shulk as he and his c...,"[Video Games, Nintendo Switch, Games]",54.98,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
132003,AHAVA5VKMJ3OMOLGDZ3W45CKXEWA,B00KTORA0K,5.0,2019-05-25 04:03:51.505,17876,1189,Video Games,Just Dance 2015 - Wii,[With more than 50 million copies of Just Danc...,"[Video Games, Legacy Systems, Nintendo Systems...",33.0,2,5.0,"B004AYCNR0,B007NUQICE,B000TYQL1O,B000SEU92W,B0...","[-1, -1, -1, 2229, 4467, 4566, 262, 3157, 1682..."


In [30]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
102273,AGOQLXJZU2MI36EGHFLLSAOP76AQ,B001ELJE7E,0.0,2022-01-10 17:06:18.183,12077,3580,Video Games,Half-Life 2: Game of the Year Edition - PC,[Half-Life 2: Game of the Year Edition compile...,"[Video Games, PC, Games]",77.99,1,,"B0753D38RB,B0B8RG61GK,B07V8YSBFG,B01N3ASPNV,B0...","[-1, -1, -1, -1, 4321, 3836, 3545, 3237, 1875,..."
165059,AHAQV7A4Z2Z5NJLIPMEPPXDNIRWQ,B07VHHPJBV,2.0,2022-04-10 16:30:15.820,3479,200,Computers,"Redragon M602 Griffin RGB Gaming Mouse, RGB Sp...",[Redragon M602 (White) GRIFFIN High-Precision ...,"[Video Games, PC, Accessories, Gaming Mice]",19.99,1,,"B00AEWTSWU,B003ELPQJM,B0111ZC4OC,B00E4MQODC,B0...","[-1, -1, -1, -1, -1, 3100, 4613, 2281, 1597, 3..."
164871,AGHUHJSJLGWPS3JE2FD7D5GJODWQ,B00DBDPOZ4,5.0,2022-01-16 16:27:51.164,10384,276,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,1,,"B00006IR62,B0B4CRTWGM,B000TYQL1O,9629971372,B0...","[-1, -1, -1, 4345, 70, 4566, 1501, 2224, 782, ..."
87436,AGZXEYY46TLSAJSH3CU427CWDZLQ,B001EYUO4I,0.0,2022-02-08 20:12:20.651,10293,3209,Video Games,The Sims 2: Apartment Life Expansion Pack,"[Product Description, Your Sims are moving int...","[Video Games, PC, Games]",53.88,1,,"B07YBXFDYN,B00I9TNNE4,B07CTHLS2L,B07HYY6HNW,B0...","[-1, -1, -1, -1, 4611, 2751, 1126, 1360, 1822,..."
41370,AHMTAKZP4ZMEMMHXPZO3IX6HESCA,B0771ZXXV6,0.0,2021-12-04 22:39:03.674,4891,2163,Video Games,Nintendo Joy-Con (R) - Neon Red - Nintendo Switch,[To be determined],"[Video Games, Nintendo Switch, Accessories, Co...",,2,3.0,"B006VO0QUI,B000BKSGH8,B00QF9A0SC,B08Y8V7QKM,B0...","[-1, 4324, 2425, 753, 2066, 2000, 3247, 3441, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164950,AGRGN2RA4EOW3T4GFI4H76WHUGSA,B07JK9DFKH,5.0,2022-02-12 04:30:57.128,8217,4457,Cell Phones & Accessories,REAKA Upgraded USB Type C to HDMI Digital AV M...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",22.99,1,,"B00PDRZG9U,B0728CRKP2,B073SQKLGW,B01K1OO5PU,B0...","[198, 4263, 1488, 2438, 2092, 753, 3075, 4482,..."
164643,AFL2OB53OGEIQCA4CMFCTQV3FJJA,B00BCX2AZW,5.0,2022-01-13 15:10:43.458,8196,1857,Video Games,Monster Hunter 3 Ultimate - Nintendo Wii U,[The popular fantasy action franchise Monster ...,"[Video Games, Legacy Systems, Nintendo Systems...",54.35,1,,"B08CL92HPG,B01N2V26CN,B002I0K3CK,B00DC7G2W8,B0...","[-1, -1, -1, -1, -1, 2289, 733, 1022, 1207, 4288]"
46154,AE4XPHPRPYAX4WQ7MU5ILOPO36UQ,B00F4CDIR6,0.0,2022-04-12 17:40:37.853,5435,2228,Video Games,Xbox $5 Gift Card - Xbox 360 Digital Code,"[With an Xbox LIVE Gold membership, take your ...","[Video Games, Online Game Services, Xbox Live,...",,1,,"B000095ZH5,B00JL6N3EO,B00NEU02JW,B01M6CV5IF,B0...","[-1, -1, -1, 3822, 3252, 2325, 2349, 2343, 958..."
2337,AHNBPSRKOZIXT7WBRTJXTFQW4NXQ,B07RQP5S8C,0.0,2021-10-27 17:00:49.666,285,2294,Cell Phones & Accessories,"Lamicall Adjustable Phone Tablet Stand, Playst...",[],"[Video Games, Nintendo Switch, Accessories, Mo...",15.99,1,,"B07L3D7C21,B08D3XL1KF,B07HHW8C4V,B07V5CFMY4,B0...","[4089, 861, 2908, 679, 134, 1462, 3827, 1846, ..."


# Checks

In [31]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2
), "Number of pos and neg samples are not equal"

[32m2024-10-20 00:33:47.742[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AGHZW6SSXNPOIZ77QGRXILPT6IJQ...[0m


In [32]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert (
    train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5
), f"Item {item} does not appear much in training data"

[32m2024-10-20 00:33:48.856[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B0039O6KZS...[0m


In [39]:
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
100114,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B07TL6L7MY,5.0,2020-08-27 20:42:05.426,12899,1353,All Electronics,HD Retrovision Wii/Wii U Premium YPbPr Compone...,[Use these high-quality cables to connect your...,"[Video Games, Legacy Systems, Nintendo Systems...",29.99,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
108904,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B002JTX9XA,0.0,2020-08-27 20:42:05.426,12899,2969,Video Games,Teenage Mutant Ninja Turtles: Smash Up,"[Product description, Features include: •Fight...","[Video Games, Legacy Systems, Nintendo Systems...",30.0,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
100115,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B07H3F94ZN,5.0,2020-08-27 20:43:27.335,12899,4432,All Electronics,HD Retrovision PlayStation 2/3 (PS2/PS3) Premi...,[Use these high-quality cables to connect your...,"[Video Games, Legacy Systems, PlayStation Syst...",29.99,2,5.0,B07TL6L7MY,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1353]"
108905,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B00DV16N6U,0.0,2020-08-27 20:43:27.335,12899,3617,Video Games,Ratchet and Clank: Into the Nexus - PS3,"[Product Description, Ratchet and Clank are ba...","[Video Games, Legacy Systems, PlayStation Syst...",38.88,2,5.0,B07TL6L7MY,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1353]"
108906,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B003JVKHEQ,0.0,2021-02-26 03:52:32.431,12899,444,Video Games,Call of Duty: Black Ops - Xbox 360,"[Product Description, The newest installment i...","[Video Games, Legacy Systems, Xbox Systems, Xb...",37.89,1,,"B07TL6L7MY,B07H3F94ZN","[-1, -1, -1, -1, -1, -1, -1, -1, 1353, 4432]"
100116,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B09ZTV42CQ,5.0,2021-02-26 03:52:32.431,12899,1485,Video Games,8Bitdo Adapter 2 USB Wireless Switch Controlle...,[],"[Video Games, Nintendo Switch]",19.99,1,,"B07TL6L7MY,B07H3F94ZN","[-1, -1, -1, -1, -1, -1, -1, -1, 1353, 4432]"
108907,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B001EYUY7U,0.0,2021-02-26 03:53:51.600,12899,517,Video Games,Sonic Unleashed - Playstation 3,[Sonic the Hedgehog is on an adventure unlike ...,"[Video Games, Legacy Systems, PlayStation Syst...",15.74,2,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ","[-1, -1, -1, -1, -1, -1, -1, 1353, 4432, 1485]"
100117,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B0B9MJK753,5.0,2021-02-26 03:53:51.600,12899,3015,Computers,8Bitdo Wireless USB Adapter 2 for Switch/Switc...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",19.99,2,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ","[-1, -1, -1, -1, -1, -1, -1, 1353, 4432, 1485]"
100118,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B073XKXQW4,5.0,2021-02-26 04:07:50.271,12899,2344,Computers,BASSTOP Portable DIY Replacement Housing Shell...,[Specially designed case to DIY your personali...,"[Video Games, Nintendo Switch, Accessories, Ca...",20.99,3,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753","[-1, -1, -1, -1, -1, -1, 1353, 4432, 1485, 3015]"
108908,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B004GINKNQ,0.0,2021-02-26 04:07:50.271,12899,1213,Home Audio & Theater,60GB Hard Disk Drive for Xbox 360 Slim,[Enhance your experience on your new Xbox 360 ...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",,3,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753","[-1, -1, -1, -1, -1, -1, 1353, 4432, 1485, 3015]"


## Random eye-ball

In [33]:
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
100114,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B07TL6L7MY,5.0,2020-08-27 20:42:05.426,12899,1353,All Electronics,HD Retrovision Wii/Wii U Premium YPbPr Compone...,[Use these high-quality cables to connect your...,"[Video Games, Legacy Systems, Nintendo Systems...",29.99,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
108904,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B002JTX9XA,0.0,2020-08-27 20:42:05.426,12899,2969,Video Games,Teenage Mutant Ninja Turtles: Smash Up,"[Product description, Features include: •Fight...","[Video Games, Legacy Systems, Nintendo Systems...",30.0,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
100115,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B07H3F94ZN,5.0,2020-08-27 20:43:27.335,12899,4432,All Electronics,HD Retrovision PlayStation 2/3 (PS2/PS3) Premi...,[Use these high-quality cables to connect your...,"[Video Games, Legacy Systems, PlayStation Syst...",29.99,2,5.0,B07TL6L7MY,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1353]"
108905,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B00DV16N6U,0.0,2020-08-27 20:43:27.335,12899,3617,Video Games,Ratchet and Clank: Into the Nexus - PS3,"[Product Description, Ratchet and Clank are ba...","[Video Games, Legacy Systems, PlayStation Syst...",38.88,2,5.0,B07TL6L7MY,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1353]"
108906,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B003JVKHEQ,0.0,2021-02-26 03:52:32.431,12899,444,Video Games,Call of Duty: Black Ops - Xbox 360,"[Product Description, The newest installment i...","[Video Games, Legacy Systems, Xbox Systems, Xb...",37.89,1,,"B07TL6L7MY,B07H3F94ZN","[-1, -1, -1, -1, -1, -1, -1, -1, 1353, 4432]"
100116,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B09ZTV42CQ,5.0,2021-02-26 03:52:32.431,12899,1485,Video Games,8Bitdo Adapter 2 USB Wireless Switch Controlle...,[],"[Video Games, Nintendo Switch]",19.99,1,,"B07TL6L7MY,B07H3F94ZN","[-1, -1, -1, -1, -1, -1, -1, -1, 1353, 4432]"
108907,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B001EYUY7U,0.0,2021-02-26 03:53:51.600,12899,517,Video Games,Sonic Unleashed - Playstation 3,[Sonic the Hedgehog is on an adventure unlike ...,"[Video Games, Legacy Systems, PlayStation Syst...",15.74,2,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ","[-1, -1, -1, -1, -1, -1, -1, 1353, 4432, 1485]"
100117,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B0B9MJK753,5.0,2021-02-26 03:53:51.600,12899,3015,Computers,8Bitdo Wireless USB Adapter 2 for Switch/Switc...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",19.99,2,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ","[-1, -1, -1, -1, -1, -1, -1, 1353, 4432, 1485]"
100118,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B073XKXQW4,5.0,2021-02-26 04:07:50.271,12899,2344,Computers,BASSTOP Portable DIY Replacement Housing Shell...,[Specially designed case to DIY your personali...,"[Video Games, Nintendo Switch, Accessories, Ca...",20.99,3,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753","[-1, -1, -1, -1, -1, -1, 1353, 4432, 1485, 3015]"
108908,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B004GINKNQ,0.0,2021-02-26 04:07:50.271,12899,1213,Home Audio & Theater,60GB Hard Disk Drive for Xbox 360 Slim,[Enhance your experience on your new Xbox 360 ...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",,3,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753","[-1, -1, -1, -1, -1, -1, 1353, 4432, 1485, 3015]"


In [34]:
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
164873,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B0039O6KZS,5.0,2022-06-16 05:57:45.488,12899,1811,Computers,GGG0024 Black Xbox 360 Replacement Thumbsticks...,"[After using a controller for a while, one of ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",11.49,1,,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753,B0...","[-1, -1, -1, -1, -1, 1353, 4432, 1485, 3015, 2..."
108909,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B002JTX610,0.0,2022-06-16 05:57:45.488,12899,533,Video Games,Dante's Inferno - Sony PSP,"[Product Description, Based on the immensely i...","[Video Games, Legacy Systems, PlayStation Syst...",35.15,1,,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753,B0...","[-1, -1, -1, -1, -1, 1353, 4432, 1485, 3015, 2..."
108910,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B0083GAF12,0.0,2022-06-16 06:48:09.875,12899,2711,Video Games,FIFA Soccer 13 [Download],"[Amazon.com, FIFA 13, is the 2012 release in E...","[Video Games, PC]",,2,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753,B0...","[-1, -1, -1, -1, 1353, 4432, 1485, 3015, 2344,..."
164874,AGHZW6SSXNPOIZ77QGRXILPT6IJQ,B005FMLZQQ,5.0,2022-06-16 06:48:09.875,12899,1918,Video Games,Xbox 360 Media Remote,"[Product Description, Control your console ent...","[Video Games, Legacy Systems, Xbox Systems, Xb...",,2,5.0,"B07TL6L7MY,B07H3F94ZN,B09ZTV42CQ,B0B9MJK753,B0...","[-1, -1, -1, -1, 1353, 4432, 1485, 3015, 2344,..."


# Persist

In [37]:
full_features_df.to_parquet(
    "../data/full_features_neg_sampling_df.parquet", index=False
)
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)