# Negative sampling

More label data can tremendously help model to learn. In this notebook we would prepare negative samples for each user based on the unseen items. The unseen items are sampled based on how frequent they appear in the training dataset. This is an intentional choice to make the negative sample harder, hence potentially more useful. This would also force the model learn relevant patterns about user behaviors rather than biased by popularity.

# Set up

In [1]:
import os
import sys
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.id_mapper import IDMapper
from src.ranker.negative_sampling import generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "000-sequence-modeling"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    neg_to_pos_ratio: int = 1

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sequence-modeling",
  "notebook_persist_dp": "/Users/dvq/frostmourne/recsys-mvp/notebooks/data/000-sequence-modeling",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "neg_to_pos_ratio": 1
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

In [4]:
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
neg_df = generate_negative_samples(df, neg_to_pos_ratio=5)

  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
0,1,105,0,1
0,1,106,0,1
1,1,104,0,2
1,1,106,0,2
1,1,105,0,2
2,1,106,0,4
2,1,104,0,4
2,1,105,0,4
3,2,103,0,1


# Load data

In [7]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")
idm = IDMapper().load("../data/idm.json")

In [8]:
assert val_df[args.timestamp_col].min() > train_df[args.timestamp_col].max()
val_timestamp = train_df[args.timestamp_col].max() + timedelta(seconds=1)
logger.info(f"{val_timestamp=}")

[32m2024-10-27 23:21:54.817[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=Timestamp('2021-08-10 21:28:25.764000')[0m


In [9]:
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,1,5.00,1,5.0,0,,0,,0,352,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,,0,,0,,0,,0,1110,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,,0,,0,,0,,0,387,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,2,4.00,2,4.0,2,4.0,2,4.0,0,532,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352..."
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,1,5.00,0,,0,,0,,0,648,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,2,5.00,0,,0,,0,,19494,3626,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433..."
958,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,5,4.80,0,,0,,0,,19494,4164,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400..."
959,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,3,4.00,0,,0,,0,,19494,4311,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357..."
960,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,1,5.00,0,,0,,0,,19497,3990,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3..."


# Load item sequence from Feature Store

## Test load

In [10]:
user_id = full_df[args.user_col].iloc[0]
logger.info(f"Test load with {user_id=}")

entity_df = pd.DataFrame.from_dict(
    {
        "user_id": [user_id],
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
        ],
    }
)

features = [
    "user_rating_stats:user_rating_cnt_90d",
    "user_rating_stats:user_rating_avg_prev_rating_90d",
    "user_rating_stats:user_rating_list_10_recent_asin",
]

store = FeatureStore(
    repo_path="..", fs_yaml_file="../feature_store_offline_server.yaml"
)
training_df = store.get_historical_features(entity_df, features).to_df()
training_df

[32m2024-10-27 23:21:54.878[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest load with user_id='AE225O22SA7DLBOGOEIFL7FT5VYQ'[0m


Unnamed: 0,user_id,event_timestamp,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2021-04-12 10:59:42,2,1.0,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."


In [11]:
entity_df

Unnamed: 0,user_id,event_timestamp
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2021-04-12 10:59:42


## Real load

In [12]:
%%time
features_df = store.get_historical_features(full_df[[args.user_col, args.timestamp_col]].drop_duplicates(), features).to_df()
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
CPU times: user 106 ms, sys: 58.9 ms, total: 165 ms
Wall time: 13.7 s


In [13]:
full_features_df = pd.merge(
    full_df, features_df, on=[args.user_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,item_indice,main_category,title,description,categories,price,item_sequence,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,1,5.00,1,5.0,0,,...,352,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,,0,,0,,...,1110,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",2,5.0,B0006B7DXA
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,,0,,0,,...,387,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,"B0006B7DXA,B001LETH2Q"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,2,4.00,2,4.0,2,4.0,...,532,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,1,5.00,0,,0,,...,648,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,2,5.00,0,,0,,...,3626,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433...",1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,5,4.80,0,,0,,...,4164,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400...",2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,3,4.00,0,,0,,...,4311,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357...",3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,1,5.00,0,,0,,...,3990,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3...",1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0..."


In [14]:
logger.info(f"Eye-balling if the features are correct...")
full_features_df.loc[lambda df: df[args.user_col].eq(user_id)].sort_values(
    args.timestamp_col
)[
    [
        args.user_col,
        args.timestamp_col,
        args.item_col,
        "item_sequence",
        "user_rating_list_10_recent_asin",
    ]
]

[32m2024-10-27 23:22:10.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mEye-balling if the features are correct...[0m


Unnamed: 0,user_id,timestamp,parent_asin,item_sequence,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2004-09-14 03:38:36,B0006B7DXA,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,2004-10-13 23:01:27,B001LETH2Q,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",B0006B7DXA
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,2005-08-26 21:05:52,B0009XEC02,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","B0006B7DXA,B001LETH2Q"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,2007-04-13 21:47:03,B000NNDN1M,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352...","B0006B7DXA,B001LETH2Q,B0009XEC02"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,2008-08-13 18:33:22,B00136MBHA,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M"
5,AE225O22SA7DLBOGOEIFL7FT5VYQ,2013-03-17 23:51:07,B007VTVRFA,"[-1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 1110.0, ...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."
6,AE225O22SA7DLBOGOEIFL7FT5VYQ,2013-06-09 21:07:26,B0053BCML6,"[-1.0, -1.0, -1.0, -1.0, 352.0, 1110.0, 387.0,...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."


In [15]:
def convert_asin_to_idx(inp: str, sequence_length=10, padding_value=-1):
    if inp is None:
        return [padding_value] * sequence_length
    asins = inp.split(",")
    indices = [idm.get_item_index(item_id) for item_id in asins]
    padding_needed = sequence_length - len(indices)
    output = np.pad(
        indices,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )
    return output

In [16]:
full_features_df = full_features_df.assign(
    item_sequence_fs=lambda df: df["user_rating_list_10_recent_asin"].apply(
        convert_asin_to_idx
    )
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,main_category,title,description,categories,price,item_sequence,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence_fs
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,1,5.00,1,5.0,0,,...,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,,0,,0,,...,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",2,5.0,B0006B7DXA,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 352]"
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,,0,,0,,...,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,"B0006B7DXA,B001LETH2Q","[-1, -1, -1, -1, -1, -1, -1, -1, 352, 1110]"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,2,4.00,2,4.0,2,4.0,...,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02","[-1, -1, -1, -1, -1, -1, -1, 352, 1110, 387]"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,1,5.00,0,,0,,...,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M","[-1, -1, -1, -1, -1, -1, 352, 1110, 387, 532]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,2,5.00,0,,0,,...,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433...",1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...","[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,5,4.80,0,,0,,...,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400...",2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...","[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,3,4.00,0,,0,,...,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357...",3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...","[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,1,5.00,0,,0,,...,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3...",1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...","[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3..."


In [17]:
# There are cases that item_sequence transformed by Python is different than implemented SQL due to a bit of randomness when it comes to duplicated timestamp sampling
# Anyway we can ignore them if there are not many for now, the effort invested to solve this problem is deemed unworthy at the moment.
MISMATCH_THRESHOLD = 0.01


def check_item_sequence_wrong():
    return full_features_df["item_sequence"].apply(sum) != full_features_df[
        "item_sequence_fs"
    ].apply(sum)


assert (
    check_item_sequence_wrong().sum() <= len(full_features_df) * MISMATCH_THRESHOLD
), display(full_features_df.loc[check_item_sequence_wrong()].sample(1))

In [18]:
# This snipppet examines the cases where there are mismatch (look for match column = False)
if check_item_sequence_wrong().sum() > 0:
    user_id = full_features_df.loc[check_item_sequence_wrong()].sample(1)[args.user_col]
    with pd.option_context("display.max_colwidth", None):
        display(
            full_features_df.loc[lambda df: df[args.user_col].eq(user_id)]
            .sort_values(args.timestamp_col)[
                [
                    args.timestamp_col,
                    args.item_col,
                    "item_indice",
                    "item_sequence",
                    "item_sequence_fs",
                ]
            ]
            .assign(
                match=lambda df: df["item_sequence"].apply(sum)
                == df["item_sequence_fs"].apply(sum)
            )
        )

Unnamed: 0,timestamp,parent_asin,item_indice,item_sequence,item_sequence_fs,match
93378,2015-05-28 02:41:59,B003A0T6YS,1441,"[2187.0, 2040.0, 1749.0, 1320.0, 1322.0, 1021.0, 1419.0, 607.0, 1103.0, 2107.0]","[115, 2040, 1749, 1320, 1322, 1021, 1419, 607, 1103, 2107]",False


In [19]:
# Drop the Python impl. of item_sequence to use Feature Store instead
full_features_df = full_features_df.drop(columns=["item_sequence"]).rename(
    columns={"item_sequence_fs": "item_sequence"}
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,1,5.00,1,5.0,0,,...,352,Video Games,World of Warcraft Battle Chest - (Obsolete),[Experience the World of Warcraft! World of Wa...,"[Video Games, PC, Games]",,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,,0,,0,,...,1110,Video Games,World of Warcraft Collector's Edition,"[From the Manufacturer, We're very excited to ...","[Video Games, PC, Games]",999.99,2,5.0,B0006B7DXA,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 352]"
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,,0,,0,,...,387,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,1,,"B0006B7DXA,B001LETH2Q","[-1, -1, -1, -1, -1, -1, -1, -1, 352, 1110]"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,2,4.00,2,4.0,2,4.0,...,532,Video Games,Super Paper Mario,[The newest chapter of the Paper Mario story i...,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02","[-1, -1, -1, -1, -1, -1, -1, 352, 1110, 387]"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,1,5.00,0,,0,,...,648,Video Games,The World Ends With You,"[Product description, Welcome to Shibuya. Wake...","[Video Games, Legacy Systems, Nintendo Systems]",99.99,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M","[-1, -1, -1, -1, -1, -1, 352, 1110, 387, 532]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,2,5.00,0,,0,,...,3626,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...","[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,5,4.80,0,,0,,...,4164,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49,2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...","[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,3,4.00,0,,0,,...,4311,Video Games,Darksiders 2: Deathinitive Edition - Xbox One ...,"[What starts with War, ends in Death. Awakened...","[Video Games, Xbox One, Games]",14.99,3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...","[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,1,5.00,0,,0,,...,3990,Video Games,Mudrunner - American Wilds Edition - PlayStati...,[Mud Runner - American Wilds is the ultimate v...,"[Video Games, PlayStation 4, Games]",23.98,1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...","[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3..."


# Generate negative samples

In [20]:
meta_features = ["main_category", "title", "description", "categories", "price"]

item_timestamp_features = [
    "parent_asin_rating_cnt_365d",
    "parent_asin_rating_avg_prev_rating_365d",
    "parent_asin_rating_cnt_90d",
    "parent_asin_rating_avg_prev_rating_90d",
    "parent_asin_rating_cnt_30d",
    "parent_asin_rating_avg_prev_rating_30d",
    "parent_asin_rating_cnt_7d",
    "parent_asin_rating_avg_prev_rating_7d",
]

item_features_df = full_features_df.drop_duplicates(subset=[args.item_col])[
    [args.item_col, "item_indice", *meta_features]
]

In [21]:
features = [
    "item_sequence",
    "user_id",
    "user_rating_cnt_90d",
    "user_rating_avg_prev_rating_90d",
    "user_rating_list_10_recent_asin",
]

neg_df = generate_negative_samples(
    full_features_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    seed=args.random_seed,
    features=features,
)

neg_df = neg_df.pipe(
    lambda df: pd.merge(
        df, item_features_df, how="left", on="item_indice", validate="m:1"
    )
)

  0%|          | 0/165260 [00:00<?, ?it/s]

# Get item timestamp feature  for negative samples
The newly generated negative samples would have the new (timestamp, item_indice). To be able to get the item timestamp features like item popularity we need to call feature store

In [22]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,parent_asin,main_category,title,description,categories,price
0,0,1444,0,2004-09-14 03:38:36.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,,B003A71XKQ,Video Games,KMD Xbox 360 Live Gaming Headset with Mic,[Take your Xbox Live experience to the next le...,[],6.99
1,0,374,0,2004-10-13 23:01:27.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 352]",AE225O22SA7DLBOGOEIFL7FT5VYQ,2,5.0,B0006B7DXA,B0009A4EV2,Video Games,Dragon Quest VIII: Journey of the Cursed King,"[From the Manufacturer, Dragon Quest VIII: Jou...","[Video Games, Legacy Systems, PlayStation Syst...",135.56
2,0,3074,0,2005-08-26 21:05:52.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 352, 1110]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,"B0006B7DXA,B001LETH2Q",B00ZM5OXD8,Video Games,Dishonored 2 - PlayStation 4,[Reprise your role as a supernatural assassin ...,"[Video Games, PlayStation 4, Games]",7.5
3,0,348,0,2007-04-13 21:47:03.000,"[-1, -1, -1, -1, -1, -1, -1, 352, 1110, 387]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02",B000641ZC2,Video Games,Mario Power Tennis,[Mario Power Tennis brings the heroes and vill...,"[Video Games, Legacy Systems, Nintendo Systems...",57.72
4,0,776,0,2008-08-13 18:33:22.000,"[-1, -1, -1, -1, -1, -1, 352, 1110, 387, 532]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M",B001ELJE5G,Video Games,Guitar Hero III: Legends of Rock - Xbox 360,"[Product description, Product InformationThe t...","[Video Games, Legacy Systems, Xbox Systems, Xb...",62.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,19494,507,0,2021-11-14 04:54:45.568,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433...",AHZKTZHKO3Z6UYWEYMH4YL52K3LA,1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...",B000JVM256,Video Games,The Legend of Zelda: Majora's Mask - Collector...,[Game cartridge for the Nintendo 64 video game...,"[Video Games, Legacy Systems, Nintendo Systems...",120.0
165256,19494,2311,0,2021-11-14 04:55:34.529,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400...",AHZKTZHKO3Z6UYWEYMH4YL52K3LA,2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...",B00B3PDHBU,Video Games,Turtle Beach Ear Force XP510 BS-2290-01 5.1 Wi...,"[For the ultimate in gaming immersion, you nee...","[Video Games, Legacy Systems, PlayStation Syst...",
165257,19494,2456,0,2022-02-05 09:31:46.803,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357...",AHZKTZHKO3Z6UYWEYMH4YL52K3LA,3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...",B00DQNF3PU,Video Games,Zumba Fitness World Party - Xbox One,[Only the premier global fitness brand can off...,"[Video Games, Xbox One, Games]",54.9
165258,19497,2126,0,2021-10-17 23:51:36.799,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3...",AHZLVBGFP4FNOJGC33CZQSHUQXWA,1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...",B0088TN7NW,Video Games,Just Dance 4,"[From the Manufacturer, Throw the ultimate par...","[Video Games, Legacy Systems, Xbox Systems]",32.95


In [23]:
%%time
ts_features = [f"parent_asin_rating_stats:{feature}" for feature in item_timestamp_features]

neg_ts_features_df = store.get_historical_features(neg_df[[args.item_col, args.timestamp_col]].drop_duplicates(), ts_features).to_df()
assert neg_ts_features_df.duplicated().sum() == 0, display(neg_ts_features_df.loc[neg_ts_features_df.duplicated()])



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
CPU times: user 100 ms, sys: 69.2 ms, total: 169 ms
Wall time: 45.3 s


In [24]:
neg_df = pd.merge(
    neg_df, neg_ts_features_df, on=[args.item_col, args.timestamp_col], how="left"
)
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,parent_asin,...,categories,price,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d
0,0,1444,0,2004-09-14 03:38:36.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,,B003A71XKQ,...,[],6.99,,,,,,,,
1,0,374,0,2004-10-13 23:01:27.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 352]",AE225O22SA7DLBOGOEIFL7FT5VYQ,2,5.0,B0006B7DXA,B0009A4EV2,...,"[Video Games, Legacy Systems, PlayStation Syst...",135.56,,,,,,,,
2,0,3074,0,2005-08-26 21:05:52.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 352, 1110]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,"B0006B7DXA,B001LETH2Q",B00ZM5OXD8,...,"[Video Games, PlayStation 4, Games]",7.5,,,,,,,,
3,0,348,0,2007-04-13 21:47:03.000,"[-1, -1, -1, -1, -1, -1, -1, 352, 1110, 387]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02",B000641ZC2,...,"[Video Games, Legacy Systems, Nintendo Systems...",57.72,1.0,5.000,0.0,,0.0,,0.0,
4,0,776,0,2008-08-13 18:33:22.000,"[-1, -1, -1, -1, -1, -1, 352, 1110, 387, 532]",AE225O22SA7DLBOGOEIFL7FT5VYQ,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M",B001ELJE5G,...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",62.99,16.0,3.875,4.0,4.75,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,19494,507,0,2021-11-14 04:54:45.568,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433...",AHZKTZHKO3Z6UYWEYMH4YL52K3LA,1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...",B000JVM256,...,"[Video Games, Legacy Systems, Nintendo Systems...",120.0,0.0,,0.0,,0.0,,0.0,
165256,19494,2311,0,2021-11-14 04:55:34.529,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400...",AHZKTZHKO3Z6UYWEYMH4YL52K3LA,2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...",B00B3PDHBU,...,"[Video Games, Legacy Systems, PlayStation Syst...",,0.0,,0.0,,0.0,,0.0,
165257,19494,2456,0,2022-02-05 09:31:46.803,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357...",AHZKTZHKO3Z6UYWEYMH4YL52K3LA,3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...",B00DQNF3PU,...,"[Video Games, Xbox One, Games]",54.9,0.0,,0.0,,0.0,,0.0,
165258,19497,2126,0,2021-10-17 23:51:36.799,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3...",AHZLVBGFP4FNOJGC33CZQSHUQXWA,1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...",B0088TN7NW,...,"[Video Games, Legacy Systems, Xbox Systems]",32.95,1.0,5.000,1.0,5.00,0.0,,0.0,


# Concating positive data with negative samples

In [25]:
full_features_df = (
    pd.concat([full_features_df, neg_df], axis=0)
    .reset_index(drop=True)
    .sample(frac=1, replace=False, random_state=args.random_seed)
)

In [26]:
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
assert (
    full_features_df[key_cols].isna().sum().sum() == 0
), "Null values found at key colums"

In [27]:
val_timestamp

Timestamp('2021-08-10 21:28:25.764000')

# Split back train test

In [28]:
to_unix_ts = lambda s: s.astype("int64") // 10**6
train_neg_df = full_features_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
val_neg_df = full_features_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]

In [29]:
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
251552,AG57LGJFCNNQJ6P6ABQAVUKXDUDA,B0015AARJI,0.0,2016-01-12 11:59:11.000,76.0,4.592105,10.0,4.3,3.0,5.0,...,660,Video Games,PlayStation 3 Dualshock 3 Wireless Controller ...,"[Amazon.com, The Dualshock 3 wireless controll...","[Video Games, Legacy Systems, PlayStation Syst...",49.99,2,5.000000,B00J00BLRM,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2662]"
325041,AHWG4EGOV5ZDKPETL56MAYGPLJRQ,B0BMGHMP23,0.0,2016-04-18 19:26:20.000,,,,,,,...,4568,Computers,Logitech G502 Lightspeed Wireless Gaming Mouse...,[G502 is the best gaming mouse from Logitech G...,"[Video Games, PC, Accessories, Gaming Mice]",87.95,3,5.000000,"B00YOGZFCO,B00KWFCSB2,B00L3LQ1FI,B0151K6J9Y,B0...","[3028, 2742, 2755, 3159, 3101, 3036, 3051, 313..."
293086,AH5PTZ2U74OZ3HT6QVUWM4CV6OVQ,B009AP23NI,0.0,2016-02-10 18:45:08.000,9.0,4.666667,0.0,,0.0,,...,2219,Video Games,Nintendo Wii U Pro U Controller (Japanese Vers...,[Wii U PRO controller (black) (WUP-A-RSKA)],"[Video Games, Legacy Systems, Nintendo Systems...",43.99,8,4.428571,"B0199OXR0W,B00EVPR4FY,B00B7ELWAU,B00UH9DN58,B0...","[-1, -1, 3234, 2508, 2318, 2964, 1258, 2439, 4..."
52027,AFC5XTCF5D7J3NSDITB2Z26XWWYA,B001E8WQUY,5.0,2019-05-01 21:22:39.265,0.0,,0.0,,0.0,,...,724,Video Games,Rock Band 2 - Nintendo Wii (Game only),"[Product description, Rock Band 2 lets you and...","[Video Games, Legacy Systems, Nintendo Systems...",28.49,1,,"B006HZA6VK,B0BN2FNKLM,B0086VPUHI,B0040UAYI4,B0...","[1987, 4569, 2114, 1606, 2159, 2279, 2447, 441..."
48913,AF7LJQOIWF3Y3YD7SGOJ34MA5JPA,B001E8WQKY,5.0,2015-01-09 12:53:25.000,16.0,4.375000,8.0,4.5,4.0,4.5,...,722,Video Games,Resident Evil 5 - Xbox 360,[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",29.88,3,5.000000,"B00A2ML6XG,B003VUO6LU","[-1, -1, -1, -1, -1, -1, -1, -1, 2261, 1579]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250960,AG4RATLNVLOKZCPXN67HKOAK65CA,B078FBVJMB,0.0,2015-10-31 18:25:09.000,,,,,,,...,3829,Video Games,A Way Out – PC Origin [Online Game Code],[From the creators of Brothers - A Tale of Two...,"[Video Games, PC, Games]",5.99,1,,B00TFVD688,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2951]"
217058,AFBXO3BFWBJX6QS5NW73O37IXF2A,B0771ZXXV6,0.0,2011-03-08 02:06:38.000,,,,,,,...,3806,Video Games,Nintendo Joy-Con (R) - Neon Red - Nintendo Switch,[To be determined],"[Video Games, Nintendo Switch, Accessories, Co...",,3,4.000000,"B003JVCA9Q,B0029NZ4HA","[-1, -1, -1, -1, -1, -1, -1, -1, 1488, 1199]"
323468,AHVANA5GZNJ45UABPXWZNAF4ECBQ,B00BBF6MO6,0.0,2015-02-15 05:31:04.000,3.0,4.666667,0.0,,0.0,,...,2327,Video Games,Killer is Dead - Xbox 360,[Killer Is Dead is the latest title from the d...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",39.82,1,,"B002L93F0A,B002KJ02ZC,B001H4NMNA","[-1, -1, -1, -1, -1, -1, -1, 1377, 1374, 1092]"
132003,AHAVA5VKMJ3OMOLGDZ3W45CKXEWA,B00KTORA0K,5.0,2019-05-25 04:03:51.505,3.0,4.666667,1.0,5.0,1.0,5.0,...,2726,Video Games,Just Dance 2015 - Wii,[With more than 50 million copies of Just Danc...,"[Video Games, Legacy Systems, Nintendo Systems...",33.0,2,5.000000,"B004AYCNR0,B007NUQICE,B000TYQL1O,B000SEU92W,B0...","[-1, -1, -1, 1657, 2074, 593, 583, 3715, 3448,..."


In [30]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
165059,AHAQV7A4Z2Z5NJLIPMEPPXDNIRWQ,B07VHHPJBV,2.0,2022-04-10 16:30:15.820,7.0,4.000000,0.0,,0.0,,...,4204,Computers,"Redragon M602 Griffin RGB Gaming Mouse, RGB Sp...",[Redragon M602 (White) GRIFFIN High-Precision ...,"[Video Games, PC, Accessories, Gaming Mice]",19.99,1,,"B00AEWTSWU,B003ELPQJM,B0111ZC4OC,B00E4MQODC,B0...","[-1, -1, -1, -1, -1, 2275, 1464, 3096, 2478, 3..."
330244,AGVAG2GSFQZUAXMRSKKSGKEHGG5A,B00XR3YBM0,0.0,2022-01-19 20:57:42.960,1.0,1.000000,1.0,1.00,0.0,,...,3012,Video Games,WWE 2K16 - PlayStation 4,"[WWE 2K16, Get in the ring and Raise Some Hell...","[Video Games, PlayStation 4, Games]",63.45,2,5.0,"B004LVO4M4,B00004YRQA,B005FMLZQQ,B074G34RY2,B0...","[-1, -1, -1, 1736, 117, 1911, 3747, 4598, 3455..."
164871,AGHUHJSJLGWPS3JE2FD7D5GJODWQ,B00DBDPOZ4,5.0,2022-01-16 16:27:51.164,4.0,4.750000,0.0,,0.0,,...,2428,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,1,,"B00006IR62,B0B4CRTWGM,B000TYQL1O,9629971372,B0...","[-1, -1, -1, 228, 4526, 593, 2, 711, 790, 862]"
330291,AH4TWYG3FCAEPNJXS6E7KR247YMQ,B000034DC6,0.0,2021-11-25 20:51:44.194,1.0,5.000000,1.0,5.00,1.0,5.0,...,63,Video Games,Twisted Metal 4,"[Product description, Sweet Tooth and his pump...","[Video Games, Legacy Systems, PlayStation Syst...",69.98,1,,"B00004YRQA,B00006F2ZP,B000TLU67W,B000B69E9G,B0...","[-1, -1, 117, 215, 589, 404, 2849, 981, 2718, ..."
329619,AEBEAZUAX3HMA7EF3BA6L2DK3LPA,B008HPAXZ2,0.0,2022-06-25 22:48:11.729,0.0,,0.0,,0.0,,...,2157,Video Games,HORI Nintendo 3DS XL Screen Protective Filter,[Officially licensed by Nintendo. This is the ...,"[Video Games, Legacy Systems, Nintendo Systems...",,2,5.0,"B06XMRQ68B,B0BLXJC8MZ,B0748N6796,B0BS9YCBYY,B0...","[-1, -1, -1, 3577, 4566, 3740, 4579, 4554, 400..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330199,AGOX6SXJ74WKBFGBKKZTR6PNOICA,B004Q9SO4K,0.0,2022-03-05 06:30:59.489,0.0,,0.0,,0.0,,...,1754,Video Games,NCAA Football 12 - Xbox 360,"[Product Description, NCAA Football 12 takes t...","[Video Games, Legacy Systems, Xbox Systems, Xb...",43.56,1,,"B01E10MWDW,B013HMN66M,B01GY35HKE,B07DLNHQZX,B0...","[-1, -1, -1, -1, 3312, 3126, 3373, 3944, 4130,..."
164950,AGRGN2RA4EOW3T4GFI4H76WHUGSA,B07JK9DFKH,5.0,2022-02-12 04:30:57.128,8.0,4.000000,1.0,3.00,1.0,3.0,...,4022,Cell Phones & Accessories,REAKA Upgraded USB Type C to HDMI Digital AV M...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",22.99,1,,"B00PDRZG9U,B0728CRKP2,B073SQKLGW,B01K1OO5PU,B0...","[2890, 3687, 3719, 3423, 4113, 2911, 4213, 434..."
164643,AFL2OB53OGEIQCA4CMFCTQV3FJJA,B00BCX2AZW,5.0,2022-01-13 15:10:43.458,0.0,,0.0,,0.0,,...,2332,Video Games,Monster Hunter 3 Ultimate - Nintendo Wii U,[The popular fantasy action franchise Monster ...,"[Video Games, Legacy Systems, Nintendo Systems...",54.35,1,,"B08CL92HPG,B01N2V26CN,B002I0K3CK,B00DC7G2W8,B0...","[-1, -1, -1, -1, -1, 4367, 3520, 1316, 2436, 2..."
330048,AG6WZATM27WOFLQY4435O3P52SWA,B08JHZHWZ3,0.0,2021-11-23 19:07:31.054,71.0,4.084507,4.0,4.25,1.0,2.0,...,4388,Video Games,"Super Mario 3D All-Stars - Nintendo Switch, 17...",[Play three of Mario’s greatest 3D platforming...,"[Video Games, Nintendo Switch, Games]",,2,5.0,"B0793GSKLV,B00XY2UYUW,B07YBXFF5C,B077GG9D5D,B0...","[3844, 3017, 4284, 3813, 3345, 2376, 4629, 352..."


# Checks

In [31]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() * (args.neg_to_pos_ratio + 1)
    == check_df.shape[0]
), "Unexpected number of pos and neg samples"

[32m2024-10-27 23:24:03.866[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AHSMK75Q6UN55FEZEDBUJ4ZKYXAA...[0m


In [32]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert (
    train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5
), f"Item {item} does not appear much in training data"

[32m2024-10-27 23:24:03.878[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B07RBMZRP3...[0m


## Random eye-ball

In [33]:
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
154694,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B00Q6DC96S,5.0,2015-06-13 19:20:14.000,1.0,1.0,1.0,1.0,0.0,,...,2909,Video Games,Dark Souls II: Scholar of the First Sin - Xbox...,"[The definitive edition of DARK SOULS II, incl...","[Video Games, Xbox One, Games]",,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
319954,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B00CISMP8M,0.0,2015-06-13 19:20:14.000,69.0,4.73913,6.0,5.0,0.0,,...,2390,Video Games,FIFA 14 - Xbox 360,[Experience the emotion of scoring great goals...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",25.0,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
319955,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B000084318,0.0,2020-04-13 00:58:41.238,1.0,5.0,0.0,,0.0,,...,248,Video Games,The Legend of Zelda: The Wind Waker,"[Product Description, Continue Link's adventur...","[Video Games, Legacy Systems, Nintendo Systems...",116.98,1,,B00Q6DC96S,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2909]"
154695,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B07624RBWB,5.0,2020-04-13 00:58:41.238,84.0,4.714286,21.0,4.714286,2.0,4.5,...,3788,Video Games,Nintendo Switch Pro Controller,[],"[Video Games, Nintendo Switch, Accessories, Co...",69.0,1,,B00Q6DC96S,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2909]"
154696,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B09ZTV42CQ,5.0,2020-07-22 02:00:07.233,4.0,4.5,1.0,5.0,0.0,,...,4520,Video Games,8Bitdo Adapter 2 USB Wireless Switch Controlle...,[],"[Video Games, Nintendo Switch]",19.99,1,,"B00Q6DC96S,B07624RBWB","[-1, -1, -1, -1, -1, -1, -1, -1, 2909, 3788]"
319956,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B014R4KYMS,0.0,2020-07-22 02:00:07.233,10.0,4.9,2.0,5.0,0.0,,...,3151,Video Games,Uncharted 4: A Thief's End - PlayStation 4,[Uncharted comes to the PlayStation 4.Uncharte...,"[Video Games, PlayStation 4, Games]",24.99,1,,"B00Q6DC96S,B07624RBWB","[-1, -1, -1, -1, -1, -1, -1, -1, 2909, 3788]"
154697,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B0C3KYVDWT,5.0,2020-07-22 02:13:37.412,75.0,4.68,15.0,4.466667,5.0,4.2,...,4607,Computers,"SanDisk 128GB microSDXC-Card, Licensed for Nin...","[With incredible speed, the officially license...","[Video Games, Nintendo Switch, Accessories]",14.99,2,5.0,"B00Q6DC96S,B07624RBWB,B09ZTV42CQ","[-1, -1, -1, -1, -1, -1, -1, 2909, 3788, 4520]"
319957,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B0036EWMIK,0.0,2020-07-22 02:13:37.412,4.0,4.5,1.0,5.0,0.0,,...,1426,Video Games,Castlevania: Lords of Shadow,[Castlevania – Lords of Shadow is a dark and v...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",42.2,2,5.0,"B00Q6DC96S,B07624RBWB,B09ZTV42CQ","[-1, -1, -1, -1, -1, -1, -1, 2909, 3788, 4520]"
319958,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B072K62L3S,0.0,2020-11-21 00:43:30.613,0.0,,0.0,,0.0,,...,3695,Video Games,Super Lucky's Tale: Standard Edition - Xbox On...,[Welcome to Adventure! “Super Lucky’s Tale” is...,"[Video Games, Xbox One, Games]",19.99,1,,"B00Q6DC96S,B07624RBWB,B09ZTV42CQ,B0C3KYVDWT","[-1, -1, -1, -1, -1, -1, 2909, 3788, 4520, 4607]"
154698,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B08D3XL1KF,5.0,2020-11-21 00:43:30.613,35.0,4.857143,2.0,5.0,0.0,,...,4368,Video Games,Super Smash Bros. Ultimate: Challenger Pack 2 ...,[The Hero from the DRAGON QUEST XI game joins ...,"[Video Games, Nintendo Switch, Games]",5.99,1,,"B00Q6DC96S,B07624RBWB,B09ZTV42CQ,B0C3KYVDWT","[-1, -1, -1, -1, -1, -1, 2909, 3788, 4520, 4607]"


In [34]:
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,...,item_indice,main_category,title,description,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
165194,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B07RBMZRP3,5.0,2021-11-11 05:18:12.971,2.0,5.0,0.0,,0.0,,...,4156,Video Games,Dark Souls II: Scholar of the First Sin - Play...,[Prepare to Die again in the complete Dark Sou...,"[Video Games, PlayStation 4, Games]",16.37,1,,"B00Q6DC96S,B07624RBWB,B09ZTV42CQ,B0C3KYVDWT,B0...","[-1, -1, 2909, 3788, 4520, 4607, 4368, 788, 12..."
330454,AHSMK75Q6UN55FEZEDBUJ4ZKYXAA,B001JKTC9A,0.0,2021-11-11 05:18:12.971,1.0,5.0,0.0,,0.0,,...,1103,Video Games,Uncharted 2: Among Thieves - Playstation 3,"[Product Description, Uncharted 2: Among Thiev...","[Video Games, Legacy Systems, PlayStation Syst...",19.99,1,,"B00Q6DC96S,B07624RBWB,B09ZTV42CQ,B0C3KYVDWT,B0...","[-1, -1, 2909, 3788, 4520, 4607, 4368, 788, 12..."


# Persist

In [35]:
full_features_df.to_parquet(
    "../data/full_features_neg_sampling_df.parquet", index=False
)
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)