# Negative sampling

More label data can tremendously help model to learn. In this notebook we would prepare negative samples for each user based on the unseen items. The unseen items are sampled based on how frequent they appear in the training dataset. This is an intentional choice to make the negative sample harder, hence potentially more useful. This would also force the model learn relevant patterns about user behaviors rather than biased by popularity.

# Set up

In [1]:
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.data_prep_utils import handle_dtypes, parse_dt
from src.id_mapper import IDMapper
from src.ranker.negative_sampling import generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "000-sequence-modeling"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    neg_to_pos_ratio: int = 3

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sequence-modeling",
  "notebook_persist_dp": "/Users/dvq/frostmourne/recsys-mvp/notebooks/data/000-sequence-modeling",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "neg_to_pos_ratio": 3
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

In [4]:
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
neg_df = generate_negative_samples(df, neg_to_pos_ratio=5)

  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,105,0,1
0,1,106,0,1
0,1,104,0,1
1,1,105,0,2
1,1,104,0,2
1,1,106,0,2
2,1,104,0,4
2,1,106,0,4
2,1,105,0,4
3,2,103,0,1


# Load data

In [7]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")
idm = IDMapper().load("../data/idm.json")

In [8]:
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-10-27 15:07:14.153[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=1628630904765[0m


In [9]:
full_df = pd.concat([train_df, val_df], axis=0).pipe(parse_dt).pipe(handle_dtypes)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,item_sequence
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,0,352,Video Games,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,1110,Video Games,"[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,387,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,0,532,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352..."
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,0,648,Video Games,"[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11..."
...,...,...,...,...,...,...,...,...,...,...
957,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,19494,3626,Video Games,"[Video Games, Xbox One, Games]",149.99,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433..."
958,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,19494,4164,Video Games,"[Video Games, Nintendo Switch, Games]",52.49,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400..."
959,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,19494,4311,Video Games,"[Video Games, Xbox One, Games]",14.99,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357..."
960,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,19497,3990,Video Games,"[Video Games, PlayStation 4, Games]",23.98,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3..."


# Load item sequence from Feature Store

## Test load

In [10]:
user_id = full_df[args.user_col].iloc[0]
logger.info(f"Test load with {user_id=}")

entity_df = pd.DataFrame.from_dict(
    {
        "user_id": [user_id],
        "event_timestamp": [
            datetime(2021, 4, 12, 10, 59, 42),
        ],
    }
)

features = [
    "user_rating_stats:user_rating_cnt_90d",
    "user_rating_stats:user_rating_avg_prev_rating_90d",
    "user_rating_stats:user_rating_list_10_recent_asin",
]

store = FeatureStore(
    repo_path="..", fs_yaml_file="../feature_store_offline_server.yaml"
)
training_df = store.get_historical_features(entity_df, features).to_df()
training_df

[32m2024-10-27 15:07:14.194[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest load with user_id='AE225O22SA7DLBOGOEIFL7FT5VYQ'[0m


Unnamed: 0,user_id,event_timestamp,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2021-04-12 10:59:42,2,1.0,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."


In [11]:
entity_df

Unnamed: 0,user_id,event_timestamp
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2021-04-12 10:59:42


## Real load

In [12]:
%%time
features_df = store.get_historical_features(full_df[[args.user_col, args.timestamp_col]].drop_duplicates(), features).to_df()
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
CPU times: user 87.4 ms, sys: 33.2 ms, total: 121 ms
Wall time: 8.52 s


In [13]:
full_features_df = pd.merge(
    full_df, features_df, on=[args.user_col, args.timestamp_col], how="left"
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,item_sequence,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,0,352,Video Games,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,1110,Video Games,"[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",2,5.0,B0006B7DXA
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,387,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,"B0006B7DXA,B001LETH2Q"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,0,532,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,0,648,Video Games,"[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,19494,3626,Video Games,"[Video Games, Xbox One, Games]",149.99,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433...",1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,19494,4164,Video Games,"[Video Games, Nintendo Switch, Games]",52.49,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400...",2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,19494,4311,Video Games,"[Video Games, Xbox One, Games]",14.99,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357...",3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,19497,3990,Video Games,"[Video Games, PlayStation 4, Games]",23.98,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3...",1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0..."


In [14]:
logger.info(f"Eye-balling if the features are correct...")
full_features_df.loc[lambda df: df[args.user_col].eq(user_id)].sort_values(
    args.timestamp_col
)[
    [
        args.user_col,
        args.timestamp_col,
        args.item_col,
        "item_sequence",
        "user_rating_list_10_recent_asin",
    ]
]

[32m2024-10-27 15:07:23.214[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mEye-balling if the features are correct...[0m


Unnamed: 0,user_id,timestamp,parent_asin,item_sequence,user_rating_list_10_recent_asin
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,2004-09-14 03:38:36,B0006B7DXA,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,2004-10-13 23:01:27,B001LETH2Q,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",B0006B7DXA
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,2005-08-26 21:05:52,B0009XEC02,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","B0006B7DXA,B001LETH2Q"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,2007-04-13 21:47:03,B000NNDN1M,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352...","B0006B7DXA,B001LETH2Q,B0009XEC02"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,2008-08-13 18:33:22,B00136MBHA,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M"
5,AE225O22SA7DLBOGOEIFL7FT5VYQ,2013-03-17 23:51:07,B007VTVRFA,"[-1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 1110.0, ...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."
6,AE225O22SA7DLBOGOEIFL7FT5VYQ,2013-06-09 21:07:26,B0053BCML6,"[-1.0, -1.0, -1.0, -1.0, 352.0, 1110.0, 387.0,...","B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M,B0..."


In [15]:
def convert_asin_to_idx(inp: str, sequence_length=10, padding_value=-1):
    if inp is None:
        return [padding_value] * sequence_length
    asins = inp.split(",")
    indices = [idm.get_item_index(item_id) for item_id in asins]
    padding_needed = sequence_length - len(indices)
    output = np.pad(
        indices,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )
    return output

In [16]:
full_features_df = full_features_df.assign(
    item_sequence_fs=lambda df: df["user_rating_list_10_recent_asin"].apply(
        convert_asin_to_idx
    )
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,item_sequence,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence_fs
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,0,352,Video Games,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,1110,Video Games,"[Video Games, PC, Games]",999.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",2,5.0,B0006B7DXA,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 352]"
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,387,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",1,,"B0006B7DXA,B001LETH2Q","[-1, -1, -1, -1, -1, -1, -1, -1, 352, 1110]"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,0,532,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02","[-1, -1, -1, -1, -1, -1, -1, 352, 1110, 387]"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,0,648,Video Games,"[Video Games, Legacy Systems, Nintendo Systems]",99.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 352.0, 11...",1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M","[-1, -1, -1, -1, -1, -1, 352, 1110, 387, 532]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,19494,3626,Video Games,"[Video Games, Xbox One, Games]",149.99,"[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433...",1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...","[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,19494,4164,Video Games,"[Video Games, Nintendo Switch, Games]",52.49,"[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400...",2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...","[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,19494,4311,Video Games,"[Video Games, Xbox One, Games]",14.99,"[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357...",3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...","[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,19497,3990,Video Games,"[Video Games, PlayStation 4, Games]",23.98,"[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3...",1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...","[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3..."


In [17]:
# There are cases that item_sequence transformed by Python is different than implemented SQL due to a bit of randomness when it comes to duplicated timestamp sampling
# Anyway we can ignore them if there are not many for now, the effort invested to solve this problem is deemed unworthy at the moment.
MISMATCH_THRESHOLD = 0.01

def check_item_sequence_wrong():
    return full_features_df["item_sequence"].apply(sum) != full_features_df[
        "item_sequence_fs"
    ].apply(sum)


assert (
    check_item_sequence_wrong().sum() <= len(full_features_df) * MISMATCH_THRESHOLD
), display(full_features_df.loc[check_item_sequence_wrong()].sample(1))

In [18]:
# This snipppet examines the cases where there are mismatch (look for match column = False)
if check_item_sequence_wrong().sum() > 0:
    user_id = full_features_df.loc[check_item_sequence_wrong()].sample(1)[args.user_col]
    with pd.option_context("display.max_colwidth", None):
        display(
            full_features_df.loc[lambda df: df[args.user_col].eq(user_id)]
            .sort_values(args.timestamp_col)[
                [
                    args.timestamp_col,
                    args.item_col,
                    "item_indice",
                    "item_sequence",
                    "item_sequence_fs",
                ]
            ]
            .assign(
                match=lambda df: df["item_sequence"].apply(sum)
                == df["item_sequence_fs"].apply(sum)
            )
        )

Unnamed: 0,timestamp,parent_asin,item_indice,item_sequence,item_sequence_fs,match
16804,2017-04-11 17:47:02,B00DJRLDMU,2448,"[2572.0, 2913.0, 3298.0, 3398.0, 3610.0, 3373.0, 3216.0, 734.0, 3402.0, 2441.0]","[2717, 2913, 3298, 3398, 3610, 3373, 3216, 734, 3402, 2441]",False


In [19]:
# Drop the Python impl. of item_sequence to use Feature Store instead
full_features_df = full_features_df.drop(columns=["item_sequence"]).rename(
    columns={"item_sequence_fs": "item_sequence"}
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
0,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0006B7DXA,5.0,2004-09-14 03:38:36.000,0,352,Video Games,"[Video Games, PC, Games]",,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AE225O22SA7DLBOGOEIFL7FT5VYQ,B001LETH2Q,5.0,2004-10-13 23:01:27.000,0,1110,Video Games,"[Video Games, PC, Games]",999.99,2,5.0,B0006B7DXA,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 352]"
2,AE225O22SA7DLBOGOEIFL7FT5VYQ,B0009XEC02,5.0,2005-08-26 21:05:52.000,0,387,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,1,,"B0006B7DXA,B001LETH2Q","[-1, -1, -1, -1, -1, -1, -1, -1, 352, 1110]"
3,AE225O22SA7DLBOGOEIFL7FT5VYQ,B000NNDN1M,5.0,2007-04-13 21:47:03.000,0,532,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",49.99,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02","[-1, -1, -1, -1, -1, -1, -1, 352, 1110, 387]"
4,AE225O22SA7DLBOGOEIFL7FT5VYQ,B00136MBHA,5.0,2008-08-13 18:33:22.000,0,648,Video Games,"[Video Games, Legacy Systems, Nintendo Systems]",99.99,1,,"B0006B7DXA,B001LETH2Q,B0009XEC02,B000NNDN1M","[-1, -1, -1, -1, -1, -1, 352, 1110, 387, 532]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B0716CXJ1R,5.0,2021-11-14 04:54:45.568,19494,3626,Video Games,"[Video Games, Xbox One, Games]",149.99,1,,"B071YZVS75,B07DML9W41,B00V5V3E38,B01N6QKT7H,B0...","[3670, 3945, 2970, 3537, 4407, 4453, 4002, 433..."
165256,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B07SM7G9CN,5.0,2021-11-14 04:55:34.529,19494,4164,Video Games,"[Video Games, Nintendo Switch, Games]",52.49,2,5.0,"B07DML9W41,B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B0...","[3945, 2970, 3537, 4407, 4453, 4002, 4337, 400..."
165257,AHZKTZHKO3Z6UYWEYMH4YL52K3LA,B081W1VBKN,5.0,2022-02-05 09:31:46.803,19494,4311,Video Games,"[Video Games, Xbox One, Games]",14.99,3,5.0,"B00V5V3E38,B01N6QKT7H,B08MBQ51KG,B094WQR3H3,B0...","[2970, 3537, 4407, 4453, 4002, 4337, 4006, 357..."
165258,AHZLVBGFP4FNOJGC33CZQSHUQXWA,B07H53PZY8,4.0,2021-10-17 23:51:36.799,19497,3990,Video Games,"[Video Games, PlayStation 4, Games]",23.98,1,,"B001EYUU4W,B001CU4EMW,B000VTQ3LU,B00YOGZFCO,B0...","[983, 703, 616, 3028, 660, 3599, 4534, 2804, 3..."


# Generate negative samples

In [20]:
item_features_df = (
    full_features_df.drop_duplicates(subset=[args.item_col])
    [[args.item_col, 'item_indice', 'main_category', 'categories', 'price']]
)

In [21]:
features = [
    "item_sequence",
    "user_id",
    "user_rating_cnt_90d",
    "user_rating_avg_prev_rating_90d",
    "user_rating_list_10_recent_asin",
]

neg_df = generate_negative_samples(
    full_features_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    seed=args.random_seed,
    features=features
)

neg_df = (
    neg_df
    .pipe(
        lambda df:
        pd.merge(
            df,
            item_features_df,
            how='left',
            on='item_indice',
            validate="m:1"
        )
    )
)

  0%|          | 0/165260 [00:00<?, ?it/s]

# Concating positive data with negative samples

In [22]:
full_features_df = pd.concat([full_features_df, neg_df], axis=0).reset_index(drop=True).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [23]:
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
assert (
    full_features_df[key_cols].isna().sum().sum() == 0
), "Null values found at key colums"

In [24]:
val_timestamp

1628630904765

# Split back train test

In [25]:
to_unix_ts = lambda s: s.astype("int64") // 10**6
train_neg_df = full_features_df.loc[
    lambda df: df[args.timestamp_col].pipe(to_unix_ts).lt(val_timestamp)
]
val_neg_df = full_features_df.loc[
    lambda df: df[args.timestamp_col].pipe(to_unix_ts).ge(val_timestamp)
]

In [26]:
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
230493,AEKSUPM7CH53J3G5PA3JLWLJXUMQ,B00QXJFDZO,0.0,2017-10-30 14:23:22.389,2561,2919,Video Games,"[Video Games, PlayStation 4, Games]",,2,5.000000,"B005FVBYV8,B003FMTZSI,B01MS6WG9S,B073W2T5F6","[-1, -1, -1, -1, -1, -1, 1912, 1470, 3498, 3723]"
629501,AHSNMFN6DUFTNEZAXBVPIYMXWIFQ,B075MYT126,0.0,2017-11-27 22:01:33.258,18413,3777,Video Games,"[Video Games, Nintendo Switch, Accessories, Co...",94.98,2,4.000000,"B00CJ9OTNE,B0118YZG0A,B008M502H6,B003Y70W4U,B0...","[2391, 3100, 2176, 1588, 3161, 2133, 2906, 166..."
328408,AFE47G5MX35LSHZHZXRYEJFMYPUA,B007VYW5K6,0.0,2017-03-23 21:41:18.000,6463,2086,Video Games,"[Video Games, PC]",,1,,"B07YBX8RNF,B0166QDJDQ,B01CHU4IY4,B00Z9LUDX4,B0...","[4278, 3183, 3288, 3038, 4508, 3391, 3403, 368..."
348394,AFJDWGBE3MGULXTO3FUZ5YB6FKDA,B07L5FKGQH,0.0,2017-01-18 15:50:12.000,7246,4048,Video Games,"[Video Games, Xbox One, Games]",49.88,25,4.416667,"B00I6E6SH6,B00O65I2VY,B005GISQQG,B00008KTNW,B0...","[2632, 2859, 1920, 253, 1428, 1053, 584, 732, ..."
577541,AHFDYGJR3SM2D463ZWKGHJPNBKDA,B002BSA2LQ,0.0,2014-01-29 22:50:20.000,16376,1215,Video Games,"[Video Games, Legacy Systems, Xbox Systems, Xb...",31.49,4,5.000000,"B002I0K956,B008CZN458,B0050SXVK8","[-1, -1, -1, -1, -1, -1, -1, 1328, 2146, 1843]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
179669,AE5TQ7DBEX2L5T665M6ZDPGYZ32Q,B01LDUYTYS,0.0,2013-10-05 20:20:52.000,592,3442,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",249.99,1,,B07X1HF3V6,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4237]"
53491,AFDG3CXM4DP7X436YNOKTJHVKJQA,B087NNPYP3,5.0,2018-07-10 21:22:10.594,6351,4342,Video Games,"[Video Games, Nintendo Switch, Consoles]",,3,5.000000,"B002I0H79C,B00503E9FY,B00KVOVBGM,B00SHXKC8M,B0...","[1292, 1807, 2734, 2940, 3402, 2759, 2702, 104..."
368961,AFOUC3S3RH7AXMPZBZHLO4WMLLVA,B004AM65C6,0.0,2018-12-16 13:39:37.174,8062,1651,Video Games,"[Video Games, Legacy Systems, Xbox Systems, Xb...",12.48,2,5.000000,"B002BSA388,B00PIEI1DG,B08MBHYJP4,B071GPJVTQ,B0...","[1216, 2898, 4397, 3643, 3642, 3527, 3423, 367..."
250960,AEPOGF2QMAXO4W3TYP27DCQRITGA,B07X1HF3V6,0.0,2013-05-30 22:53:17.000,3373,4237,Video Games,"[Video Games, Legacy Systems, PlayStation Syst...",34.43,4,3.666667,"B0013OL0BK,B002D2Y3IS,B0044R8X9U,B07VLCRZ21,B0...","[-1, -1, -1, -1, -1, 652, 1240, 1629, 4207, 1707]"


In [27]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
659830,AGFEHP3Y5VKILNRQEJ4ZUTYBAOQQ,B07YBWT3PK,0.0,2022-01-29 17:46:11.047,11485,4270,Video Games,"[Video Games, Xbox One, Games]",8.16,1,,"B007VTVRFA,B0087ACBAW,B00CMQTUSS,B00CQ35C1Q,B0...","[2084, 2117, 2396, 2402, 3066, 2478, 2973, 327..."
660946,AHWKCG443H6VFTPN5FAN2FWCDFPA,B013OW09WY,0.0,2021-09-30 16:00:59.999,19047,3129,Video Games,"[Video Games, Legacy Systems, PlayStation Syst...",13.35,1,,"B009LGO88E,B01MG8P418,B01N7TOICZ,B079M157ZW,B0...","[-1, -1, -1, -1, 2241, 3491, 3541, 3852, 4197,..."
659625,AG6WZATM27WOFLQY4435O3P52SWA,B00001X50M,0.0,2021-11-23 19:07:31.054,10547,36,Video Games,"[Video Games, Legacy Systems, PlayStation Syst...",68.18,2,5.0,"B0793GSKLV,B00XY2UYUW,B07YBXFF5C,B077GG9D5D,B0...","[3844, 3017, 4284, 3813, 3345, 2376, 4629, 352..."
658449,AEGPO7K4HPE6TYTAEOPLPQKTREJQ,B01579EXNS,0.0,2022-03-04 05:39:05.877,1902,3165,Video Games,"[Video Games, PlayStation Digital Content, Games]",,1,,"B009AP2518,B00MUY0OFU,B0C5K4M7WJ,B0716CXJ1R,B0...","[-1, -1, -1, -1, -1, 2220, 2810, 4614, 3626, 4..."
658668,AEQV7EWRJB7UIQVS4CI5PRQ7PJIA,B01N22XPHR,0.0,2022-05-14 19:46:53.469,3572,3519,Computers,"[Video Games, PlayStation 4, Accessories]",10.99,2,5.0,"B00OR7OH88,B01LPZM7VI,B016XBGWAQ,B01NCVU39G,B0...","[-1, -1, -1, -1, 2878, 3448, 3193, 3555, 3865,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
164643,AFL2OB53OGEIQCA4CMFCTQV3FJJA,B00BCX2AZW,5.0,2022-01-13 15:10:43.458,7484,2332,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",54.35,1,,"B08CL92HPG,B01N2V26CN,B002I0K3CK,B00DC7G2W8,B0...","[-1, -1, -1, -1, -1, 4367, 3520, 1316, 2436, 2..."
658223,AE4IFQLINUBOZBP7RMCTICSULMGQ,B0099OUMAQ,0.0,2022-04-24 04:14:48.945,406,2211,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",35.0,3,5.0,"B002EQFXZA,B008OSDEGM,B001EYUQMI,B000GHG9V6,B0...","[1249, 2183, 918, 480, 1589, 1284, 960, 741, 2..."
660589,AHGJSV7BBXT2LQX5PQQ4BICQC4NQ,B00004U1R3,0.0,2021-11-24 19:29:36.067,16565,103,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",108.14,1,,"B0050SX9I2,B00BGA9X9W,B019H5II8Y,B01LRLJV28,B0...","[-1, -1, -1, -1, 1838, 2339, 3238, 3449, 3057,..."
660178,AGTPSXXK4B2NSMTIDJCOIXELXOIA,B07STWQ38X,0.0,2022-05-04 01:26:34.832,13677,4170,Video Games,"[Video Games, PlayStation 4, Games]",17.99,1,,"B07N11TKK9,B08JHYYTMT,B017QU5KLK,B002BSC54I,B0...","[4070, 4387, 3203, 1220, 3705, 4440, 3053, 406..."


# Checks

In [28]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() * (args.neg_to_pos_ratio + 1) == check_df.shape[0]
), "Unexpected number of pos and neg samples"

[32m2024-10-27 15:08:27.508[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AERAX4VNX4JDFBK6BOH6NQ57U4BA...[0m


In [29]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert (
    train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5
), f"Item {item} does not appear much in training data"

[32m2024-10-27 15:08:27.538[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B094YHB1QK...[0m


## Random eye-ball

In [30]:
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
257811,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B01IFE1XV2,0.0,2011-01-13 12:32:26,3634,3403,Video Games,"[Video Games, Xbox One, Consoles]",358.99,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
257810,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B007ZLLV4G,0.0,2011-01-13 12:32:26,3634,2099,,"[Video Games, Legacy Systems, PlayStation Syst...",60.0,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
30850,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B003BFW4OG,5.0,2011-01-13 12:32:26,3634,1447,,"[Video Games, Legacy Systems, PlayStation Syst...",6.72,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
257812,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B06XPX5XLY,0.0,2011-01-13 12:32:26,3634,3579,Tools & Home Improvement,"[Video Games, Legacy Systems, Nintendo Systems...",,1,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
257813,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B07M8ZXQR6,0.0,2011-05-09 12:52:42,3634,4056,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",33.77,1,,B003BFW4OG,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1447]"
257814,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B07QPVTW6R,0.0,2011-05-09 12:52:42,3634,4139,Video Games,"[Video Games, PC, Accessories, Headsets]",129.98,1,,B003BFW4OG,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1447]"
30851,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B001G7PS4Y,3.0,2011-05-09 12:52:42,3634,1078,Video Games,"[Video Games, Legacy Systems, Xbox Systems, Xb...",3.75,1,,B003BFW4OG,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1447]"
257815,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B07L3D7C21,0.0,2011-05-09 12:52:42,3634,4043,Video Games,"[Video Games, Online Game Services, Nintendo e...",19.99,1,,B003BFW4OG,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1447]"
257816,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B00EU2YGV8,0.0,2011-05-09 13:04:39,3634,2504,Video Games,"[Video Games, Legacy Systems, PlayStation Syst...",20.38,2,3.0,"B003BFW4OG,B001G7PS4Y","[-1, -1, -1, -1, -1, -1, -1, -1, 1447, 1078]"
257817,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B001ELJEGU,0.0,2011-05-09 13:04:39,3634,790,Video Games,"[Video Games, Legacy Systems, Xbox Systems, Xb...",17.0,2,3.0,"B003BFW4OG,B001G7PS4Y","[-1, -1, -1, -1, -1, -1, -1, -1, 1447, 1078]"


In [31]:
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,categories,price,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,item_sequence
658688,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B000OLXX86,0.0,2022-01-26 17:48:01.643,3634,543,Video Games,"[Video Games, Legacy Systems, Xbox Systems, Xb...",19.75,1,,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, -1, 1447, 1078, 690, 1140, 1124]"
164476,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B094YHB1QK,5.0,2022-01-26 17:48:01.643,3634,4455,Video Games,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,1,,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, -1, 1447, 1078, 690, 1140, 1124]"
658690,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B01MRN26ES,0.0,2022-01-26 17:48:01.643,3634,3497,Computers,"[Video Games, Mac, Accessories, Gaming Keyboards]",14.99,1,,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, -1, 1447, 1078, 690, 1140, 1124]"
658689,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B01JJFZ3LC,0.0,2022-01-26 17:48:01.643,3634,3413,Video Games,"[Video Games, Legacy Systems, Nintendo Systems...",168.98,1,,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, -1, 1447, 1078, 690, 1140, 1124]"
658692,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B00RU75I2G,0.0,2022-01-26 17:56:36.587,3634,2927,Video Games,"[Video Games, Xbox One, Games]",22.94,2,5.0,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, 1447, 1078, 690, 1140, 1124, ..."
164477,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B07VFYBMTS,5.0,2022-01-26 17:56:36.587,3634,4202,Computers,"[Video Games, PC, Accessories, Gaming Mice]",45.99,2,5.0,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, 1447, 1078, 690, 1140, 1124, ..."
658691,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B07YBX6T95,0.0,2022-01-26 17:56:36.587,3634,4273,Video Games,"[Video Games, Legacy Systems, Xbox Systems, Xb...",27.99,2,5.0,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, 1447, 1078, 690, 1140, 1124, ..."
658693,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B07DK1XZ4M,0.0,2022-01-26 17:56:36.587,3634,3937,Video Games,"[Video Games, Nintendo Switch, Games]",29.76,2,5.0,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, -1, 1447, 1078, 690, 1140, 1124, ..."
658696,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B00J5C3Z10,0.0,2022-01-26 17:59:37.364,3634,2671,Video Games,"[Video Games, PlayStation 4, Games]",17.93,3,5.0,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, 1447, 1078, 690, 1140, 1124, 4455..."
658694,AERAX4VNX4JDFBK6BOH6NQ57U4BA,B00Z9TMBOU,0.0,2022-01-26 17:59:37.364,3634,3060,Video Games,"[Video Games, Xbox One, Games]",9.99,3,5.0,"B003BFW4OG,B001G7PS4Y,B001BP4JY6,B001QCWSII,B0...","[-1, -1, -1, 1447, 1078, 690, 1140, 1124, 4455..."


# Persist

In [32]:
full_features_df.to_parquet(
    "../data/full_features_neg_sampling_df.parquet", index=False
)
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)