# Negative sampling

More labeled data can tremendously help the model learn. In this notebook we prepare negative samples for each user from the items they have not interacted with. The unseen items are sampled in proportion to how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns about user behavior rather than being biased by item popularity.

# Set up

In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import add_features_to_neg_df, generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the run identity, the random seed and the column names used to address
    the interaction tables. Call ``init()`` once after construction to resolve and
    create the persist directory.
    """

    testing: bool = False
    run_name: str = "000-sample-rich-data"
    # Stays None until init() resolves it, hence Optional rather than plain str.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names of the interaction tables.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self):
        """Resolve the persist directory, create it if missing, and return ``self``.

        The directory is ``data/<run_name>`` resolved against the current working
        directory. Returning ``self`` allows ``Args().init()`` chaining.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the config; init() also creates the persist directory as a side effect.
args = Args().init()

# Echo the resolved configuration as JSON so the run records its own settings.
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sample-rich-data",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/000-sample-rich-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy input: one (user, item, rating, timestamp) tuple per interaction
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Turn the tuples into a DataFrame; the rating and timestamp columns take their
# names from the notebook config so the toy frame matches the real data.
df = pd.DataFrame.from_records(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

In [4]:
# Inspect the toy interactions frame.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Smoke-test the sampling helpers on the toy frame. Column names and the seed are
# passed explicitly (same call shape as the full-data run further down) so the toy
# result is reproducible and does not depend on the helpers' default arguments.
neg_df = generate_negative_samples(
    df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# Use the configured timestamp column: the toy frame was built with args.timestamp_col.
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# The negatives all share one label (0 in the displayed output), so ordering on the
# rating column does nothing; order by timestamp within each user instead.
neg_df.sort_values(["user_indice", args.timestamp_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,105,0,1
1,1,106,0,2
2,1,104,0,4
3,2,105,0,1
4,2,103,0,2
5,3,103,0,1
6,3,102,0,5


# Load data

In [7]:
# Load the train / validation feature tables from the shared data directory.
# NOTE(review): presumably written by an earlier feature-engineering notebook — confirm.
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [8]:
# Validation must start strictly after the last training interaction; the first
# timestamp past the training range then serves as the train/validation boundary.
last_train_ts = train_df[args.timestamp_col].max()
assert val_df[args.timestamp_col].min() > last_train_ts
val_timestamp = last_train_ts + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-26 22:57:03.399[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [9]:
# Stack train and validation rows into one frame for negative sampling.
# Row labels are kept as-is (no ignore_index), so labels may repeat across the two frames.
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,19876,2823,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,3530,1515,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,12674,123,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,2994,1085,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,2994,3490,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,6031,803,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[3629, 1663, 1826, 2705, 796, 3933, 2950, 3060..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,8270,4310,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 1300, 1525, 2578, 752, 3993]"
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,6994,1259,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 1505, 2557, 173, 4293, 1348, 3358..."
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,10344,2913,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[3687, 830, 399, 955, 2795, 4184, 4528, 353, 1..."


In [10]:
# Columns carried over onto the generated negatives. The user id column name comes
# from the config instead of a hard-coded literal.
features = ["item_sequence", args.user_col]

# Draw negative samples (label 0), seeded for reproducibility.
# NOTE(review): sampling runs on full_df (train + validation combined) while the intro
# says item frequency comes from the training data — confirm this is intended.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# Attach a timestamp and the carried-over feature columns to each negative row
# (presumably borrowed from the same user's positive rows — see src.negative_sampling).
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1152,0,1536615707592,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGVPBI5JZBJH46V6QFGUVG2F7IUA
1,0,196,0,1543448552368,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGVPBI5JZBJH46V6QFGUVG2F7IUA
2,0,3204,0,1543449125878,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGVPBI5JZBJH46V6QFGUVG2F7IUA
3,0,181,0,1543449320788,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 262...",AGVPBI5JZBJH46V6QFGUVG2F7IUA
4,0,516,0,1543450139687,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2622.0, 3...",AGVPBI5JZBJH46V6QFGUVG2F7IUA
...,...,...,...,...,...,...
170989,20365,308,0,1404732383000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGSVFZYLKH64IY2AOM4E634RTHA
170990,20365,1171,0,1404732529000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGSVFZYLKH64IY2AOM4E634RTHA
170991,20365,4552,0,1404732586000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGSVFZYLKH64IY2AOM4E634RTHA
170992,20365,759,0,1405079984000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 417...",AHGSVFZYLKH64IY2AOM4E634RTHA


# Join with features

Populate the item feature values for the newly created negative samples.

In [11]:
# Item features

# Columns that describe the user or the interaction itself; everything else in
# full_df is treated as item metadata.
not_item_feature_cols = tuple(
    [
        args.user_col,
        "user_indice",
        "item_indice",
        args.rating_col,
        args.timestamp_col,
    ]
    + features
)
# Keep full_df's column order while filtering out the non-item columns.
item_features = list(
    filter(lambda col: col not in not_item_feature_cols, full_df.columns)
)
item_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [12]:
# One row of item metadata per item_indice, taken from full_df.
item_lookup_df = full_df[["item_indice", *item_features]].drop_duplicates(
    subset=["item_indice"]
)
neg_ts_df = pd.merge(
    # Drop item feature columns left behind by a previous run of this cell so that
    # re-running it does not produce `_x`/`_y` suffixed duplicates. On a fresh run
    # none of these columns exist yet and the drop is a no-op.
    neg_ts_df.drop(columns=item_features, errors="ignore"),
    item_lookup_df,
    how="left",
    on=["item_indice"],
    # Fail loudly if the lookup ever holds more than one row per item.
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1152,0,1536615707592,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGVPBI5JZBJH46V6QFGUVG2F7IUA,B001EYUQDW,Video Games,Borderlands Ultimate Edition - PS3 [Digital Code],"[Product Description, Gearbox Software has dev...","[Video Games, Legacy Systems, PlayStation Syst...",
1,0,196,0,1543448552368,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGVPBI5JZBJH46V6QFGUVG2F7IUA,B00VUCEXNW,Video Games,LEGO Dimensions Starter Pack - Xbox One,[When a mysterious and powerful vortex suddenl...,"[Video Games, Xbox One, Games]",199.99
2,0,3204,0,1543449125878,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGVPBI5JZBJH46V6QFGUVG2F7IUA,B00FATRKLI,Video Games,Xbox 360 E 250GB Kinect Holiday Value Bundle,[The Xbox 360 250GB Kinect Holiday Value Bundl...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",184.99
3,0,181,0,1543449320788,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 262...",AGVPBI5JZBJH46V6QFGUVG2F7IUA,B09GM4283G,Video Games,PlayStation PULSE 3D Wireless Headset – Midnig...,[Ignite your gaming nights with the ultra-slee...,"[Video Games, PlayStation 5, Accessories, Gami...",99.0
4,0,516,0,1543450139687,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2622.0, 3...",AGVPBI5JZBJH46V6QFGUVG2F7IUA,B0049P1ZYW,Video Games,The Sims 3: Late Night - PC/Mac,"[Product Description, Get your Sims an all-acc...","[Video Games, PC, Games]",31.88
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,308,0,1404732383000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGSVFZYLKH64IY2AOM4E634RTHA,B001G7PSIU,Video Games,Just Cause 2 - Xbox 360 Digital Code,[Dive into an adrenaline-fuelled free-roaming ...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",
170990,20365,1171,0,1404732529000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGSVFZYLKH64IY2AOM4E634RTHA,B0087LXZRE,Video Games,WWE '13,"[It takes blood, sweat, and tears to become th...","[Video Games, Legacy Systems, PlayStation Syst...",70.0
170991,20365,4552,0,1404732586000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGSVFZYLKH64IY2AOM4E634RTHA,B0049H2W74,Video Games,Rapala Pro Bass Fishing 2010 - Playstation 3,[Rapala Pro Bass Fishing 2010 pits players aga...,"[Video Games, Legacy Systems, PlayStation Syst...",70.0
170992,20365,759,0,1405079984000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 417...",AHGSVFZYLKH64IY2AOM4E634RTHA,B002BG1PXM,Video Games,Star Wars the Clone Wars: Republic Heroes - Xb...,"[Product Description, Star Wars The Clone Wars...","[Video Games, Legacy Systems, Xbox Systems, Xb...",33.45


In [13]:
# Spot-check one item: it must map to exactly one item_indice both in the negatives
# and in the original interactions, and the two mappings must agree. The draw is
# seeded so that a failing check can be reproduced.
item = neg_ts_df.sample(n=1, random_state=args.random_seed)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda frame: frame[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda frame: frame[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"
assert (
    original_item_indices.iloc[0] == neg_item_indices.iloc[0]
), f"item_indice for {item} differs between original df and neg_ts_df"

[32m2024-09-26 22:57:12.834[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B001EYUNUS...[0m


# Concatenating positive data with negative samples

In [14]:
# Combine positives and negatives, then shuffle the rows with a fixed seed.
# The positives are taken from train_df / val_df rather than from full_df itself, so
# re-running this cell does not stack a second copy of the negatives onto an
# already-combined frame. On a fresh top-to-bottom run the result is the same.
full_df = pd.concat([train_df, val_df, neg_ts_df], axis=0).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [15]:
# Inspect the shuffled frame of positives plus negatives.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AETXPRHIFDJPZH62JOVAKYPN4IJA,B07HC4ZD3B,0.0,1572818050904,13294,1565,Video Games,Poké Ball Plus Charge Stand Officially License...,[Create a Pokemon center at home with HORI's N...,"[Video Games, Nintendo Switch, Accessories, Ba...",189.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
65458,AE4UCYLS3NXHINCFZW6FFPT7VX5A,B003S9WQAM,0.0,1289338003000,7769,1208,Video Games,EA Sports Active 2 - Playstation 3,"[Product Description, EA SPORTS Active 2 with ...","[Video Games, Legacy Systems, PlayStation Syst...",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
79874,AHHRPUW2QJC7F2OGM3ACAHW4PXVQ,B00B1OYBBS,0.0,1420679625000,9498,4137,Video Games,Enemy Front - Xbox 360,[Against the visceral backdrop of breathtaking...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
102744,AF6GXCYIAWC6G33YPOJ5IBGD6RKQ,B004C43FH0,0.0,1427907775000,12279,989,Video Games,Rollercoaster Tycoon 2: Triple Thrill Pack [Do...,"[From the Manufacturer, This compilation of Ro...","[Video Games, PC, Games]",9.99,"[1042.0, 2148.0, 4637.0, 54.0, 2854.0, 4479.0,..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,8779,472,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3711.0, 4..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AHNTZVONMCC55XXEESJJ2I3OMRMQ,B005IU8A5O,0.0,1445376835000,9511,2439,All Electronics,JINHEZO JINHEZO-CS-370 for Xbox 360 Kinect Sen...,[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",7.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1081.0, 115.0, ..."
46064,AFC4IA7LI5GB43LNPRFF37H2FYFQ,B00IAVDQCK,0.0,1483041968000,5491,2382,Video Games,Xbox One Stereo Headset,"[Surround your senses, Immerse yourself in ric...","[Video Games, Xbox One, Accessories, Headsets]",18.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 151..."
152474,AGJD6XPC33AX7OVG4GKFHLP5KMGA,B001EYUSC6,0.0,1489418014000,18182,1049,,FIFA Soccer 09 - Sony PSP,"[Product description, Play professional soccer...","[Video Games, Legacy Systems, PlayStation Syst...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 431..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,819,1720,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 173..."


In [16]:
# Columns that must be fully populated for every row, positive or negative.
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
# Count nulls per column so a failure names the offending columns, not just that
# something is missing.
null_counts = full_df[key_cols].isna().sum()
assert (
    null_counts.sum() == 0
), f"Null values found at key columns: {null_counts[null_counts > 0].to_dict()}"

In [17]:
# Train/validation boundary computed earlier from the last training timestamp.
val_timestamp

np.int64(1628641464793)

# Split back into train and validation

In [18]:
# Rows strictly before the boundary go to train; rows at or after it go to validation.
train_neg_df = full_df[full_df[args.timestamp_col] < val_timestamp]
val_neg_df = full_df[full_df[args.timestamp_col] >= val_timestamp]

In [19]:
# Inspect the training split (rows with timestamp before val_timestamp).
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AETXPRHIFDJPZH62JOVAKYPN4IJA,B07HC4ZD3B,0.0,1572818050904,13294,1565,Video Games,Poké Ball Plus Charge Stand Officially License...,[Create a Pokemon center at home with HORI's N...,"[Video Games, Nintendo Switch, Accessories, Ba...",189.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
65458,AE4UCYLS3NXHINCFZW6FFPT7VX5A,B003S9WQAM,0.0,1289338003000,7769,1208,Video Games,EA Sports Active 2 - Playstation 3,"[Product Description, EA SPORTS Active 2 with ...","[Video Games, Legacy Systems, PlayStation Syst...",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
79874,AHHRPUW2QJC7F2OGM3ACAHW4PXVQ,B00B1OYBBS,0.0,1420679625000,9498,4137,Video Games,Enemy Front - Xbox 360,[Against the visceral backdrop of breathtaking...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
102744,AF6GXCYIAWC6G33YPOJ5IBGD6RKQ,B004C43FH0,0.0,1427907775000,12279,989,Video Games,Rollercoaster Tycoon 2: Triple Thrill Pack [Do...,"[From the Manufacturer, This compilation of Ro...","[Video Games, PC, Games]",9.99,"[1042.0, 2148.0, 4637.0, 54.0, 2854.0, 4479.0,..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,8779,472,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3711.0, 4..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AHNTZVONMCC55XXEESJJ2I3OMRMQ,B005IU8A5O,0.0,1445376835000,9511,2439,All Electronics,JINHEZO JINHEZO-CS-370 for Xbox 360 Kinect Sen...,[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",7.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1081.0, 115.0, ..."
46064,AFC4IA7LI5GB43LNPRFF37H2FYFQ,B00IAVDQCK,0.0,1483041968000,5491,2382,Video Games,Xbox One Stereo Headset,"[Surround your senses, Immerse yourself in ric...","[Video Games, Xbox One, Accessories, Headsets]",18.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 151..."
152474,AGJD6XPC33AX7OVG4GKFHLP5KMGA,B001EYUSC6,0.0,1489418014000,18182,1049,,FIFA Soccer 09 - Sony PSP,"[Product description, Play professional soccer...","[Video Games, Legacy Systems, PlayStation Syst...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 431..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,819,1720,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 173..."


In [20]:
# Inspect the validation split (rows with timestamp at or after val_timestamp).
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
88244,AEHCPMCLIMWHW7TEFRCOZN4MOFGA,B00V5V3E38,0.0,1633967679730,10525,4119,Video Games,Legend of Kay Anniversary - Nintendo Wii U,"[10 years after its initial release, Legend of...","[Video Games, Legacy Systems, Nintendo Systems...",,"[-1, -1, -1, -1, 3766, 4102, 2872, 2810, 502, ..."
98196,AGFPKXT34G5FGWARKXZC4GJTJQUQ,B01GY3651O,0.0,1630491281842,11724,2977,Video Games,XCOM 2 Deluxe Edition [Online Game Code],[The XCOM 2 Digital Deluxe Edition includes th...,"[Video Games, PC, Games]",,"[594, 4578, 3289, 4664, 1057, 3845, 3010, 2691..."
62176,AH4AOFTTDPHPAFAAVFMAF25H2LIQ,B09B14PJCG,0.0,1641748747823,7401,4682,Video Games,A Plague Tale: Innocence (XB1) - Xbox One,[Follow the grim tale of young Amicia and her ...,"[Video Games, Xbox One, Games]",59.99,"[4064, 24, 1441, 1805, 2619, 2254, 2274, 1454,..."
42605,AEXTTZIJDNXIXQZFR5O7IJRXO3GA,B081243BT6,0.0,1637074638494,5074,3104,Cell Phones & Accessories,Orzly Carrying case for Nintendo Switch OLED a...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",29.99,"[-1, -1, -1, -1, -1, 811, 2540, 313, 4394, 1846]"
74,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,1616,4127,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,"[1459, 1860, 3264, 569, 2143, 773, 4483, 296, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
13997,AFPSHZKKUL2YDGIDBQQUNRGE5MXQ,B07DD7QTBM,0.0,1641121394999,1672,2700,,Just Dance 2019 - Xbox One Standard Edition,"[Dance to your own beat with Just Dance 2019, ...","[Video Games, Xbox One, Games]",12.3,"[-1, -1, -1, 2830, 1342, 1294, 1749, 2558, 385..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,49,1887,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 3179, 1489, 2225, 3399, 3142, 4247, 3801,..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,6619,2246,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 2780, 3158, 130, 1164, 1030]"
156523,AEBTSECUK7ZEECNSRHQLMKO3E5VA,B002BSA388,0.0,1642567970979,18651,3724,Video Games,Super Mario Galaxy 2,"[Product Description, Launch into a new univer...","[Video Games, Legacy Systems, Nintendo Systems...",80.82,"[-1, -1, -1, -1, 2135, 2729, 14, 3639, 3118, 955]"


# Checks

In [21]:
# Pick one validation user and verify that their training rows hold as many
# positives as negatives. The draw is seeded so this check, and the item check that
# reuses `user`, give the same result on every run instead of depending on chance.
user = val_neg_df.sample(n=1, random_state=args.random_seed)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda frame: frame[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
# Integer comparison (2 * positives == rows) avoids comparing against a float half.
# A user with no training rows passes trivially.
assert (
    2 * check_df[args.rating_col].gt(0).sum() == check_df.shape[0]
), "Number of pos and neg samples are not equal"

[32m2024-09-26 22:57:12.970[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AFQZ627T7A364657B4JO43XRW2TQ...[0m


In [22]:
# Same user, validation side: take their earliest positive item and make sure the
# training split contains that item more than five times.
user_val_mask = val_neg_df[args.user_col] == user
val_check_df = val_neg_df[user_val_mask].sort_values(args.timestamp_col)
positive_items = val_check_df[val_check_df[args.rating_col] > 0][args.item_col]
item = positive_items.values[0]
logger.info(f"Checking item {item}...")
train_item_count = int((train_neg_df[args.item_col] == item).sum())
assert train_item_count > 5, f"Item {item} does not appear much in training data"

[32m2024-09-26 22:57:12.984[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B016P09VFS...[0m


## Random eye-ball

In [23]:
# Eyeball the checked user's training rows, ordered by timestamp.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
44911,AFQZ627T7A364657B4JO43XRW2TQ,B000SSPH3E,5.0,1388691164000,5891,2359,Video Games,Final Fantasy Tactics: The War of the Lions - ...,[Enter a time where the land Ivalice has been ...,"[Video Games, Legacy Systems, PlayStation Syst...",25.58,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
49491,AFQZ627T7A364657B4JO43XRW2TQ,B003QCIVUU,0.0,1388691164000,5891,2104,Video Games,WipeOut: The Game,"[Product Description, All the Thrills and spil...","[Video Games, Legacy Systems, Nintendo Systems...",24.95,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
44912,AFQZ627T7A364657B4JO43XRW2TQ,B001EYUNU8,5.0,1388691280000,5891,2434,Video Games,The Last Remnant - PC,"[Product Description, Square Enix brings the b...","[Video Games, PC, Games]",79.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
49492,AFQZ627T7A364657B4JO43XRW2TQ,B0044XU27A,0.0,1388691280000,5891,2667,Video Games,Kingdoms of Amalur: Reckoning - Xbox 360,"[Product Description, The minds of, New York T...","[Video Games, Legacy Systems, Xbox Systems, Xb...",14.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
44913,AFQZ627T7A364657B4JO43XRW2TQ,B004G5YI3U,1.0,1388691515000,5891,4242,Video Games,Mass Effect 3 Digital Deluxe Version [Download],[],"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
49493,AFQZ627T7A364657B4JO43XRW2TQ,B000A4T1U6,0.0,1388691515000,5891,637,Video Games,Castlevania II: Simon's Quest,[Castlevania was a cakewalk compared to this b...,"[Video Games, Legacy Systems, Nintendo Systems...",20.71,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
169402,AFQZ627T7A364657B4JO43XRW2TQ,B0BTM9LJM4,5.0,1623204618875,5891,3686,Video Games,HORI Nintendo Switch Split Pad Pro (Pokemon Le...,[Become a true Pokemon trainer legend with the...,"[Video Games, Nintendo Switch, Accessories, Co...",59.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 235..."
49494,AFQZ627T7A364657B4JO43XRW2TQ,B003L20IPQ,0.0,1623204618875,5891,3192,Video Games,The Sims 3 - Nintendo DS,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, Nintendo Systems...",49.9,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 235..."
49495,AFQZ627T7A364657B4JO43XRW2TQ,B072BX7H6Q,0.0,1623205241488,5891,1037,Video Games,F1 2017 Special Edition - PlayStation 4,"[Win the 2017 World Championship, break every ...","[Video Games, PlayStation 4, Games]",47.45,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2359.0, 2..."
169403,AFQZ627T7A364657B4JO43XRW2TQ,B01N3ASPNV,1.0,1623205241488,5891,4057,All Electronics,amFilm Tempered Glass Screen Protector for Nin...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",8.91,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2359.0, 2..."


In [24]:
# Eyeball the same user's validation rows, ordered by timestamp.
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
848,AFQZ627T7A364657B4JO43XRW2TQ,B016P09VFS,4.0,1654198460425,5891,2207,Video Games,KontrolFreek FPS Freek Galaxy Purple for PlayS...,[],"[Video Games, PlayStation 4, Accessories, Thum...",16.99,"[-1, -1, -1, -1, -1, 2359, 2434, 4242, 3686, 4..."
49496,AFQZ627T7A364657B4JO43XRW2TQ,B00KXAI6MQ,0.0,1654198460425,5891,4471,Video Games,Disney INFINITY: Marvel Super Heroes (2.0 Edit...,[],"[Video Games, PlayStation 4, Games]",111.96,"[-1, -1, -1, -1, -1, 2359, 2434, 4242, 3686, 4..."
49497,AFQZ627T7A364657B4JO43XRW2TQ,B01ARLTPMA,0.0,1654199171505,5891,945,All Electronics,YCCTEAM Power Supply Brick for Xbox One with P...,"[Highlight:, * YCCTEAM ac adapter for Xbox One...","[Video Games, Xbox One, Accessories, Cables & ...",26.99,"[-1, -1, -1, -1, 2359, 2434, 4242, 3686, 4057,..."
849,AFQZ627T7A364657B4JO43XRW2TQ,B08DFB488B,3.0,1654199171505,5891,2317,Video Games,Xbox Core Wireless Controller – Shock Blue,[Experience the modernized design of the Xbox ...,[],43.99,"[-1, -1, -1, -1, 2359, 2434, 4242, 3686, 4057,..."


# Persist

In [25]:
# Persist the shuffled positives + negatives for the whole period (index dropped).
# NOTE(review): this writes to a hard-coded ../data path; args.notebook_persist_dp is
# created in the Controller cell but not used here — confirm which location is intended.
full_df.to_parquet("../data/full_features_neg_sampling_df.parquet", index=False)

In [26]:
# Write the train and validation splits (positives + negatives) without the index.
for split_df, split_fp in (
    (train_neg_df, "../data/train_features_neg_df.parquet"),
    (val_neg_df, "../data/val_features_neg_df.parquet"),
):
    split_df.to_parquet(split_fp, index=False)