# Negative sampling

More labeled data can tremendously help a model learn. In this notebook we prepare negative samples for each user from the items that user has not interacted with. The unseen items are sampled in proportion to how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns in user behavior rather than being biased by item popularity.

# Set up

In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import add_features_to_neg_df, generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    """Run-level configuration for this notebook.

    Holds the run name, the random seed and the column names used throughout
    the notebook. Call ``init()`` after construction to resolve and create the
    persistence directory.
    """

    testing: bool = False
    run_name: str = "000-sample-rich-data"
    # Stays None until init() fills it in, so the annotation must allow None.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self):
        """Resolve ``data/<run_name>`` to an absolute path and create it.

        The path is relative to the current working directory. An already
        existing directory is not an error.

        Returns:
            The same instance, so the call can be chained after the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the run configuration (init() also creates the persistence directory)
# and echo it as JSON so the settings used for this run are recorded in the output.
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sample-rich-data",
  "notebook_persist_dp": "/home/jupyter/frostmourne/reco-algo/notebooks/data/000-sample-rich-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Sample input: list of (user, item, rating, timestamp) interactions,
# matching the column order given to the DataFrame below
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

In [4]:
# Show the toy interactions frame.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Smoke-test the helpers on the toy frame. Column names, label and seed are passed
# explicitly, the same way as in the full-data call later in this notebook, so the
# toy run is reproducible and follows the column names configured in `args`.
neg_df = generate_negative_samples(
    df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Every negative row carries the same rating label, so sorting by rating does nothing;
# order each user's negatives by timestamp instead.
neg_df.sort_values(["user_indice", args.timestamp_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,106,0,1
1,1,105,0,2
2,1,104,0,4
3,2,103,0,1
4,2,102,0,2
5,3,104,0,1
6,3,101,0,5


# Load data

In [7]:
# Load the train and validation feature tables from parquet.
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [8]:
# The validation split must start strictly after the last training interaction.
earliest_val_ts = val_df[args.timestamp_col].min()
latest_train_ts = train_df[args.timestamp_col].max()
assert (earliest_val_ts - latest_train_ts) > 0

# First timestamp that counts as validation; used later to split the rows again.
val_timestamp = latest_train_ts + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-25 02:27:41.567[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [9]:
# Stack train and validation rows into one frame. ignore_index is not set,
# so each split keeps its own row labels.
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,14867,1019,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,1902,739,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,20295,4093,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,17506,3579,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,17506,3497,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,15474,2647,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[1424, 1041, 2933, 3347, 3152, 801, 2420, 1680..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,19776,1150,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 2006, 171, 3263, 1825, 2759]"
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,6226,4672,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 3382, 1427, 3577, 751, 989, 1482,..."
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,9616,2395,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[2505, 2993, 1367, 221, 4253, 4625, 3150, 4257..."


In [10]:
# Extra columns handed to add_features_to_neg_df for the negative rows.
# The user-id column comes from `args` so it follows the notebook configuration
# instead of a hard-coded name.
features = ["item_sequence", args.user_col]

# Generate negative rows (label 0) with a fixed seed so the run is reproducible.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# NOTE(review): presumably this copies the timestamp and `features` columns from the
# user's positive rows onto the negatives; confirm in src.negative_sampling.
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1206,0,1505618588969,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFYDI74M267F3IU7DXCWA5UP72GA
1,0,187,0,1505618621571,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFYDI74M267F3IU7DXCWA5UP72GA
2,0,3164,0,1527487079128,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFYDI74M267F3IU7DXCWA5UP72GA
3,0,172,0,1527487171205,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 185...",AFYDI74M267F3IU7DXCWA5UP72GA
4,0,552,0,1527487410172,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1856.0, 2...",AFYDI74M267F3IU7DXCWA5UP72GA
...,...,...,...,...,...,...
170989,20365,75,0,1473283783000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFIHCGVZXRJDGSV7AKD376YWMHDA
170990,20365,1379,0,1536193194580,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFIHCGVZXRJDGSV7AKD376YWMHDA
170991,20365,1893,0,1537895803134,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFIHCGVZXRJDGSV7AKD376YWMHDA
170992,20365,3944,0,1617747229118,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 375...",AFIHCGVZXRJDGSV7AKD376YWMHDA


# Join with features

Populate the feature values for the newly created negative samples

In [11]:
# Item features

# Identifier, label and timestamp columns, plus the user-side `features`,
# are excluded; whatever remains in full_df is treated as an item feature.
id_and_label_cols = (
    args.user_col,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
)
not_item_feature_cols = (*id_and_label_cols, *features)
item_features = list(
    filter(lambda column: column not in not_item_feature_cols, full_df.columns)
)
item_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [12]:
# One row of item features per item_indice (first occurrence wins).
item_feature_lookup = full_df[["item_indice", *item_features]].drop_duplicates(
    subset=["item_indice"]
)

# Attach the item features to the negatives. Item-feature columns left over from a
# previous run of this cell are dropped first, so re-running the cell does not
# produce suffixed duplicates such as parent_asin_x / parent_asin_y.
neg_ts_df = pd.merge(
    neg_ts_df.drop(columns=item_features, errors="ignore"),
    item_feature_lookup,
    how="left",
    on=["item_indice"],
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1206,0,1505618588969,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFYDI74M267F3IU7DXCWA5UP72GA,B002I090AG,Video Games,Nintendo 3DS Aqua Blue,"[Product Description, Nintendo 3DS - Aqua Blue...","[Video Games, Legacy Systems, Nintendo Systems...",198.65
1,0,187,0,1505618621571,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFYDI74M267F3IU7DXCWA5UP72GA,B003VAK16O,Computers,Logitech Gaming Keyboard G510,[GamePanel LCDCustom-color backlighting18 full...,"[Video Games, PC, Accessories, Gaming Keyboards]",
2,0,3164,0,1527487079128,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFYDI74M267F3IU7DXCWA5UP72GA,B013OW09WY,Video Games,Skylanders SuperChargers: Drivers Big Bubble P...,[DRIVE EVIL CRAZY! Skylands is in peril! Kaos ...,"[Video Games, Legacy Systems, PlayStation Syst...",13.35
3,0,172,0,1527487171205,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 185...",AFYDI74M267F3IU7DXCWA5UP72GA,B07SM7G9CN,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49
4,0,552,0,1527487410172,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1856.0, 2...",AFYDI74M267F3IU7DXCWA5UP72GA,B000099T1Y,Video Games,Jak II - PlayStation 2,[Friendship. Betrayal. Revenge. Prepare for an...,"[Video Games, Legacy Systems, PlayStation Syst...",9.99
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,75,0,1473283783000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFIHCGVZXRJDGSV7AKD376YWMHDA,B017QU5KLK,Video Games,Overwatch - Game of the Year Edition - Xbox One,[],"[Video Games, Xbox One, Games]",20.0
170990,20365,1379,0,1536193194580,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFIHCGVZXRJDGSV7AKD376YWMHDA,B002I096AA,Video Games,Nintendo 3DS - Cosmo Black,"[Product Description, Play games in 3D without...","[Video Games, Legacy Systems, Nintendo Systems...",249.79
170991,20365,1893,0,1537895803134,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFIHCGVZXRJDGSV7AKD376YWMHDA,B074G34RY2,Computers,SteelSeries Sensei Laser Gaming Mouse [RAW] Fr...,"[Product Description, The SteelSeries Sensei R...","[Video Games, PC, Accessories, Gaming Mice]",69.99
170992,20365,3944,0,1617747229118,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 375...",AFIHCGVZXRJDGSV7AKD376YWMHDA,B001EYUW8Q,Video Games,TNA Impact! - PlayStation 2,"[Product Description, TNA Wrestling is the exc...","[Video Games, Legacy Systems, PlayStation Syst...",27.43


In [13]:
# Spot-check that the item id <-> item_indice mapping is the same in the negatives
# as in the original data. The draw is seeded so a failing item can be reproduced.
item = neg_ts_df.sample(n=1, random_state=args.random_seed)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda frame: frame[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda frame: frame[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"
assert (
    original_item_indices.iloc[0] == neg_item_indices.iloc[0]
), f"item_indice for {item} differs between original df and neg_ts_df"

[32m2024-09-25 02:28:10.460[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B00J22635I...[0m


# Concatenating positive data with negative samples

In [14]:
# Combine positives and negatives, then shuffle with a fixed seed.
# The frame is rebuilt from train_df / val_df rather than from full_df itself, so
# re-running this cell does not append the negative samples a second time.
full_df = pd.concat([train_df, val_df, neg_ts_df], axis=0).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [15]:
# Inspect the shuffled frame of positives and negatives.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AEIE6SAJAELLZA4BLSNPOYLEWDXA,B013P0HW7K,0.0,1455882952000,13174,264,Computers,Razer Orbweaver Chroma Gaming Keypad: Mechanic...,[The Razer orb Weaver is the world's first mec...,"[Video Games, PC, Accessories, Gaming Keyboards]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 688..."
65458,AEMZFONWV7KHCF2CPIVKKNCKHLLQ,B0086VPUHI,0.0,1416423128000,7749,3275,Video Games,Grand Theft Auto V: Premium Edition - Xbox One...,[The Grand Theft Auto V: Premium Edition inclu...,"[Video Games, Xbox One, Games]",15.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
79874,AFH25PNHGY6BUBZBK4OAL4QJBYGQ,B00YJJ0OQS,0.0,1468541348000,9440,2318,Video Games,Xbox One Wireless Controller [Without Bluetooth],"[Experience the unique intensity, precision an...","[Video Games, Xbox One, Accessories, Controlle...",71.77,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
102744,AGUN3QQ3WECXH6D3FYKKFI5HRNNA,B00BN5T30E,0.0,1509679302842,12165,4438,Video Games,Assassin's Creed IV Black Flag - PC,"[From the Manufacturer, Assassin's Creed, ®, I...","[Video Games, PC, Games]",43.89,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,18519,3311,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2385.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AGYL3DDXAR56G6XEAQVCSQCGZSXA,B001ELJFGE,0.0,1033148855000,9450,2212,Video Games,Army of Two - Playstation 3,"[Product description, Delivering a groundbreak...","[Video Games, Legacy Systems, PlayStation Syst...",35.9,"[920.0, 739.0, 677.0, 4093.0, 3808.0, 1745.0, ..."
46064,AEQDXMB264KG4OZUZL7NAJFZGADA,B001TOQ8NO,0.0,1466618460000,5452,3466,Video Games,Wii MotionPlus,[],"[Video Games, Legacy Systems, Nintendo Systems...",46.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3138.0, 3..."
152474,AEYD3JIWG72NBXR4BXSEZX3ZMA6Q,B06XHMPPLH,0.0,1417535860000,18134,4019,Video Games,NieR: Automata - Playstation 4,[],"[Video Games, PlayStation 4, Games]",25.93,"[506.0, 3116.0, 673.0, 1272.0, 4315.0, 1973.0,..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,6010,928,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 237..."


In [16]:
# Columns that must never be null after combining positives and negatives.
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
# Count nulls per key column so a failure names the offending columns.
null_counts = full_df[key_cols].isna().sum()
assert (
    null_counts.sum() == 0
), f"Null values found at key columns: {null_counts[null_counts.gt(0)].to_dict()}"

In [17]:
# Cut-off timestamp used in the next section to split rows back into train and validation.
val_timestamp

np.int64(1628641464793)

# Split back into train and validation sets

In [18]:
# Rows strictly before the cut-off go to train; rows at or after it go to validation.
full_timestamps = full_df[args.timestamp_col]
train_neg_df = full_df.loc[full_timestamps.lt(val_timestamp)]
val_neg_df = full_df.loc[full_timestamps.ge(val_timestamp)]

In [19]:
# Rows with timestamp < val_timestamp.
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AEIE6SAJAELLZA4BLSNPOYLEWDXA,B013P0HW7K,0.0,1455882952000,13174,264,Computers,Razer Orbweaver Chroma Gaming Keypad: Mechanic...,[The Razer orb Weaver is the world's first mec...,"[Video Games, PC, Accessories, Gaming Keyboards]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 688..."
65458,AEMZFONWV7KHCF2CPIVKKNCKHLLQ,B0086VPUHI,0.0,1416423128000,7749,3275,Video Games,Grand Theft Auto V: Premium Edition - Xbox One...,[The Grand Theft Auto V: Premium Edition inclu...,"[Video Games, Xbox One, Games]",15.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
79874,AFH25PNHGY6BUBZBK4OAL4QJBYGQ,B00YJJ0OQS,0.0,1468541348000,9440,2318,Video Games,Xbox One Wireless Controller [Without Bluetooth],"[Experience the unique intensity, precision an...","[Video Games, Xbox One, Accessories, Controlle...",71.77,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
102744,AGUN3QQ3WECXH6D3FYKKFI5HRNNA,B00BN5T30E,0.0,1509679302842,12165,4438,Video Games,Assassin's Creed IV Black Flag - PC,"[From the Manufacturer, Assassin's Creed, ®, I...","[Video Games, PC, Games]",43.89,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,18519,3311,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2385.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AGYL3DDXAR56G6XEAQVCSQCGZSXA,B001ELJFGE,0.0,1033148855000,9450,2212,Video Games,Army of Two - Playstation 3,"[Product description, Delivering a groundbreak...","[Video Games, Legacy Systems, PlayStation Syst...",35.9,"[920.0, 739.0, 677.0, 4093.0, 3808.0, 1745.0, ..."
46064,AEQDXMB264KG4OZUZL7NAJFZGADA,B001TOQ8NO,0.0,1466618460000,5452,3466,Video Games,Wii MotionPlus,[],"[Video Games, Legacy Systems, Nintendo Systems...",46.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3138.0, 3..."
152474,AEYD3JIWG72NBXR4BXSEZX3ZMA6Q,B06XHMPPLH,0.0,1417535860000,18134,4019,Video Games,NieR: Automata - Playstation 4,[],"[Video Games, PlayStation 4, Games]",25.93,"[506.0, 3116.0, 673.0, 1272.0, 4315.0, 1973.0,..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,6010,928,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 237..."


In [20]:
# Rows with timestamp >= val_timestamp.
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
46284,AG4RCXKPTC6QRORJLUSBY4SO2IAA,B001F8188Y,0.0,1640980885681,5473,2668,Video Games,Nintendo DS Lite Cobalt / Black,"[Product Description, Nintendo DS Lite Cobalt ...","[Video Games, Legacy Systems, Nintendo Systems...",117.99,"[-1, -1, 3287, 169, 2617, 2413, 3088, 3283, 28..."
69120,AHH7QSUK7NCWVYXT4DGC3WD2QMAQ,B07GP2CV85,0.0,1646754495061,8164,2501,Video Games,Metro Exodus: Day One Edition - PlayStation 4,"[Metro Exodus is an epic, story driven first p...","[Video Games, PlayStation 4, Games]",39.99,"[3105, 2926, 4020, 1750, 880, 2559, 689, 4467,..."
41666,AGMBK7EWZQWYI74JQMATQGZZRFUQ,B0012N7BSI,0.0,1628828874493,4941,3774,Video Games,Madden NFL 09 - Playstation 3,"[Product description, Madden NFL 09 delivers i...","[Video Games, Legacy Systems, PlayStation Syst...",21.06,"[-1, -1, -1, 2094, 2112, 161, 1649, 1920, 1729..."
39933,AF2L5SZ5P34FX44NEOYKLMY6WSTA,B0080CAOYM,0.0,1629274141747,4741,2067,Video Games,Call of Duty: Black Ops 2 [Download],"[Zombies, The Biggest Zombies Experience to Da...","[Video Games, PC, Games]",,"[-1, -1, -1, -1, 612, 2050, 2080, 3583, 785, 201]"
74,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,17628,297,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,"[2186, 3468, 4604, 4223, 2586, 2262, 1549, 199..."
...,...,...,...,...,...,...,...,...,...,...,...,...
532,AFUWPAK6VCGEL2OVIL2YGZNFQJZQ,B08N6NCR3Q,4.0,1642699950266,16864,76,Video Games,Thrustmaster T 16000M SPACE SIM DUO STICK (PC),[The THRUSTMASTER T.16000M FCS Space Sim Duo c...,"[Video Games, PC, Accessories, Controllers, Fl...",119.51,"[-1, -1, -1, -1, 4239, 2038, 3617, 4672, 2949,..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,6281,2502,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 4391, 2658, 2729, 3312, 3222, 3759, 1892,..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,8504,4485,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 4395, 2664, 678, 1806, 4185]"
142915,AHUA2BO4FFD3VLGEOIF4TGJYCOBQ,B01GY35GIM,0.0,1647432520927,16980,758,Video Games,Titanfall 2 - SteelBook Edition - PlayStation 4,"[I bought it about 2 weeks ago, now I have two...","[Video Games, PlayStation 4, Games]",,"[2375, 1501, 4639, 2856, 4680, 4401, 950, 4074..."


# Checks

In [21]:
# Draw one validation user (unseeded, so a different user is checked on each run).
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")

# That user's training rows in chronological order.
is_user_row = train_neg_df[args.user_col].eq(user)
check_df = train_neg_df.loc[is_user_row].sort_values(args.timestamp_col)

# Exactly half of the rows should have a rating above zero (i.e. be positives).
n_positive_rows = check_df[args.rating_col].gt(0).sum()
assert (
    n_positive_rows == check_df.shape[0] / 2
), "Number of pos and neg samples are not equal"

[32m2024-09-25 02:28:10.930[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AFRDXCCVRHZ73RSR7NKTIV2AEA5A...[0m


In [22]:
# Validation rows of the same user, in chronological order.
is_user_val_row = val_neg_df[args.user_col].eq(user)
val_check_df = val_neg_df.loc[is_user_val_row].sort_values(args.timestamp_col)

# Take the user's first positively rated validation item.
positive_val_rows = val_check_df.loc[val_check_df[args.rating_col].gt(0)]
item = positive_val_rows[args.item_col].values[0]
logger.info(f"Checking item {item}...")

# The item must occur in more than five training rows.
n_train_rows_for_item = train_neg_df[args.item_col].eq(item).sum()
assert (
    n_train_rows_for_item > 5
), f"Item {item} does not appear much in training data"

[32m2024-09-25 02:28:10.969[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B07X6KDQ98...[0m


## Random eye-ball

In [23]:
# Training rows of the sampled user, ordered by timestamp.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
62508,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B002Q0AT44,0.0,1587431053852,7420,3003,All Electronics,BLUE GLOW in DARK Xbox 360 Game Controller Sil...,[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
163049,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B06X3WTK4X,5.0,1587431053852,7420,1882,Video Games,amFilm Anti-Glare Matte Screen Protector for N...,[Introducing amFilm Anti Glare Matte Screen Pr...,"[Video Games, Nintendo Switch, Accessories, Fa...",7.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
163050,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B07BWJWBJL,5.0,1587431487268,7420,3228,Video Games,Octopath Traveler,[Eight travelers. Eight adventures. Eight role...,"[Video Games, Nintendo Switch, Games]",56.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
62509,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B0088TN5FM,0.0,1587431487268,7420,508,Video Games,Tom Clancy's Splinter Cell Blacklist(XBox 360),"[Product Description, A group of terrorists ca...","[Video Games, Legacy Systems, Xbox Systems, Xb...",6.29,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
164340,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B0BRXYDXP7,5.0,1594156273381,7420,1191,Video Games,"Satisfye - ZenGrip Pro Gen 3 OLED, a Switch Gr...",[],"[Video Games, Nintendo Switch, Accessories, Ha...",44.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
62510,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B002CZ38KA,0.0,1594156273381,7420,3514,Video Games,Heavy Rain - Greatest Hits,"[Product Description, Experience a gripping ps...","[Video Games, Legacy Systems, PlayStation Syst...",7.66,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
62511,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B001G6062E,0.0,1594156374549,7420,1219,Video Games,Prey - Xbox 360,"[From the Manufacturer, Tommy is a Cherokee ga...","[Video Games, Legacy Systems, Xbox Systems, Xb...",31.9,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 188..."
164341,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B06Y2FNRQP,5.0,1594156374549,7420,394,Video Games,Cave Story+ - Nintendo Switch,"[""Arguably the most well-known indie game of a...","[Video Games, Nintendo Switch, Games]",62.17,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 188..."
165254,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B07114C4TL,5.0,1599402559941,7420,3213,All Electronics,NES Classic Controller Extension Cable 3M / 10...,[],"[Video Games, Legacy Systems, Nintendo Systems...",12.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1882.0, 3..."
62512,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B00C1TTF86,0.0,1599402559941,7420,1235,Video Games,Battlefield 4 - Playstation 3,"[Battlefield 4, is the genre-defining action b...","[Video Games, Legacy Systems, PlayStation Syst...",15.22,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1882.0, 3..."


In [24]:
# Validation rows of the same user, ordered by timestamp.
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
186,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B07X6KDQ98,1.0,1632974125340,7420,3891,Toys & Games,Marseille mClassic Plug-and-Play Video Game Co...,[Marseille's mClassic upscaler is the world’s ...,"[Video Games, Legacy Systems, Nintendo Systems...",99.99,"[-1, -1, -1, 1882, 3228, 1191, 394, 3213, 754,..."
62515,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B000PC4FYK,0.0,1632974125340,7420,2713,Video Games,Eternal Sonata - Xbox 360,"[On his deathbed, Chopin, the famous composer,...","[Video Games, Legacy Systems, Xbox Systems, Xb...",36.43,"[-1, -1, -1, 1882, 3228, 1191, 394, 3213, 754,..."
379,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B07MYVF61Y,5.0,1638814050768,7420,3694,All Electronics,LevelHike HDMI Cable for Playstation 2 & Plays...,[],"[Video Games, Legacy Systems, PlayStation Syst...",29.99,"[-1, -1, 1882, 3228, 1191, 394, 3213, 754, 651..."
62516,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B00PIEI1DG,0.0,1638814050768,7420,4568,Video Games,Nintendo amibo Sheik (Super Smash Brothers ser...,[Sheik Supercharge your gameplay with amiibo! ...,"[Video Games, Legacy Systems, Nintendo Systems...",54.49,"[-1, -1, 1882, 3228, 1191, 394, 3213, 754, 651..."
62517,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B003EZ1KGQ,0.0,1654229552131,7420,1776,Video Games,Spec Ops: The Line - PC,"[Product Description, Spec Ops: The Line is th...","[Video Games, PC, Games]",14.69,"[-1, 1882, 3228, 1191, 394, 3213, 754, 651, 38..."
850,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B00HM3QAWK,5.0,1654229552131,7420,253,Video Games,"Hyperkin ""GN6"" Premium Controller for Genesis",[Replace your broken or lost controller with t...,"[Video Games, Legacy Systems, Sega Systems, Se...",12.99,"[-1, 1882, 3228, 1191, 394, 3213, 754, 651, 38..."
62518,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B001I75UIQ,0.0,1657504811792,7420,3828,Video Games,Infinite Undiscovery - Xbox 360,"[Mistaken for a renowned hero, Capell is thrus...","[Video Games, Legacy Systems, Xbox Systems, Xb...",21.88,"[1882, 3228, 1191, 394, 3213, 754, 651, 3891, ..."
935,AFRDXCCVRHZ73RSR7NKTIV2AEA5A,B089VKWRVP,5.0,1657504811792,7420,2977,Video Games,The Outer Worlds Expansion Pass - Nintendo Swi...,[The Outer Worlds Expansion Pass includes two ...,"[Video Games, Nintendo Switch, Games]",24.99,"[1882, 3228, 1191, 394, 3213, 754, 651, 3891, ..."


# Persist

In [25]:
# Persist the shuffled positives + negatives table; index=False leaves the row index out of the file.
full_df.to_parquet("../data/full_features_neg_sampling_df.parquet", index=False)

In [26]:
# Write each split to its own parquet file, leaving the row index out.
for split_df, out_path in (
    (train_neg_df, "../data/train_features_neg_df.parquet"),
    (val_neg_df, "../data/val_features_neg_df.parquet"),
):
    split_df.to_parquet(out_path, index=False)