# Negative sampling

More labeled data can tremendously help a model learn. In this notebook we prepare negative samples for each user from the items that user has not seen. The unseen items are sampled based on how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns in user behavior rather than being biased by popularity.

# Set up

In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import add_features_to_neg_df, generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the run name, the random seed, and the names of the user, item,
    rating and timestamp columns that the cells below read from `args`.
    """

    testing: bool = False
    run_name: str = "000-sample-rich-data"
    # Unset until `init()` runs, so the field must accept None.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self) -> "Args":
        """Resolve and create the persist directory for this run.

        Sets `notebook_persist_dp` to the absolute form of `data/<run_name>`
        (relative to the current working directory), creates that directory
        if it does not exist, and returns `self` so the call can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration from its defaults, then let `init()` resolve and
# create the persist directory; `init()` returns the same object.
args = Args()
args = args.init()

# Echo the resolved settings as indented JSON.
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sample-rich-data",
  "notebook_persist_dp": "/home/jupyter/frostmourne/reco-algo/notebooks/data/000-sample-rich-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy interactions for exercising the sampling helpers. Each tuple is
# (user index, item index, rating, timestamp), in the column order used below.
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Extend with further tuples if more cases are needed
]

# Tabular form of the tuples; rating/timestamp column names come from `args`.
df = pd.DataFrame(
    data=interactions,
    columns=[
        "user_indice",
        "item_indice",
        args.rating_col,
        args.timestamp_col,
    ],
)

In [4]:
# Display the toy interactions frame built above.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Sample negatives for the toy frame using the same explicit column arguments
# and seed as the full-data call further down, so the sampling here is seeded
# rather than left to the helper's defaults.
neg_df = generate_negative_samples(
    df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# The toy frame's timestamp column is named from args.timestamp_col, so pass
# that name instead of a hard-coded "timestamp" literal.
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# The negatives all share one rating value (the negative label), so rating
# cannot order rows within a user; sort by user and then timestamp instead.
neg_df.sort_values(["user_indice", args.timestamp_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,105,0,1
1,1,106,0,2
2,1,104,0,4
3,2,106,0,1
4,2,103,0,2
5,3,104,0,1
6,3,101,0,5


# Load data

In [7]:
# Read the train and validation feature tables from the shared data folder.
train_df, val_df = map(
    pd.read_parquet,
    ("../data/train_features.parquet", "../data/val_features.parquet"),
)

In [8]:
# Validation must start strictly after the last training interaction.
train_max_ts = train_df[args.timestamp_col].max()
assert val_df[args.timestamp_col].min() > train_max_ts
# First timestamp treated as validation when the combined frame is split again.
val_timestamp = train_max_ts + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-24 06:22:46.683[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [9]:
# Stack train on top of validation row-wise; each frame keeps its own index
# labels, so labels may repeat in the result.
full_df = pd.concat([train_df, val_df])
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,19950,2358,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,6054,2091,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,364,1662,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,1800,4192,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,1800,3603,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,3609,3765,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[3085, 650, 4695, 4243, 4067, 992, 3646, 1609,..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,7385,2174,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 3613, 604, 1230, 3026, 2596]"
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,14871,1187,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 3002, 2444, 601, 4161, 3940, 3084..."
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,18008,3094,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[4420, 4027, 1762, 3130, 2766, 4588, 2672, 141..."


In [10]:
# Extra columns handed to add_features_to_neg_df; they appear on the returned
# frame. The user id column name comes from the config rather than a literal,
# in line with how the other cells refer to it.
features = ["item_sequence", args.user_col]

# Sample negatives over all interactions, labelled 0 and seeded from the
# run configuration.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# Attach timestamp and the `features` columns to the negatives.
# NOTE(review): the exact assignment rule lives in src.negative_sampling.
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1193,0,1397005512000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGIXUKLNBPEVTBQDH5BJU4KIX4A
1,0,214,0,1397005861000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGIXUKLNBPEVTBQDH5BJU4KIX4A
2,0,3245,0,1397006180000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGIXUKLNBPEVTBQDH5BJU4KIX4A
3,0,205,0,1397006323000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 404...",AHGIXUKLNBPEVTBQDH5BJU4KIX4A
4,0,543,0,1397006532000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4044.0, 4...",AHGIXUKLNBPEVTBQDH5BJU4KIX4A
...,...,...,...,...,...,...
170989,20365,2155,0,1413437809000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ
170990,20365,3915,0,1417757814000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ
170991,20365,1490,0,1417757950000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ
170992,20365,2441,0,1417758441000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 184...",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ


# Join with features

Populate the feature values for the newly created negative samples

In [11]:
# Item features

# Columns of full_df that are identifiers, the label, the timestamp, or one of
# the `features` columns; every remaining column is taken as an item feature.
not_item_feature_cols = (
    args.user_col,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    *features,
)
all_columns = full_df.columns
# Mask out the excluded names while keeping the original column order.
is_excluded = all_columns.isin(list(not_item_feature_cols))
item_features = all_columns[~is_excluded].tolist()
item_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [12]:
# One row per item_indice carrying that item's feature columns.
item_lookup = full_df[["item_indice", *item_features]].drop_duplicates(
    subset=["item_indice"]
)
# Left-join so every negative row is kept; validate enforces that the lookup
# side has at most one row per item_indice.
neg_ts_df = neg_ts_df.merge(
    item_lookup,
    how="left",
    on=["item_indice"],
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1193,0,1397005512000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGIXUKLNBPEVTBQDH5BJU4KIX4A,B01N3ASPNV,All Electronics,amFilm Tempered Glass Screen Protector for Nin...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",8.91
1,0,214,0,1397005861000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGIXUKLNBPEVTBQDH5BJU4KIX4A,B071HVWD55,Computers,"UHURU Wireless Gaming Mouse Up to 10000 DPI, R...",[],"[Video Games, PC, Accessories, Gaming Mice]",19.19
2,0,3245,0,1397006180000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHGIXUKLNBPEVTBQDH5BJU4KIX4A,B0002XL3BA,Video Games,God of War - PlayStation 2,[Unleash the power of the Gods and embark on a...,"[Video Games, Legacy Systems, PlayStation Syst...",17.98
3,0,205,0,1397006323000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 404...",AHGIXUKLNBPEVTBQDH5BJU4KIX4A,B0166QDJDQ,Video Games,Far Cry Primal - PlayStation 4 Standard Edition,"[Welcome to the Stone Age, a time of extreme d...","[Video Games, PlayStation 4, Games]",17.79
4,0,543,0,1397006532000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4044.0, 4...",AHGIXUKLNBPEVTBQDH5BJU4KIX4A,B0055202HE,Video Games,UNCHARTED Greatest Hits Dual Pack - Playstation 3,[Take an adventure with fortune hunter Nathan ...,"[Video Games, Legacy Systems, PlayStation Syst...",27.3
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,2155,0,1413437809000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ,B00004SVXW,Video Games,The Legend of Zelda,"[Product Description, Ganon, the King of Evil,...","[Video Games, Legacy Systems, Nintendo Systems...",53.25
170990,20365,3915,0,1417757814000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ,B001G7PRS6,Video Games,Alpha Protocol - Playstation 3,"[Product Description, The year is 2009. Hundre...","[Video Games, Legacy Systems, PlayStation Syst...",35.0
170991,20365,1490,0,1417757950000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ,B00000JRSB,Video Games,Final Fantasy VII - PlayStation,"[Amazon.com, Long recognized as role-playing g...","[Video Games, Legacy Systems, PlayStation Syst...",33.99
170992,20365,2441,0,1417758441000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 184...",AHOZTLYXJAA7WJ66OQ6HD4GT5OAQ,B001ELJDXE,Video Games,Ghostbusters: The Video Game - Playstation 3,"[Product Description, When Manhattan is once a...","[Video Games, Legacy Systems, PlayStation Syst...",38.69


In [13]:
# Spot-check one item drawn from the negatives: its args.item_col value must
# map to exactly one item_indice in the negatives and in the original
# interactions, and the two must agree. The draw is seeded from the run
# configuration so the same item is checked on every run.
item = neg_ts_df.sample(n=1, random_state=args.random_seed)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda frame: frame[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda frame: frame[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"
assert (
    original_item_indices.iloc[0] == neg_item_indices.iloc[0]
), f"item_indice for {item} differs between original df and neg_ts_df"

[32m2024-09-24 06:23:14.144[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B003P9VWPA...[0m


# Concatenating positive data with negative samples

In [14]:
# Append the negatives to the positives, then shuffle every row with the
# configured seed. This overwrites `full_df`, so running the cell a second
# time would append the negatives again.
positives_and_negatives = pd.concat([full_df, neg_ts_df])
full_df = positives_and_negatives.sample(
    frac=1,
    replace=False,
    random_state=args.random_seed,
)

In [15]:
# Display the shuffled frame of positives plus negatives.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AGHM4EAPVTF3AT2KYZXOK6HKNOXA,B0001ZZNME,0.0,1407432084000,13171,2286,Video Games,The Legend of Zelda - Classic NES Series,"[From the Manufacturer, Embark on a quest to f...","[Video Games, Legacy Systems, Nintendo Systems...",52.49,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
65458,AHA4MXZFGXAJ5FG734SZJOSR2U3Q,B00TEDK8FQ,0.0,1471888794000,7740,2636,Video Games,Ortz PS4 Vertical Stand with Cooling Fan [Keep...,[],"[Video Games, PlayStation 4, Accessories, Case...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 331..."
79874,AGHEHQXPA2XCTMDO2LIXNU3SEOBA,B00004U5VI,0.0,1489203706000,9408,306,Video Games,Gauntlet: Dark Legacy,[],"[Video Games, Legacy Systems, PlayStation Syst...",54.6,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 327..."
102744,AE6DGY6HUKOK7DFMO27RZBQYYLCA,B01LP5V5CU,0.0,1321017752000,12163,1237,Video Games,Controller Gear Officially Licensed Gears of W...,[Officially Licensed and Exclusively Designed ...,"[Video Games, Xbox One, Accessories, Controlle...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2137.0, 4..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,13360,1830,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2826.0, 2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AGA2NNJEVTFU5RPOJEFUEE2D7K2Q,B00W435BL4,0.0,1531255931610,9421,686,Video Games,Madden NFL 16 - PlayStation 3,[Be The Playmaker with Madden NFL 16],"[Video Games, Legacy Systems, PlayStation Syst...",35.03,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 422..."
46064,AHF35NTC74W2QLLY442BYFZV2NZQ,B00BN5T30E,0.0,1508829250207,5469,1145,Video Games,Assassin's Creed IV Black Flag - PC,"[From the Manufacturer, Assassin's Creed, ®, I...","[Video Games, PC, Games]",43.89,"[3745.0, 3352.0, 3441.0, 3951.0, 3851.0, 1451...."
152474,AFEB7PR5ZKA7OLF2YBVAZJTS6VFQ,B07D36WX84,0.0,1593471355138,18180,2056,All Electronics,FastSnail Controller Charger Compatible with N...,[],"[Video Games, Legacy Systems, Nintendo Systems...",19.99,"[1472.0, 3445.0, 4295.0, 2897.0, 1058.0, 610.0..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,1452,6,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 316..."


In [16]:
# Identifier, sequence, label and timestamp columns that must be fully populated.
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
# Every cell in those columns has to be non-null.
all_keys_present = full_df[key_cols].notna().all().all()
assert all_keys_present, "Null values found at key colums"

In [17]:
# Show the cutoff: rows with a timestamp at or after this value go to validation.
val_timestamp

np.int64(1628641464793)

# Split back into train and validation sets

In [18]:
# Partition on the cutoff: strictly earlier rows are train, the rest validation.
row_timestamps = full_df[args.timestamp_col]
train_neg_df = full_df[row_timestamps < val_timestamp]
val_neg_df = full_df[row_timestamps >= val_timestamp]

In [19]:
# Display the training split (positives and negatives before the cutoff).
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AGHM4EAPVTF3AT2KYZXOK6HKNOXA,B0001ZZNME,0.0,1407432084000,13171,2286,Video Games,The Legend of Zelda - Classic NES Series,"[From the Manufacturer, Embark on a quest to f...","[Video Games, Legacy Systems, Nintendo Systems...",52.49,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
65458,AHA4MXZFGXAJ5FG734SZJOSR2U3Q,B00TEDK8FQ,0.0,1471888794000,7740,2636,Video Games,Ortz PS4 Vertical Stand with Cooling Fan [Keep...,[],"[Video Games, PlayStation 4, Accessories, Case...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 331..."
79874,AGHEHQXPA2XCTMDO2LIXNU3SEOBA,B00004U5VI,0.0,1489203706000,9408,306,Video Games,Gauntlet: Dark Legacy,[],"[Video Games, Legacy Systems, PlayStation Syst...",54.6,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 327..."
102744,AE6DGY6HUKOK7DFMO27RZBQYYLCA,B01LP5V5CU,0.0,1321017752000,12163,1237,Video Games,Controller Gear Officially Licensed Gears of W...,[Officially Licensed and Exclusively Designed ...,"[Video Games, Xbox One, Accessories, Controlle...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2137.0, 4..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,13360,1830,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2826.0, 2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AGA2NNJEVTFU5RPOJEFUEE2D7K2Q,B00W435BL4,0.0,1531255931610,9421,686,Video Games,Madden NFL 16 - PlayStation 3,[Be The Playmaker with Madden NFL 16],"[Video Games, Legacy Systems, PlayStation Syst...",35.03,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 422..."
46064,AHF35NTC74W2QLLY442BYFZV2NZQ,B00BN5T30E,0.0,1508829250207,5469,1145,Video Games,Assassin's Creed IV Black Flag - PC,"[From the Manufacturer, Assassin's Creed, ®, I...","[Video Games, PC, Games]",43.89,"[3745.0, 3352.0, 3441.0, 3951.0, 3851.0, 1451...."
152474,AFEB7PR5ZKA7OLF2YBVAZJTS6VFQ,B07D36WX84,0.0,1593471355138,18180,2056,All Electronics,FastSnail Controller Charger Compatible with N...,[],"[Video Games, Legacy Systems, Nintendo Systems...",19.99,"[1472.0, 3445.0, 4295.0, 2897.0, 1058.0, 610.0..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,1452,6,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 316..."


In [20]:
# Display the validation split (positives and negatives at or after the cutoff).
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
108713,AGXTHABHPC3XO4VAMCFM2TQR3GFQ,B00TEDK8FQ,0.0,1643101921864,12853,2636,Video Games,Ortz PS4 Vertical Stand with Cooling Fan [Keep...,[],"[Video Games, PlayStation 4, Accessories, Case...",,"[111, 3920, 3879, 3261, 3402, 1230, 2239, 3974..."
1248,AESD4RLWUKM6JTD6SNNWYLHLLQQA,B07BMRGKX2,0.0,1653590691326,141,253,Video Games,Agony - PlayStation 4,"[Agony is a first-person, survival horror game...","[Video Games, PlayStation 4, Games]",28.0,"[99, 4672, 4434, 1551, 1561, 2497, 3615, 3196,..."
117972,AEXFEQ7QOP6EHDEZ3K6NN27MQ7KA,B0774N9JKW,0.0,1651718479413,13969,1217,Video Games,Sword Art Online: Hollow Realization - PlaySta...,"[""Link start"" into SWORD ART ONLINE -Hollow Re...","[Video Games, PlayStation 4, Games]",18.11,"[1016, 3999, 2944, 742, 3161, 3580, 2267, 3623..."
156090,AGDAPPCYV472FOUKDGAHZRW766GA,B07B416X7V,0.0,1649310595659,18594,3519,Video Games,Burnout Paradise Remastered - Xbox One [Digita...,[Make action your middle name as you rule the ...,"[Video Games, Xbox One, Games]",,"[3097, 2497, 3203, 3937, 3803, 2323, 2310, 751..."
42283,AG2KBJG5DMEIISPJVF3OVMRB4ALA,B001D8Q5MA,0.0,1636861380056,5042,3122,Video Games,Grand Theft Auto IV [Online Game Code],"[From the Manufacturer, What does the American...","[Video Games, PC]",,"[-1, -1, -1, -1, -1, 3602, 22, 1193, 2486, 1293]"
...,...,...,...,...,...,...,...,...,...,...,...,...
668,AGVBR47VU2BE4EVWFAXOO26SOWSA,B0C39GFK7P,1.0,1647361584062,388,4128,Computers,"Logitech G640 Large Cloth Gaming Mouse Pad, Op...",[The cloth surface of G640 provides ideal surf...,"[Video Games, PC, Accessories, Gaming Mice]",29.99,"[1938, 2662, 262, 3903, 3610, 1896, 1372, 4160..."
532,AFUWPAK6VCGEL2OVIL2YGZNFQJZQ,B08N6NCR3Q,4.0,1642699950266,4205,3269,Video Games,Thrustmaster T 16000M SPACE SIM DUO STICK (PC),[The THRUSTMASTER T.16000M FCS Space Sim Duo c...,"[Video Games, PC, Accessories, Controllers, Fl...",119.51,"[-1, -1, -1, -1, 1058, 3558, 377, 1187, 2169, ..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,20004,4190,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 832, 3126, 1490, 4335, 2035, 1270, 605, 3..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,4984,2257,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 4269, 4366, 396, 3060, 464]"


# Checks

In [21]:
# Pick one validation user and verify that, in the training split, exactly half
# of that user's rows are positives (rating > 0). The draw is seeded from the
# run configuration so this check, and later cells that reuse `user`, examine
# the same user on every run.
user = val_neg_df.sample(n=1, random_state=args.random_seed)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda frame: frame[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2
), "Number of pos and neg samples are not equal"

[32m2024-09-24 06:23:14.579[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AFOVDID7KNOS7JMVTFQTC7YBQ7MA...[0m


In [22]:
# Validation rows of the user chosen above, in chronological order.
user_val_rows = val_neg_df[val_neg_df[args.user_col] == user]
val_check_df = user_val_rows.sort_values(args.timestamp_col)

# Take the first positively rated item among those rows.
positive_val_rows = val_check_df[val_check_df[args.rating_col] > 0]
item = positive_val_rows[args.item_col].values[0]
logger.info(f"Checking item {item}...")

# That item should occur in more than 5 rows of the training split.
item_train_count = train_neg_df[args.item_col].eq(item).sum()
assert item_train_count > 5, f"Item {item} does not appear much in training data"

[32m2024-09-24 06:23:14.618[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B09JY72CNG...[0m


## Random eye-ball

In [23]:
# Training rows of the checked user, sorted by timestamp.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
29293,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B005OGPTSS,0.0,1452895167000,3540,1067,Video Games,Red Dead Redemption: Game of the Year Edition ...,"[Red Dead Redemption is a Western epic, set at...","[Video Games, Legacy Systems, Xbox Systems, Xb...",28.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
94898,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B07BM92KT4,5.0,1452895167000,3540,3806,Computers,zelotes 5500 DPI 7 Button LED Optical USB Wire...,"[Oerview:, This is a new 5500 DPI 7 Button USB...","[Video Games, PC, Accessories, Gaming Mice]",11.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
94899,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B0086VPUHI,5.0,1452895223000,3540,3627,Video Games,Grand Theft Auto V: Premium Edition - Xbox One...,[The Grand Theft Auto V: Premium Edition inclu...,"[Video Games, Xbox One, Games]",15.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
29294,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B002B1TDV8,0.0,1452895223000,3540,4498,Video Games,Scribblenauts - Nintendo DS,"[Product Description, Scribblenauts is a compl...","[Video Games, Legacy Systems, Nintendo Systems...",29.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
94900,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B018HVSPLY,5.0,1452895324000,3540,3079,Video Games,Steam Gift Card - $50,[Steam 50 dollar Wallet Card. Instant Access t...,"[Video Games, PC]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
29295,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B005GISQX4,0.0,1452895324000,3540,1936,Video Games,Borderlands 2 - 4 Pack [Download],"[Borderlands 2, is a hybrid First-Person Shoot...","[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
29296,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B014R4KYMS,0.0,1539194243844,3540,3920,Video Games,Uncharted 4: A Thief's End - PlayStation 4,[Uncharted comes to the PlayStation 4.Uncharte...,"[Video Games, PlayStation 4, Games]",24.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 380..."
145422,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B074FJYPJB,5.0,1539194243844,3540,2218,Video Games,$20 Battle.net Store Gift Card Balance - Blizz...,[Pay for games and services directly through B...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 380..."
157070,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B06ZXXH1TT,5.0,1568754729827,3540,3688,Video Games,World of Warcraft (Battle Chest Box) - PC/Mac ...,[Descend into the World of Warcraft and join t...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3806.0, 3..."
29297,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B01GY35UK6,0.0,1568754729827,3540,2075,Video Games,Resident Evil 7: Biohazard - PlayStation 4,[Set within a sinister plantation mansion in m...,"[Video Games, PlayStation 4, Games]",17.57,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3806.0, 3..."


In [24]:
# Validation rows of the same user, sorted by timestamp.
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
146,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B09JY72CNG,5.0,1632083048217,3540,632,Computers,Razer Goliathus Extended Chroma Gaming Mouse P...,[The Razer Goliathus extended Chroma soft gami...,"[Video Games, PC, Accessories, Gaming Mice]",59.99,"[-1, -1, -1, -1, -1, 3806, 3627, 3079, 2218, 3..."
29298,AFOVDID7KNOS7JMVTFQTC7YBQ7MA,B0009XEC02,0.0,1632083048217,3540,3693,Video Games,Advance Wars: Dual Strike - Nintendo DS,[Advance Wars: Dual Strike is the latest in th...,"[Video Games, Legacy Systems, Nintendo Systems...",84.99,"[-1, -1, -1, -1, -1, 3806, 3627, 3079, 2218, 3..."


# Persist

In [25]:
# Write the combined positives and negatives frame; index labels are not stored.
full_df.to_parquet(
    path="../data/full_features_neg_sampling_df.parquet",
    index=False,
)

In [26]:
# Write the train and validation splits, in that order, without index labels.
for split_df, split_path in (
    (train_neg_df, "../data/train_features_neg_df.parquet"),
    (val_neg_df, "../data/val_features_neg_df.parquet"),
):
    split_df.to_parquet(split_path, index=False)