# Negative sampling

More labeled data can tremendously help a model learn. In this notebook we prepare negative samples for each user from the items that user has not interacted with. The unseen items are sampled based on how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns in user behavior rather than being biased by popularity.

# Set up

In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import add_features_to_neg_df, generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    """Run-level configuration for this notebook.

    Holds the run name, the random seed, and the column names used to address
    the interaction data in the cells below.
    """

    testing: bool = False
    run_name: str = "000-sample-rich-data"
    # Absolute path of the per-run output directory. It stays None until init()
    # resolves it, so the annotation has to allow None.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self) -> "Args":
        """Resolve and create the per-run persist directory.

        The directory is ``data/<run_name>`` relative to the current working
        directory; it is created when missing.

        Returns:
            The same instance, so the call can be chained after construction.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sample-rich-data",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/000-sample-rich-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy interactions used to smoke-test the sampling helpers.
# Each record is (user index, item index, rating, timestamp).
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Tabular view of the records; rating/timestamp column names follow `args`.
df = pd.DataFrame.from_records(
    interactions,
    columns=["user_indice", "item_indice", args.rating_col, args.timestamp_col],
)

In [4]:
# Display the toy interactions frame for a quick visual check.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Smoke-test the helpers on the toy frame.
# Column names are passed explicitly (same argument order as the full-data call
# further down) so the call keeps working if the names in `args` change, and the
# seed is pinned so the toy output is reproducible across runs.
neg_df = generate_negative_samples(
    df,
    "user_indice",
    "item_indice",
    args.rating_col,
    seed=args.random_seed,
)
# The toy frame's timestamp column is named from `args`, so read it the same way.
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Show the toy negatives ordered by user index, then by the rating column.
neg_df.sort_values(by=["user_indice", args.rating_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,103,0,1
4,2,102,0,2
5,3,101,0,1
6,3,104,0,5


# Load data

In [7]:
# Read the train and validation feature tables from the shared data directory.
train_df = pd.read_parquet(path="../data/train_features.parquet")
val_df = pd.read_parquet(path="../data/val_features.parquet")

In [8]:
# The validation set must start strictly after the last training interaction.
assert val_df[args.timestamp_col].min() > train_df[args.timestamp_col].max()
# First timestamp that counts as validation; used later to split the data back.
val_timestamp = 1 + train_df[args.timestamp_col].max()
logger.info(f"{val_timestamp=}")

[32m2024-09-21 23:42:31.204[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [9]:
# Stack train on top of validation row-wise; original index labels are kept.
full_df = pd.concat(objs=[train_df, val_df], axis="index")
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,10907,1154,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,1383,3059,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,2698,4015,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,8899,2873,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,8899,3963,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,12287,3316,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[3447, 928, 3817, 2783, 3978, 3042, 2199, 2717..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,5263,4030,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 826, 4046, 2798, 512, 3847]"
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,1380,32,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 2092, 3505, 2594, 4336, 4023, 272..."
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,20279,1722,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[3827, 759, 2376, 3337, 2196, 1980, 2, 2266, 4..."


In [12]:
# Columns that add_features_to_neg_df is asked to carry over onto the negatives.
# The user id column name is taken from `args` so it stays consistent with the
# other cells instead of being hard-coded.
features = ["item_sequence", args.user_col]

# NOTE(review): negatives are sampled from train + validation combined, while the
# intro describes sampling by frequency in the training data — confirm intended.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# Attach the timestamp and the `features` columns to the sampled negatives.
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1203,0,1357787828000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ
1,0,239,0,1358211486000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ
2,0,3119,0,1358211645000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ
3,0,229,0,1358212487000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 231...",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ
4,0,589,0,1358212490000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2314.0, 1...",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ
...,...,...,...,...,...,...
170989,20365,1522,0,1529802080593,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA
170990,20365,4555,0,1529802176152,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA
170991,20365,106,0,1529802191746,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 138...",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA
170992,20365,2131,0,1529802461853,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 138.0, 23...",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA


# Join with features

Populate the feature values for the newly created negative samples

In [13]:
# Item features

# Columns that describe the user, the interaction or the already-attached
# features; every remaining column of full_df is treated as an item feature.
not_item_feature_cols = (
    args.user_col,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
) + tuple(features)

item_features = []
for column_name in full_df.columns:
    if column_name in not_item_feature_cols:
        continue
    item_features.append(column_name)
item_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [14]:
# Attach item-level features to every negative sample. The right-hand side keeps
# one row per item_indice, and validate="m:1" makes pandas enforce that.
neg_ts_df = neg_ts_df.merge(
    full_df[["item_indice", *item_features]].drop_duplicates(subset=["item_indice"]),
    on=["item_indice"],
    how="left",
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1203,0,1357787828000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ,B00JWSJ6G0,Video Games,Wii Motion Plus - Black (Bulk Packaging),[Simple. Intuitive. Easy to use. Connect the W...,"[Video Games, Legacy Systems, Nintendo Systems...",16.99
1,0,239,0,1358211486000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ,B000TVT8PI,Video Games,PlayStation 3 80GB MotorStorm Bundle,[As DVD playback made the PlayStation 2 more t...,"[Video Games, Legacy Systems, PlayStation Syst...",589.99
2,0,3119,0,1358211645000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ,B00FQDEKM0,,Retro-Bit Super RetroTRIO Console NES/SNES/Gen...,[Get a nostalgic classic gaming fix with the S...,"[Video Games, Legacy Systems, Sega Systems, Se...",
3,0,229,0,1358212487000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 231...",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ,B07YBX7Y3P,Video Games,Fallout 76 - Xbox One Tricentennial Edition,[Fallout 76 - Tricentennial Edition for Xbox O...,"[Video Games, Xbox One, Games]",14.98
4,0,589,0,1358212490000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2314.0, 1...",AEAUN4PWPMPF3LUGFTXYFAGUSGBQ,B0051D8QCA,Video Games,NBA 2K12 (Covers May Vary),"[Product Description, With over 5 million copi...","[Video Games, Legacy Systems, Xbox Systems, Xb...",21.77
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,1522,0,1529802080593,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA,B001EYUQDW,Video Games,Borderlands Ultimate Edition - PS3 [Digital Code],"[Product Description, Gearbox Software has dev...","[Video Games, Legacy Systems, PlayStation Syst...",
170990,20365,4555,0,1529802176152,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA,B08VFQ3XJX,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0
170991,20365,106,0,1529802191746,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 138...",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA,B01L1Y0RZQ,Video Games,Xbox One S 500GB Console - Minecraft Bundle [D...,[The Xbox One S Minecraft Favorites Bundle (50...,"[Video Games, Xbox One, Consoles]",214.99
170992,20365,2131,0,1529802461853,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 138.0, 23...",AGPMORSNV7AP6BTRDJ6CQ2EKAMSA,B08392WSFZ,Computers,TNP Protective Case Compatible with Nintendo N...,[],"[Video Games, Legacy Systems, Nintendo Systems...",13.99


In [15]:
# Spot-check one randomly drawn item: its item_col value must map to exactly one
# item_indice in the negatives, and to the same one in the original data.
item = neg_ts_df.sample(n=1)[args.item_col].iloc[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")

neg_item_indices = neg_ts_df.loc[neg_ts_df[args.item_col] == item, "item_indice"]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"

original_item_indices = full_df.loc[full_df[args.item_col] == item, "item_indice"]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"

assert neg_item_indices.iloc[0] == original_item_indices.iloc[0]

[32m2024-09-21 23:45:42.368[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B09JSFLW8V...[0m


# Concatenating positive data with negative samples

In [16]:
# Append the negatives to the positives, then shuffle all rows with a fixed seed.
# Note: full_df is overwritten, so re-running this cell appends the negatives again.
full_df = pd.concat([full_df, neg_ts_df], axis=0)
full_df = full_df.sample(frac=1, replace=False, random_state=args.random_seed)

In [17]:
# Display the shuffled frame that now holds both positives and sampled negatives.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AFFJSIHCRAXI3J3NRBLGKNNCR2RA,B00CXTX2YW,0.0,1515421489907,13258,528,Video Games,Xbox 360 Wireless Controller - Camouflage,[Ambush your opponents with the Xbox 360 Speci...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",40.99,"[4148.0, 400.0, 791.0, 3069.0, 3062.0, 1152.0,..."
65458,AFVONP56RLBX43PFSSG3YCXNLKUQ,B00267S2A0,0.0,1504677642111,7725,4233,Video Games,Call of Duty: Modern Warfare 2 - Playstation 3,"[Product Description, On November 10, 2009, ac...","[Video Games, Legacy Systems, PlayStation Syst...",23.88,"[-1.0, -1.0, -1.0, -1.0, 742.0, 4535.0, 1985.0..."
79874,AFCB6BTUBDB4OFJWXPOITK44EZJA,B01A5BEBX0,0.0,1577800282838,9434,1424,Computers,PECHAM Vertical Stand for PS4 Slim / PS4 with ...,[],"[Video Games, PlayStation 4, Accessories, Cool...",,"[731.0, 3617.0, 2522.0, 393.0, 592.0, 362.0, 2..."
102744,AHNKYFBDA2RBJBABIDP4LBP2W4EQ,B004MPR0ZC,0.0,1397353145000,12208,1919,Video Games,CTA Digital Nintendo 3Ds Cartridge Storage Sol...,"[Product Description, The Nintendo 3DS is the ...","[Video Games, Legacy Systems, Nintendo Systems...",,"[1286.0, 2124.0, 1518.0, 941.0, 1668.0, 1659.0..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,17057,4555,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 108.0, 24..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AGEBVJUFAAPDBFKJ2DIJGCDDSAEQ,B0036F0V4G,0.0,1452044872000,9445,414,Video Games,Metal Gear Rising Revengeance - Xbox 360,"[Product Description, Product Overview, METAL ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",26.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2702.0, 6..."
46064,AF6ROEEZD4VJPH6D6JFRRC6UZ3LQ,B073232J24,0.0,1282760896000,5419,3449,Video Games,NHL 18 - PlayStation 4,"[EA SPORTS NHL 18 delivers the speed, creativi...","[Video Games, PlayStation 4, Games]",11.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
152474,AG27GY5VFWMIOQLBQY5C6R6FD4JQ,B000A2R54M,0.0,1468348193000,18174,30,Video Games,Mario Kart DS,"[Amazon.com, The ultimate, Mario Kart, race is...","[Video Games, Kids and Family]",37.51,"[-1.0, -1.0, -1.0, 3739.0, 3171.0, 2109.0, 322..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,10245,4603,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 391..."


In [18]:
# Columns that must be fully populated on every row, positives and negatives alike.
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
# Fail if any key column contains a missing value.
assert not full_df[key_cols].isna().any().any(), "Null values found at key colums"

In [19]:
# Echo the cutoff timestamp that the train/validation split below relies on.
val_timestamp

np.int64(1628641464793)

# Split back into train and validation sets

In [20]:
# Rows strictly before the cutoff go to train; rows at or after it go to validation.
train_neg_df = full_df[full_df[args.timestamp_col] < val_timestamp]
val_neg_df = full_df[full_df[args.timestamp_col] >= val_timestamp]

In [21]:
# Display the training split (rows with a timestamp before val_timestamp).
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AFFJSIHCRAXI3J3NRBLGKNNCR2RA,B00CXTX2YW,0.0,1515421489907,13258,528,Video Games,Xbox 360 Wireless Controller - Camouflage,[Ambush your opponents with the Xbox 360 Speci...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",40.99,"[4148.0, 400.0, 791.0, 3069.0, 3062.0, 1152.0,..."
65458,AFVONP56RLBX43PFSSG3YCXNLKUQ,B00267S2A0,0.0,1504677642111,7725,4233,Video Games,Call of Duty: Modern Warfare 2 - Playstation 3,"[Product Description, On November 10, 2009, ac...","[Video Games, Legacy Systems, PlayStation Syst...",23.88,"[-1.0, -1.0, -1.0, -1.0, 742.0, 4535.0, 1985.0..."
79874,AFCB6BTUBDB4OFJWXPOITK44EZJA,B01A5BEBX0,0.0,1577800282838,9434,1424,Computers,PECHAM Vertical Stand for PS4 Slim / PS4 with ...,[],"[Video Games, PlayStation 4, Accessories, Cool...",,"[731.0, 3617.0, 2522.0, 393.0, 592.0, 362.0, 2..."
102744,AHNKYFBDA2RBJBABIDP4LBP2W4EQ,B004MPR0ZC,0.0,1397353145000,12208,1919,Video Games,CTA Digital Nintendo 3Ds Cartridge Storage Sol...,"[Product Description, The Nintendo 3DS is the ...","[Video Games, Legacy Systems, Nintendo Systems...",,"[1286.0, 2124.0, 1518.0, 941.0, 1668.0, 1659.0..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,17057,4555,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 108.0, 24..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AGEBVJUFAAPDBFKJ2DIJGCDDSAEQ,B0036F0V4G,0.0,1452044872000,9445,414,Video Games,Metal Gear Rising Revengeance - Xbox 360,"[Product Description, Product Overview, METAL ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",26.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2702.0, 6..."
46064,AF6ROEEZD4VJPH6D6JFRRC6UZ3LQ,B073232J24,0.0,1282760896000,5419,3449,Video Games,NHL 18 - PlayStation 4,"[EA SPORTS NHL 18 delivers the speed, creativi...","[Video Games, PlayStation 4, Games]",11.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
152474,AG27GY5VFWMIOQLBQY5C6R6FD4JQ,B000A2R54M,0.0,1468348193000,18174,30,Video Games,Mario Kart DS,"[Amazon.com, The ultimate, Mario Kart, race is...","[Video Games, Kids and Family]",37.51,"[-1.0, -1.0, -1.0, 3739.0, 3171.0, 2109.0, 322..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,10245,4603,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 391..."


In [22]:
# Display the validation split (rows with a timestamp at or after val_timestamp).
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
98196,AHO3T4INZKH5C7IIV5JERA3H2SKQ,B01GY35QPU,0.0,1638057548682,11638,1902,Video Games,Detroit Become Human - PlayStation 4,[Detroit: Become Human is the latest title in ...,"[Video Games, PlayStation 4, Games]",21.4,"[3579, 878, 434, 1579, 3669, 3322, 3495, 2339,..."
124472,AEMYS2WV33NDV3OAJFOM7SSSIDTA,B0088TN7BO,0.0,1640914537652,14788,2180,Video Games,LEGO Lord of the Rings - Nintendo 3DS,"[Product Description, Based on The Lord of the...","[Video Games, Legacy Systems, Nintendo Systems...",19.95,"[-1, -1, -1, -1, -1, 3244, 2872, 2621, 3048, 3..."
166044,AEXKGQQMYQQUNWVGD66TG3VT4V4A,B00IPTUJ8G,0.0,1653120417877,19789,4619,Video Games,Borderlands 2,"[Borderlands 2 for PS Vita, View Larger, View ...","[Video Games, Legacy Systems, PlayStation Syst...",48.1,"[-1, -1, -1, -1, -1, 2358, 3142, 4521, 2292, 2..."
54221,AGQAPJGNR3IDAJCPD2YPU7CRRPBA,B00CMQTVK0,0.0,1643392366551,6414,1765,Video Games,Xbox One with Kinect (Day One Edition),"[Get more with Xbox One., Introducing Xbox One...","[Video Games, Xbox One, Consoles]",589.99,"[-1, -1, -1, -1, 3142, 4274, 4345, 809, 416, 1..."
10185,AGIJWTPKBANKWNEM2AHK7PWTBYLQ,B01BF9X9VQ,0.0,1647513889874,1195,2412,Video Games,World of Warcraft: Legion - Standard Edition -...,"[Kingdoms will burn, The Burning Legion surges...","[Video Games, PC, Games]",28.6,"[170, 3609, 1588, 1467, 3786, 1627, 4234, 3056..."
...,...,...,...,...,...,...,...,...,...,...,...,...
124193,AF5T2J7T33UFSPUGCKTPTGU7EY7A,B008I2LRMC,0.0,1654892101229,14749,1743,Video Games,F1: 2012,"[Product Description, F1 2012 is designed to b...","[Video Games, Legacy Systems, Xbox Systems, Xb...",33.03,"[601, 1842, 3391, 2640, 761, 3624, 1079, 2656,..."
69557,AGXQLR7TWHVUQLDBKKOBI4OQQGRQ,B01N3ASPNV,0.0,1637285457188,8208,3069,All Electronics,amFilm Tempered Glass Screen Protector for Nin...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",8.91,"[-1, -1, 4227, 1039, 1352, 2619, 2655, 1305, 1..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,9221,2581,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 2705, 3691, 1804, 111, 3330, 1775, 2979, ..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,4173,3191,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 2379, 1799, 1932, 1568, 161]"


# Checks

In [23]:
# Pick one random validation user and check that, in the training split, exactly
# half of that user's rows are positives (rating > 0).
user = val_neg_df.sample(n=1)[args.user_col].iloc[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df[train_neg_df[args.user_col] == user].sort_values(
    by=args.timestamp_col
)
assert (
    2 * (check_df[args.rating_col] > 0).sum() == len(check_df)
), "Number of pos and neg samples are not equal"

[32m2024-09-21 23:46:40.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AEPVV7PDGK2R5SDFGFKK7WP4SZCQ...[0m


In [24]:
# Take the same user's validation rows, pick the first positively rated item and
# check that this item occurs more than 5 times in the training split.
val_check_df = val_neg_df[val_neg_df[args.user_col] == user].sort_values(
    by=args.timestamp_col
)
item = val_check_df.loc[val_check_df[args.rating_col] > 0, args.item_col].iloc[0]
logger.info(f"Checking item {item}...")
assert (
    (train_neg_df[args.item_col] == item).sum() > 5
), f"Item {item} does not appear much in training data"

[32m2024-09-21 23:46:41.786[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B08BY9XN49...[0m


## Random eye-ball

In [25]:
# Training rows of the sampled user, ordered by timestamp, for manual inspection.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
68723,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B0013RATNM,0.0,1482012841000,8119,587,Video Games,Just Cause 2 - Xbox 360,"[Product Description, Just Cause 2 is a stunni...","[Video Games, Legacy Systems, Xbox Systems, Xb...",33.7,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
115323,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B00RU75I2G,5.0,1482012841000,8119,1974,Video Games,DMC Devil May Cry: Definitive Edition - Xbox One,"[Developed by Ninja Theory, DmC Devil May Cry:...","[Video Games, Xbox One, Games]",22.94,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
68724,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B001ELJDXE,0.0,1483195688000,8119,1968,Video Games,Ghostbusters: The Video Game - Playstation 3,"[Product Description, When Manhattan is once a...","[Video Games, Legacy Systems, PlayStation Syst...",38.69,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
116241,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B00OGNV5HY,5.0,1483195688000,8119,3299,Video Games,Shadow Warrior - Xbox One,[Shadow Warrior tells the offbeat tale of Zill...,"[Video Games, Xbox One, Games]",13.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
118609,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B07X649B77,5.0,1486158091000,8119,4106,Video Games,WB Games Mad Max - Playstation 4,[Become the legendary lone warrior as you craf...,"[Video Games, PlayStation 4, Games]",28.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
68725,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B00KXGZYBG,0.0,1486158091000,8119,1884,Video Games,Final Fantasy Type-0 HD - PlayStation 4,[One of the most highly anticipated role-playi...,"[Video Games, PlayStation 4, Games]",25.69,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
68726,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B07X56TZ16,0.0,1486612697000,8119,2120,Video Games,Lego Batman,"[Play as Batman, Robin or more than 12 dastard...","[Video Games, Legacy Systems, Xbox Systems, Xb...",16.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 197..."
118926,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B00LSFRY3C,5.0,1486612697000,8119,1721,Video Games,WARRIORS OROCHI 3 Ultimate - Xbox One,[Fans will experience WARRIORS OROCHI 3 in a b...,"[Video Games, Xbox One, Games]",29.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 197..."
68727,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B07Z9Z39ZW,0.0,1492692693000,8119,546,Video Games,Witcher 3: Wild Hunt Complete Edition - Ninten...,[],"[Video Games, Nintendo Switch, Games]",41.54,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1974.0, 3..."
123014,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B01EORDE48,5.0,1492692693000,8119,4469,Video Games,Attack on Titan - PlayStation 4,[Attack on Titan closely follows the riveting ...,"[Video Games, PlayStation 4, Games]",59.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1974.0, 3..."


In [26]:
# Validation rows of the same user, ordered by timestamp, for manual inspection.
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
68737,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B0086VPUHI,0.0,1629483761562,8119,4239,Video Games,Grand Theft Auto V: Premium Edition - Xbox One...,[The Grand Theft Auto V: Premium Edition inclu...,"[Video Games, Xbox One, Games]",15.0,"[4469, 2793, 429, 4658, 3086, 99, 4459, 2074, ..."
39,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B08BY9XN49,5.0,1629483761562,8119,569,Computers,tomtoc Carrying Case for Nintendo Switch/OLED ...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",30.99,"[4469, 2793, 429, 4658, 3086, 99, 4459, 2074, ..."
87,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B07D36WX84,5.0,1630620706898,8119,1694,All Electronics,FastSnail Controller Charger Compatible with N...,[],"[Video Games, Legacy Systems, Nintendo Systems...",19.99,"[2793, 429, 4658, 3086, 99, 4459, 2074, 3665, ..."
68738,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B000A2R54M,0.0,1630620706898,8119,30,Video Games,Mario Kart DS,"[Amazon.com, The ultimate, Mario Kart, race is...","[Video Games, Kids and Family]",37.51,"[2793, 429, 4658, 3086, 99, 4459, 2074, 3665, ..."
68739,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B0055464UQ,0.0,1655576822799,8119,2863,Video Games,"USB Breakaway Cable for Xbox 360, USB Adapter ...",[USB Breakaway Cable for Xbox 360 Wired Contro...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",5.99,"[429, 4658, 3086, 99, 4459, 2074, 3665, 4549, ..."
897,AEPVV7PDGK2R5SDFGFKK7WP4SZCQ,B00OVBVME2,5.0,1655576822799,8119,1405,Video Games,Sunset Overdrive,[Sunset Overdrive transforms an open-world apo...,"[Video Games, Xbox One, Games]",14.97,"[429, 4658, 3086, 99, 4459, 2074, 3665, 4549, ..."


# Persist

In [27]:
# Write the combined positives + negatives frame; the shuffled index is dropped.
full_df.to_parquet(path="../data/full_features_neg_sampling_df.parquet", index=False)

In [28]:
# Write the train and validation splits next to the combined file, without the index.
train_neg_df.to_parquet(path="../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet(path="../data/val_features_neg_df.parquet", index=False)