# Negative sampling

More labeled data can tremendously help a model learn. In this notebook we prepare negative samples for each user from the items they have not interacted with. The unseen items are sampled in proportion to how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns in user behavior rather than being biased by popularity.

# Set up

In [1]:
import os
import sys
from typing import Optional

import pandas as pd
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import add_features_to_neg_df, generate_negative_samples

# Controller

In [2]:
class Args(BaseModel):
    """Run-level configuration for this notebook.

    Construct it, then call :meth:`init` to resolve and create the
    persistence directory before using ``notebook_persist_dp``.
    """

    testing: bool = False
    run_name: str = "000-sample-rich-data"
    # Unset until init() runs, so the declared type must admit None.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Names of the identifier / label / time columns in the input frames.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self) -> "Args":
        """Resolve ``data/<run_name>`` to an absolute path and create it.

        The path is relative to the current working directory. Returns
        ``self`` so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the run configuration, then resolve/create its persistence directory.
# init() returns the same instance, so calling it as a statement is equivalent
# to chaining it onto the constructor.
args = Args()
args.init()

# Echo the effective settings so they are recorded next to the outputs.
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sample-rich-data",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/000-sample-rich-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy input, one tuple per interaction:
# (user_indice, item_indice, rating, timestamp)
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Wrap the tuples in a DataFrame; the rating/timestamp column names come from
# the notebook configuration.
toy_columns = ["user_indice", "item_indice", args.rating_col, args.timestamp_col]
df = pd.DataFrame(data=interactions, columns=toy_columns)

In [4]:
# Show the toy interactions frame before sampling.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Run the sampler on the toy frame. Column names are passed explicitly (same
# positional order as the full-data call further down) and taken from `args`,
# so this cell stays consistent with how `df` was built if the configured
# column names change.
neg_df = generate_negative_samples(df, "user_indice", "item_indice", args.rating_col)
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Every negative row carries the same label (0), so sorting by rating would be
# a no-op; order by timestamp within each user instead.
neg_df.sort_values(["user_indice", args.timestamp_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,105,0,1
1,1,104,0,2
2,1,106,0,4
3,2,103,0,1
4,2,106,0,2
5,3,101,0,1
6,3,104,0,5


# Load data

In [7]:
# Read the train and validation feature tables written by an earlier step.
train_df, val_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../data/train_features.parquet",
        "../data/val_features.parquet",
    )
)

In [8]:
# The validation split must start strictly after the last training event.
latest_train_ts = train_df[args.timestamp_col].max()
earliest_val_ts = val_df[args.timestamp_col].min()
assert earliest_val_ts > latest_train_ts

# First timestamp that counts as validation; used later to split the combined
# frame back into train and validation.
val_timestamp = latest_train_ts + 1
logger.info(f"{val_timestamp=}")

[32m2024-10-18 16:50:22.000[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=1628641464793[0m


In [9]:
# Stack train on top of validation (row labels are kept as-is) so negatives
# are sampled over the combined interaction history.
full_df = pd.concat(objs=(train_df, val_df))
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,11193,1159,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,20321,1301,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,26,1043,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,1455,657,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,1455,219,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,5408,2929,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[2108, 4347, 1392, 2244, 2194, 1986, 44, 3931,..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,5315,2713,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 1917, 2652, 3712, 4346, 3..."
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,15104,2067,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 4682, 9, 155, 1559, 3146, 3065, 770]"
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,16935,2936,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[18, 2498, 2002, 1603, 3391, 1963, 1763, 525, ..."


In [10]:
# Extra columns to carry onto the negative rows. The user id column name comes
# from `args` so it stays consistent with the exclusion list built in the next
# cell.
features = ["item_sequence", args.user_col]

# Sample the negatives for every user in the combined frame; the seed makes
# the draw reproducible.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# Attach timestamps and the `features` columns to the sampled rows.
# NOTE(review): presumably these are copied from each user's positive rows —
# confirm against src.negative_sampling.add_features_to_neg_df.
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1162,0,1380994694000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGTV74C6FBYCD7GFLHOT67J255TQ
1,0,206,0,1380994760000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGTV74C6FBYCD7GFLHOT67J255TQ
2,0,3186,0,1385495832000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGTV74C6FBYCD7GFLHOT67J255TQ
3,0,195,0,1403280419000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 226...",AGTV74C6FBYCD7GFLHOT67J255TQ
4,0,564,0,1403280506000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2263.0, 1...",AGTV74C6FBYCD7GFLHOT67J255TQ
...,...,...,...,...,...,...
170989,20365,811,0,1484847915000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFHL4OM5CCC7Q26GJL4WDI4NRNOA
170990,20365,2652,0,1494705419000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFHL4OM5CCC7Q26GJL4WDI4NRNOA
170991,20365,2180,0,1494705422000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFHL4OM5CCC7Q26GJL4WDI4NRNOA
170992,20365,3687,0,1494705425000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 344...",AFHL4OM5CCC7Q26GJL4WDI4NRNOA


# Join with features

Populate the feature values for the newly created negative samples

In [11]:
# Item features

# Columns that identify the user/interaction or were already carried over onto
# the negatives; everything else in full_df is treated as an item feature.
not_item_feature_cols = (
    args.user_col,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
    *features,
)
item_features = list(
    filter(lambda name: name not in not_item_feature_cols, full_df.columns)
)
item_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [12]:
# One row of item features per item_indice, taken from the observed data.
item_lookup = full_df[["item_indice", *item_features]].drop_duplicates(
    subset=["item_indice"]
)

# Drop any item-feature columns left over from a previous run of this cell
# before merging; without this a re-run would yield `_x` / `_y` suffixed
# duplicates. On a fresh run the drop is a no-op.
neg_ts_df = pd.merge(
    neg_ts_df.drop(columns=item_features, errors="ignore"),
    item_lookup,
    how="left",
    on=["item_indice"],
    validate="m:1",  # each negative row must match at most one lookup row
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1162,0,1380994694000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGTV74C6FBYCD7GFLHOT67J255TQ,B0050SWBAE,Video Games,Pikmin 3,"[Product Description, Take command of three ex...","[Video Games, Legacy Systems, Nintendo Systems...",60.24
1,0,206,0,1380994760000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGTV74C6FBYCD7GFLHOT67J255TQ,B07WJ6WP3G,Computers,Redragon S101 Wired RGB Backlit Gaming Keyboar...,[],"[Video Games, PC, Accessories, Gaming Keyboards]",54.99
2,0,3186,0,1385495832000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGTV74C6FBYCD7GFLHOT67J255TQ,B006VR663G,Video Games,The Amazing Spider-Man - Nintendo Wii U,[Harness Spider-Man's powers with Manhattan as...,"[Video Games, Legacy Systems, Nintendo Systems...",79.17
3,0,195,0,1403280419000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 226...",AGTV74C6FBYCD7GFLHOT67J255TQ,B01HE5SXFM,Video Games,7 Days to Die - PlayStation 4,[HOW LONG WILL YOU SURVIVE? With over 2 millio...,"[Video Games, PlayStation 4, Games]",24.48
4,0,564,0,1403280506000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2263.0, 1...",AGTV74C6FBYCD7GFLHOT67J255TQ,B00884S5N4,Video Games,Dead Space 3 – PC Origin [Online Game Code],[],"[Video Games, Legacy Systems, Xbox Systems, Xb...",19.99
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,811,0,1484847915000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFHL4OM5CCC7Q26GJL4WDI4NRNOA,B0C3KJJ6XS,Computers,Razer Nari Ultimate Wireless 7.1 Surround Soun...,[Razer Nari Ultimate: 2.4GHz wireless PC gamin...,"[Video Games, PC, Accessories, Headsets]",100.42
170990,20365,2652,0,1494705419000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFHL4OM5CCC7Q26GJL4WDI4NRNOA,B01LPZM7VI,Video Games,Xbox Wireless Controller – Black,[Experience the enhanced comfort and feel of t...,"[Video Games, Xbox One, Accessories, Controlle...",59.99
170991,20365,2180,0,1494705422000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AFHL4OM5CCC7Q26GJL4WDI4NRNOA,B09W5193NT,Video Games,Oculus Rift + Touch Virtual Reality System,[Rift is unlike anything you've ever experienc...,"[Video Games, PC, Virtual Reality]",272.83
170992,20365,3687,0,1494705425000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 344...",AFHL4OM5CCC7Q26GJL4WDI4NRNOA,B07P2HCW2K,Video Games,Overcooked - PlayStation 4,"[Working as a team, you and your fellow chefs ...","[Video Games, PlayStation 4, Games]",19.98


In [13]:
# Spot-check that the merge kept item ids and item indices aligned: pick one
# item and verify it maps to a single, identical item_indice in both the
# negative samples and the original data.
# The draw is seeded so the same item is checked (and logged) on every run.
item = neg_ts_df.sample(n=1, random_state=args.random_seed)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda df: df[args.item_col].eq(item)]["item_indice"]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda df: df[args.item_col].eq(item)][
    "item_indice"
]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"
assert (
    original_item_indices.iloc[0] == neg_item_indices.iloc[0]
), f"item_indice for {item} differs between original df and neg_ts_df"

[32m2024-10-18 16:50:29.956[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B001EYUTC0...[0m


# Concatenating positive data with negative samples

In [14]:
# Combine positives and negatives, then shuffle with a fixed seed.
# The positives are taken from train_df / val_df rather than from `full_df`
# itself, so re-running this cell does not append the negatives a second time.
# On a fresh run this yields the same frame as concatenating onto `full_df`.
full_df = pd.concat([train_df, val_df, neg_ts_df], axis=0).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [15]:
# Inspect the shuffled frame of positives (rating > 0) and negatives (rating 0).
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AGWGLADIA2MYMYSPDA75SN545CKQ,B005UUO9CK,0.0,1536044878471,13310,2437,Video Games,Wipeout 2 - Xbox 360,"[Product Description, Summer thrills and Winte...","[Video Games, Legacy Systems, Xbox Systems, Xb...",21.99,"[1758.0, 4370.0, 3681.0, 1904.0, 3259.0, 1423...."
65458,AFYRLA4FND2RD57F4LCT2JPSJUJQ,B001G6064W,0.0,1600549574482,7718,4374,Video Games,PROTOTYPE - Playstation 3,"[Product Description, You are Alex Mercer, the...","[Video Games, Legacy Systems, PlayStation Syst...",28.0,"[2800.0, 2923.0, 2552.0, 938.0, 3549.0, 3397.0..."
79874,AGEZWX2WLNXUULH5VO67CSMTNCZQ,B00GIK3YAO,0.0,1435176515000,9496,2377,Video Games,PowerA MOGA Hero Power - Electronic Games,"[POWER A MOGA Hero Power - Electronic Games, 1...",[],,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 946.0, 41..."
102744,AHNY6MA63QMAUZISYSW66EURVVMQ,B00DJRLDMU,0.0,1546786802961,12266,1391,Video Games,Tom Clancy’s The Division Underground - Xbox O...,"[In Expansion I, Underground, the factions of ...","[Video Games, Xbox One, Downloadable Content]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,4714,1358,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1232.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AFBYT34RXLYJCATKKN6IZKM5ZFTA,B002I0J4NE,0.0,1378255234000,9508,1027,Video Games,PlayStation Move Starter Bundle,[PlayStation Move Starter Bundle],"[Video Games, Legacy Systems, PlayStation Syst...",135.8,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1738.0, 3..."
46064,AE5WZGXHPBTUJYLOHKQMNPWCFCWA,B00KVOVBGM,0.0,1444416370000,5438,2640,Video Games,PlayStation 4 Console - Destiny Bundle [Discon...,"[Product Description, The ultimate Destiny exp...","[Video Games, Game Genre of the Month]",319.98,"[309.0, 98.0, 4507.0, 512.0, 517.0, 34.0, 2068..."
152474,AFVVEGO2FG5KK7ERTEYKTSWO4ETQ,B00CMQTVUA,0.0,1354291798000,18121,4651,Video Games,Xbox One + Kinect,[Experience greater convenience and control wi...,"[Video Games, Xbox One, Accessories, Accessory...",259.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,12236,2877,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 272..."


In [16]:
# Identifier, sequence, label and time columns must be fully populated for
# every row, positive or negative.
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    args.rating_col,
    args.timestamp_col,
]
has_missing_keys = full_df[key_cols].isna().any().any()
assert not has_missing_keys, "Null values found at key colums"

In [17]:
# Cut-off used below: rows at or after this timestamp go to validation.
val_timestamp

1628641464793

# Split back into train and validation

In [18]:
# Re-split on time: strictly before the cut-off is train, the rest validation.
event_ts = full_df[args.timestamp_col]
train_neg_df = full_df[event_ts < val_timestamp]
val_neg_df = full_df[event_ts >= val_timestamp]

In [19]:
# Inspect the training split (positives and negatives before the cut-off).
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AGWGLADIA2MYMYSPDA75SN545CKQ,B005UUO9CK,0.0,1536044878471,13310,2437,Video Games,Wipeout 2 - Xbox 360,"[Product Description, Summer thrills and Winte...","[Video Games, Legacy Systems, Xbox Systems, Xb...",21.99,"[1758.0, 4370.0, 3681.0, 1904.0, 3259.0, 1423...."
65458,AFYRLA4FND2RD57F4LCT2JPSJUJQ,B001G6064W,0.0,1600549574482,7718,4374,Video Games,PROTOTYPE - Playstation 3,"[Product Description, You are Alex Mercer, the...","[Video Games, Legacy Systems, PlayStation Syst...",28.0,"[2800.0, 2923.0, 2552.0, 938.0, 3549.0, 3397.0..."
79874,AGEZWX2WLNXUULH5VO67CSMTNCZQ,B00GIK3YAO,0.0,1435176515000,9496,2377,Video Games,PowerA MOGA Hero Power - Electronic Games,"[POWER A MOGA Hero Power - Electronic Games, 1...",[],,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 946.0, 41..."
102744,AHNY6MA63QMAUZISYSW66EURVVMQ,B00DJRLDMU,0.0,1546786802961,12266,1391,Video Games,Tom Clancy’s The Division Underground - Xbox O...,"[In Expansion I, Underground, the factions of ...","[Video Games, Xbox One, Downloadable Content]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,4714,1358,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1232.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AFBYT34RXLYJCATKKN6IZKM5ZFTA,B002I0J4NE,0.0,1378255234000,9508,1027,Video Games,PlayStation Move Starter Bundle,[PlayStation Move Starter Bundle],"[Video Games, Legacy Systems, PlayStation Syst...",135.8,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1738.0, 3..."
46064,AE5WZGXHPBTUJYLOHKQMNPWCFCWA,B00KVOVBGM,0.0,1444416370000,5438,2640,Video Games,PlayStation 4 Console - Destiny Bundle [Discon...,"[Product Description, The ultimate Destiny exp...","[Video Games, Game Genre of the Month]",319.98,"[309.0, 98.0, 4507.0, 512.0, 517.0, 34.0, 2068..."
152474,AFVVEGO2FG5KK7ERTEYKTSWO4ETQ,B00CMQTVUA,0.0,1354291798000,18121,4651,Video Games,Xbox One + Kinect,[Experience greater convenience and control wi...,"[Video Games, Xbox One, Accessories, Accessory...",259.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,12236,2877,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 272..."


In [20]:
# Inspect the validation split (positives and negatives from the cut-off on).
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
74,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,7087,3330,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,"[1596, 1047, 3310, 2885, 1862, 1299, 4500, 209..."
892,AFFPVZ3JNCTQIKAK4XK37E2ENWWA,B00HVBPRUO,4.0,1655428133046,4199,3721,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,"[-1, -1, 3810, 3019, 1158, 1714, 98, 309, 2291..."
153673,AHAIICWIZT6PYSS5QJNFYP6ZXLCA,B001EYUPJW,0.0,1628811542081,18270,3274,Video Games,Def Jam Fight for NY - Gamecube (Gold),"[The ultimate hip-hop fueled fighting game, De...","[Video Games, Legacy Systems, Nintendo Systems...",349.97,"[2245, 705, 3089, 4462, 3922, 2808, 3476, 3558..."
87338,AF2AAA4CWRVF2IYVE7WB6OOIEMFA,B072C3VM5F,0.0,1635286957988,10410,4179,Video Games,Far Cry 5 Gold Edition - Xbox One [Digital Code],[],"[Video Games, Xbox One, Games]",,"[-1, -1, -1, -1, 1690, 431, 720, 742, 1885, 2711]"
285,AFBRTNVOROW7UVA66UPX5YCFC6MQ,B07YBXFDYK,3.0,1636189764550,9102,2986,Video Games,The Evil Within 2 - PlayStation 4,"[From Shinji Mikami, The Evil Within 2 takes t...","[Video Games, PlayStation 4, Games]",20.98,"[-1, -1, -1, -1, 1853, 195, 4616, 1777, 4654, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
87618,AGNZODFXG6WNNJUAKR3MI42SDG5A,B0B8RG61GK,0.0,1642617264556,10449,3436,Computers,Fast Charging Cable for Switch/Switch Lite/Swi...,"[Dimensions:, Length: 9.8 Feet/3 Meters]","[Video Games, Legacy Systems, Nintendo Systems...",9.89,"[-1, -1, -1, -1, 2074, 1310, 3877, 184, 2952, ..."
139967,AHA6LZWVG2U4WBXNZRWCESNJXNUA,B002JTX87C,0.0,1646645847748,16655,380,Video Games,Scooby Doo! First Frights NDS,"[Product Description, In Scooby-Doo! First Fri...","[Video Games, Legacy Systems, Nintendo Systems...",43.77,"[1208, 1088, 3365, 4287, 3679, 4048, 2351, 197..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,11656,4503,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 2159, 3248, 48, 445, 4343, 584, 907, 4155..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,1562,4543,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 4507, 1351, 132, 4660, 1979]"


# Checks

In [21]:
# Spot-check one validation user: in their training history exactly half of
# the rows must be positives, i.e. one negative per positive.
# The draw is seeded so the same user is checked (and logged) on every run.
user = val_neg_df.sample(n=1, random_state=args.random_seed)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2
), "Number of pos and neg samples are not equal"

[32m2024-10-18 16:50:30.095[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ...[0m


In [22]:
# Validation rows of the user picked in the previous check, oldest first.
val_check_df = val_neg_df[val_neg_df[args.user_col] == user].sort_values(
    by=args.timestamp_col
)

# Take the first positive item this user has in validation and make sure it
# shows up in more than 5 training rows.
positive_val_rows = val_check_df[val_check_df[args.rating_col] > 0]
item = positive_val_rows[args.item_col].values[0]
logger.info(f"Checking item {item}...")
n_train_rows = train_neg_df[train_neg_df[args.item_col] == item].shape[0]
assert n_train_rows > 5, f"Item {item} does not appear much in training data"

[32m2024-10-18 16:50:30.107[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B0C3QFBBH5...[0m


## Random eye-ball

In [23]:
# Eyeball the sampled user's training rows, sorted by timestamp.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
42367,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B003VAM392,5.0,1385198022000,19830,2362,All Electronics,Logitech Rumble Gamepad F510 With Broad Game S...,"[Product Description, Vibration feedback lets ...","[Video Games, PC, Accessories, Controllers, Ga...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
166730,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B0013LTP5Q,0.0,1385198022000,19830,1689,Video Games,Active Life Outdoor Challenge - Nintendo Wii,"[Product Description, ""The first in the Active...","[Video Games, Legacy Systems, Nintendo Systems...",53.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
42368,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B001EYUQDW,5.0,1385198219000,19830,2730,Video Games,Borderlands Ultimate Edition - PS3 [Digital Code],"[Product Description, Gearbox Software has dev...","[Video Games, Legacy Systems, PlayStation Syst...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
166731,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B00CQ35C1Q,0.0,1385198219000,19830,1351,Computers,Logitech 981-000536 G430 7.1 Gaming Headset wi...,[],"[Video Games, Legacy Systems, Nintendo Systems...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
60398,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B001ELJE8I,5.0,1413670727000,19830,2355,Video Games,Halo: Combat Evolved - PC,"[Product Description, Based off of the same th...","[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
166732,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B07YHGBJY6,0.0,1413670727000,19830,526,Video Games,Plants Vs. Zombies: Battle for Neighborville -...,"[Welcome to Neighbor Ville, where all is well....","[Video Games, PlayStation 4, Games]",12.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
64317,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B06ZXXHMS7,5.0,1417939081000,19830,3341,Video Games,Gears of War - PC,[Gears of War is a third-person shooter that p...,"[Video Games, PC, Games]",77.97,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 236..."
166733,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B002I0K780,0.0,1417939081000,19830,937,Video Games,Little Big Planet 2,"[Product Description, Join Sackboy in a comple...","[Video Games, Legacy Systems, PlayStation Syst...",19.84,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 236..."
166734,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B001ELJDUC,0.0,1417968513000,19830,2416,Video Games,Game Boy Color - Teal,"[The Game Boy is that small, colorful handheld...","[Video Games, Legacy Systems, Nintendo Systems...",134.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2362.0, 2..."
64353,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B001EYUQAU,4.0,1417968513000,19830,4450,Video Games,Blacksite: Area 51 - PC,[Modern day fears explode onto the streets of ...,"[Video Games, PC, Games]",24.66,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2362.0, 2..."


In [24]:
# Eyeball the same user's validation rows, sorted by timestamp.
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
166738,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B00C1ZBFTW,0.0,1650511057038,19830,3610,Video Games,Shin Megami Tensei IV - Nintendo 3DS,"[Product Description, Demon Collection and Fus...","[Video Games, Legacy Systems, Nintendo Systems...",62.34,"[-1, -1, 2362, 2730, 2355, 3341, 4450, 1866, 4..."
751,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B0C3QFBBH5,3.0,1650511057038,19830,3590,Computers,"HyperX Cloud Stinger – Gaming Headset, Lightwe...",[HyperX Cloud Stinger is the ideal headset for...,"[Video Games, PC, Accessories, Headsets]",,"[-1, -1, 2362, 2730, 2355, 3341, 4450, 1866, 4..."
166739,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B00D5SZ04K,0.0,1650511163197,19830,309,Video Games,Need for Speed Rivals,"[Welcome to Redview County, where a street-rac...","[Video Games, Legacy Systems, PlayStation Syst...",20.11,"[-1, 2362, 2730, 2355, 3341, 4450, 1866, 4543,..."
752,AHCJXR7CTE2PZEUDZZTQ6RXX2WHQ,B077GG9D5D,5.0,1650511163197,19830,2154,Video Games,DualShock 4 Wireless Controller for PlayStatio...,[The DualShock 4 Wireless Controller features ...,"[Video Games, PlayStation 4, Accessories, Cont...",57.0,"[-1, 2362, 2730, 2355, 3341, 4450, 1866, 4543,..."


# Persist

In [25]:
# Persist the combined positives + negatives frame; row labels are not written.
full_df.to_parquet("../data/full_features_neg_sampling_df.parquet", index=False)

In [26]:
# Persist the train and validation splits; row labels are not written.
for split_frame, split_path in (
    (train_neg_df, "../data/train_features_neg_df.parquet"),
    (val_neg_df, "../data/val_features_neg_df.parquet"),
):
    split_frame.to_parquet(split_path, index=False)