In [1]:
import os
import pandas as pd
import numpy as np
from loguru import logger
from collections import defaultdict
from pydantic import BaseModel

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = '062-medium-rich-dataset'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "062-medium-rich-dataset",
  "notebook_persist_dp": "/home/dvquys/frostmourne/reco-algo/notebooks/data/062-medium-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(interactions, columns=['user_indice', 'item_indice', args.rating_col, args.timestamp_col])

In [4]:
def generate_negative_samples(
    df,
    user_col='user_indice',
    item_col='item_indice',
    label_col='rating',
    neg_label=0,
    seed=None,
    progress_bar_type='tqdm'  # Options: 'tqdm', 'tqdm_notebook', None
):
    """
    Optimized function to generate negative samples for a user-item interaction DataFrame.
    """
    
    # Handle random seed
    if seed is not None:
        np.random.seed(seed)
    
    # Import tqdm based on the progress_bar_type
    if progress_bar_type == 'tqdm':
        try:
            from tqdm import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm is not installed. Please install it using 'pip install tqdm'.")
    elif progress_bar_type == 'tqdm_notebook':
        try:
            from tqdm.notebook import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm.notebook is not available. Please install it using 'pip install tqdm'.")
    elif progress_bar_type is None:
        # Define a dummy tqdm function that does nothing
        def tqdm_bar(iterable, **kwargs):
            return iterable
    else:
        raise ValueError("Invalid progress_bar_type. Choose 'tqdm', 'tqdm_notebook', or None.")
    
    # Calculate item popularity based on the number of interactions
    item_popularity = df[item_col].value_counts()
    
    # Define all unique items from the DataFrame
    items = item_popularity.index.values
    all_items_set = set(items)
    
    # Create a user-item interaction dictionary
    user_item_dict = df.groupby(user_col)[item_col].apply(set).to_dict()
    
    # Prepare items list and corresponding popularity array
    popularity = item_popularity.values.astype(np.float64)
    
    # Calculate sampling probabilities based on item popularity
    total_popularity = popularity.sum()
    if total_popularity == 0:
        sampling_probs = np.ones(len(items)) / len(items)
    else:
        sampling_probs = popularity / total_popularity
    
    # Create item to index mapping for quick access
    item_to_index = {item: idx for idx, item in enumerate(items)}
    
    # Initialize a list to store negative samples
    negative_samples = []
    
    # Initialize the progress bar
    total_users = len(user_item_dict)
    progress_bar = tqdm_bar(user_item_dict.items(), total=total_users, desc="Generating Negative Samples")
    
    for user, pos_items in progress_bar:
        num_pos = len(pos_items)
        
        # Identify items not interacted with by the user
        negative_candidates = all_items_set - pos_items
        num_neg_candidates = len(negative_candidates)
        
        if num_neg_candidates == 0:
            # User has interacted with all items, skip negative sampling
            continue
        
        # Determine the number of negative samples to generate
        num_neg = min(num_pos, num_neg_candidates)
        
        # Convert set to list for indexing
        negative_candidates_list = list(negative_candidates)
        
        # Get the indices and probabilities of negative candidates
        candidate_indices = [item_to_index[item] for item in negative_candidates_list]
        candidate_probs = sampling_probs[candidate_indices]
        candidate_probs /= candidate_probs.sum()
        
        # Sample negative items without replacement
        sampled_items = np.random.choice(
            negative_candidates_list, size=num_neg, replace=False, p=candidate_probs
        )
        
        # Append the sampled negative items to the list
        negative_samples.extend([(user, item) for item in sampled_items])
    
    # Convert negative samples to a DataFrame
    df_negative = pd.DataFrame(negative_samples, columns=[user_col, item_col])
    df_negative[label_col] = neg_label  # Assign label for negative samples
    
    return df_negative

def add_features_to_neg_df(pos_df, neg_df, user_col, timestamp_col, feature_cols=[]):
    neg_df = neg_df.assign(
        timestamp_pseudo=lambda df: df.groupby(user_col).cumcount() + 1
    )
    neg_df = (
        pd.merge(
            neg_df,
            pos_df.assign(
                timestamp_pseudo=lambda df: df.groupby([user_col])[timestamp_col].rank(method='first')   
            )[[user_col, timestamp_col, 'timestamp_pseudo', *feature_cols]],
            how='left',
            on=[user_col, 'timestamp_pseudo']
        )
        .drop(columns=['timestamp_pseudo'])
    )
    return neg_df

In [5]:
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [6]:
neg_df = generate_negative_samples(df, progress_bar_type='tqdm_notebook')
neg_df = add_features_to_neg_df(df, neg_df, 'user_indice', 'timestamp')

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
neg_df.sort_values(['user_indice', args.rating_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,102,0,1
4,2,106,0,2
5,3,102,0,1
6,3,104,0,5


# Load data

In [8]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [9]:
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-21 11:48:09.336[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [10]:
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,13388,117,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,18910,1353,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,8237,3536,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,1745,1897,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,1745,2692,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,16844,4379,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[794, 1998, 2908, 1118, 4245, 2180, 2873, 252,..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,515,4013,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 3109, 2270, 1359, 2004, 3..."
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,13443,3173,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 896, 2238, 4468, 4300, 3293, 3363..."
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,18390,229,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[3469, 3175, 189, 2344, 943, 4047, 1260, 4519,..."


In [11]:
features = ['item_sequence', 'user_id']

In [12]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,102,0,1
4,2,106,0,2
5,3,102,0,1
6,3,104,0,5


In [13]:
neg_df = generate_negative_samples(full_df, 'user_indice', 'item_indice', args.rating_col, neg_label=0, seed=args.random_seed, progress_bar_type='tqdm_notebook')
neg_ts_df = add_features_to_neg_df(full_df, neg_df, 'user_indice', args.timestamp_col, features)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1186,0,1417733039000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG3CZLHCEZVWMLYTXDPTTROACL5A
1,0,250,0,1417733152000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG3CZLHCEZVWMLYTXDPTTROACL5A
2,0,3135,0,1424813051000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG3CZLHCEZVWMLYTXDPTTROACL5A
3,0,239,0,1424813061000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 336...",AG3CZLHCEZVWMLYTXDPTTROACL5A
4,0,543,0,1466101776000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3365.0, 3...",AG3CZLHCEZVWMLYTXDPTTROACL5A
...,...,...,...,...,...,...
170989,20365,550,0,1399911502000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1654.0, 3...",AEJR44S3KMHK6XEK5K2TV2NYDGPQ
170990,20365,677,0,1399911606000,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1654.0, 3098.0,...",AEJR44S3KMHK6XEK5K2TV2NYDGPQ
170991,20365,3829,0,1399913039000,"[-1.0, -1.0, -1.0, -1.0, 1654.0, 3098.0, 3334....",AEJR44S3KMHK6XEK5K2TV2NYDGPQ
170992,20365,373,0,1399913381000,"[-1.0, -1.0, -1.0, 1654.0, 3098.0, 3334.0, 327...",AEJR44S3KMHK6XEK5K2TV2NYDGPQ


#### Join with features

In [14]:
not_meta_feature_cols = (args.user_col, 'user_indice', 'item_indice', args.rating_col, args.timestamp_col, *features)
meta_features = [col for col in full_df.columns if col not in not_meta_feature_cols]
meta_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [15]:
neg_ts_df = (
    pd.merge(
        neg_ts_df,
        full_df[['item_indice', *meta_features]].drop_duplicates(subset=['item_indice']),
        how='left',
        on=['item_indice'],
        validate="m:1"
    )   
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1186,0,1417733039000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG3CZLHCEZVWMLYTXDPTTROACL5A,B07NZFD1T7,Video Games,Datel Action Replay Power Saves Pro - Nintendo...,[Power saves pro is your key to blow your game...,"[Video Games, Legacy Systems, Nintendo Systems...",29.97
1,0,250,0,1417733152000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG3CZLHCEZVWMLYTXDPTTROACL5A,B01IC2A28C,Video Games,Pokémon Sun and Pokémon Moon Steelbook Dual Pa...,[Pokémon Sun and Pokémon Moon will launch on N...,"[Video Games, Legacy Systems, Nintendo Systems...",
2,0,3135,0,1424813051000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG3CZLHCEZVWMLYTXDPTTROACL5A,B001CPFPJ8,Video Games,Singstar Microphones - PlayStation 2,[SingStar Microphones (wired) for Playstation 2],"[Video Games, Legacy Systems, PlayStation Syst...",189.99
3,0,239,0,1424813061000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 336...",AG3CZLHCEZVWMLYTXDPTTROACL5A,B0883P3VSH,Video Games,Turtle Beach Recon 50 Xbox Gaming Headset for ...,[Take gaming audio and comfort to the next lev...,"[Video Games, Legacy Systems, PlayStation Syst...",24.95
4,0,543,0,1466101776000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3365.0, 3...",AG3CZLHCEZVWMLYTXDPTTROACL5A,B00CES8EFY,Video Games,NBA 2K14 - Xbox 360,"[View larger, NBA 2K14, This year, 2K joins fo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",26.0
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,550,0,1399911502000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1654.0, 3...",AEJR44S3KMHK6XEK5K2TV2NYDGPQ,B015WCV6QM,Computers,Razer Diamondback - Chroma Ambidextrous Gaming...,[],"[Video Games, PC, Accessories, Gaming Mice]",59.99
170990,20365,677,0,1399911606000,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1654.0, 3098.0,...",AEJR44S3KMHK6XEK5K2TV2NYDGPQ,B017VLXJ7G,Video Games,,[The Afterglow LVL Headset Adapter allows you ...,[],
170991,20365,3829,0,1399913039000,"[-1.0, -1.0, -1.0, -1.0, 1654.0, 3098.0, 3334....",AEJR44S3KMHK6XEK5K2TV2NYDGPQ,B01FWN9DOS,Video Games,Gran Turismo Sport - Limited Edition - PlaySta...,[Welcome to the future of motorsports - the de...,"[Video Games, PlayStation 4, Games]",61.85
170992,20365,373,0,1399913381000,"[-1.0, -1.0, -1.0, 1654.0, 3098.0, 3334.0, 327...",AEJR44S3KMHK6XEK5K2TV2NYDGPQ,B00AAQRNQ8,Cell Phones & Accessories,Compatible Cell Phone Adapter Replacement for ...,[],"[Video Games, PC]",


In [16]:
item = neg_ts_df.sample(n=1)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(neg_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(original_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice at original df"
assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

[32m2024-09-21 11:48:22.994[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B00BLDJ1T6...[0m


In [17]:
full_df = pd.concat([full_df, neg_ts_df], axis=0).sample(frac=1, replace=False, random_state=args.random_seed)

In [18]:
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AEPV6L74QXWEH2DZGDL42UUBYV6A,B0B4CRTWGM,0.0,1417479397000,13282,2783,Video Games,Star Wars Knights of the Old Republic II: The ...,"[From the Manufacturer, Smug Statement: The Si...","[Video Games, Legacy Systems, Xbox Systems, Xb...",24.95,"[3355.0, 4276.0, 4032.0, 1909.0, 3803.0, 3713...."
65458,AECRQW3YB7HM3V37DAZD4UPPNO3A,B08P1NS2X1,0.0,1375928743000,7731,3958,Video Games,LEGO City Undercover - PlayStation 4,"[Join the Chase! In LEGO CITY Undercover, play...","[Video Games, PlayStation 4, Games]",19.74,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
79874,AERF6JFF76TH6FSCFQWL66ANOHVA,B001G6064C,0.0,1545403509538,9451,2678,Video Games,F.E.A.R. 2: Project Origin - Playstation 3,"[Product Description, Fear Alma Again, Amazon....","[Video Games, Legacy Systems, PlayStation Syst...",44.49,"[-1.0, -1.0, -1.0, 1037.0, 1650.0, 3445.0, 151..."
102744,AEAYIBNW4QQFLEC35Z7RQNZTUUOA,B005OGKBPE,0.0,1583917678434,12227,2545,Video Games,Syndicate - Origin PC [Online Game Code],"[From the Manufacturer, Syndicate is the re-im...","[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 290.0, 22..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,15838,1218,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3674.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AFS7QJZZOXPJ4MDIXESEZ665TSJQ,B00XZQ58AI,0.0,1463416675000,9464,154,Video Games,NBA 2K16 - PlayStation 3,[The NBA 2K franchise is back with the most tr...,"[Video Games, Legacy Systems, PlayStation Syst...",59.68,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
46064,AG2MDQBZUVR647JELUYNZL3UYSFA,B017GY07L4,0.0,1516402184763,5512,1898,Video Games,Nights of Azure - PlayStation 4,[Nights of Azure is a tragic tale of two frien...,"[Video Games, PlayStation 4, Games]",49.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, 781.0, 2796.0, ..."
152474,AGNE5FJF5GSLINB5RZ45EJVUVKYQ,B000P5BSUQ,0.0,1398805150000,18179,706,Video Games,Age Of Mythology: Titans - PC,[With the Age Of Mythology and The Titans bund...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,18559,1746,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 891..."


In [19]:
key_cols = [args.user_col, args.item_col, 'user_indice', 'item_indice', 'item_sequence', args.rating_col, args.timestamp_col]
assert full_df[key_cols].isna().sum().sum() == 0, "Null values found at key colums"

In [20]:
val_timestamp

np.int64(1628641464793)

In [21]:
train_neg_df = full_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
val_neg_df = full_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]

In [22]:
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AEPV6L74QXWEH2DZGDL42UUBYV6A,B0B4CRTWGM,0.0,1417479397000,13282,2783,Video Games,Star Wars Knights of the Old Republic II: The ...,"[From the Manufacturer, Smug Statement: The Si...","[Video Games, Legacy Systems, Xbox Systems, Xb...",24.95,"[3355.0, 4276.0, 4032.0, 1909.0, 3803.0, 3713...."
65458,AECRQW3YB7HM3V37DAZD4UPPNO3A,B08P1NS2X1,0.0,1375928743000,7731,3958,Video Games,LEGO City Undercover - PlayStation 4,"[Join the Chase! In LEGO CITY Undercover, play...","[Video Games, PlayStation 4, Games]",19.74,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
79874,AERF6JFF76TH6FSCFQWL66ANOHVA,B001G6064C,0.0,1545403509538,9451,2678,Video Games,F.E.A.R. 2: Project Origin - Playstation 3,"[Product Description, Fear Alma Again, Amazon....","[Video Games, Legacy Systems, PlayStation Syst...",44.49,"[-1.0, -1.0, -1.0, 1037.0, 1650.0, 3445.0, 151..."
102744,AEAYIBNW4QQFLEC35Z7RQNZTUUOA,B005OGKBPE,0.0,1583917678434,12227,2545,Video Games,Syndicate - Origin PC [Online Game Code],"[From the Manufacturer, Syndicate is the re-im...","[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 290.0, 22..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,15838,1218,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3674.0, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AFS7QJZZOXPJ4MDIXESEZ665TSJQ,B00XZQ58AI,0.0,1463416675000,9464,154,Video Games,NBA 2K16 - PlayStation 3,[The NBA 2K franchise is back with the most tr...,"[Video Games, Legacy Systems, PlayStation Syst...",59.68,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
46064,AG2MDQBZUVR647JELUYNZL3UYSFA,B017GY07L4,0.0,1516402184763,5512,1898,Video Games,Nights of Azure - PlayStation 4,[Nights of Azure is a tragic tale of two frien...,"[Video Games, PlayStation 4, Games]",49.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, 781.0, 2796.0, ..."
152474,AGNE5FJF5GSLINB5RZ45EJVUVKYQ,B000P5BSUQ,0.0,1398805150000,18179,706,Video Games,Age Of Mythology: Titans - PC,[With the Age Of Mythology and The Titans bund...,"[Video Games, PC, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,18559,1746,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 891..."


In [23]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
19465,AEFWYBITAJIQEAGJMGBBZQPD246Q,B001EYUS7G,0.0,1650810855155,2377,2080,Video Games,Far Cry 2: Fortune's Edition | PC Code - Ubiso...,"[Product Description, Includes Game + Fortune'...","[Video Games, Legacy Systems, PlayStation Syst...",,"[-1, -1, -1, -1, 2044, 1400, 4253, 3448, 3402,..."
59024,AEXN3VFNZS7CKHX2NHDHLYDBZZIQ,B002CZ38KA,0.0,1633099443693,6987,2376,Video Games,Heavy Rain - Greatest Hits,"[Product Description, Experience a gripping ps...","[Video Games, Legacy Systems, PlayStation Syst...",7.66,"[-1, 3431, 2128, 1144, 2691, 303, 3974, 3175, ..."
63732,AGCYZBKXV6Q5BGHWJB7J7D2HRWSA,B09R21G9DL,0.0,1640957371979,7520,4611,Computers,"Cipon Gamecube Controller, Wired Controller Ga...",[],"[Video Games, Legacy Systems, Nintendo Systems...",17.99,"[-1, -1, -1, -1, 1103, 2459, 750, 673, 2850, 3..."
74,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,9303,4165,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,"[1829, 1711, 3115, 1930, 1657, 4651, 1579, 250..."
892,AFFPVZ3JNCTQIKAK4XK37E2ENWWA,B00HVBPRUO,4.0,1655428133046,6775,2216,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,"[-1, -1, 4399, 3877, 1233, 3713, 2050, 3803, 2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
532,AFUWPAK6VCGEL2OVIL2YGZNFQJZQ,B08N6NCR3Q,4.0,1642699950266,3144,4617,Video Games,Thrustmaster T 16000M SPACE SIM DUO STICK (PC),[The THRUSTMASTER T.16000M FCS Space Sim Duo c...,"[Video Games, PC, Accessories, Controllers, Fl...",119.51,"[-1, -1, -1, -1, 3648, 3017, 4093, 3173, 4263,..."
84979,AEPOQDJZJCF5APANNFRSABUNU4IA,B07G3KB7RT,0.0,1643422574208,10070,200,Video Games,Satisfye – ZenGrip Pro Gen 3 OLED Elite Bundle...,[],"[Video Games, Nintendo Switch, Accessories, Ha...",89.99,"[3808, 1356, 638, 3934, 495, 4213, 2717, 1721,..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,13283,3456,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 1999, 1652, 2454, 2557, 1334, 129, 2409, ..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,15033,3058,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 2884, 1953, 1724, 3591, 1..."


# Checks

In [24]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
assert check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2, "Number of pos and neg samples are not equal"

[32m2024-09-21 11:48:23.292[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AGRHKDNSRJ3CT5ST75KGSCD4WA5A...[0m


In [25]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5, f"Item {item} does not appear much in training data"

[32m2024-09-21 11:48:23.311[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mChecking item B087NMYQYG...[0m


## Random eye-ball

In [26]:
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
86745,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B001FWK340,5.0,1442602191000,17102,1467,Cell Phones & Accessories,dreamGEAR PS3 Quad Charging Dock charges up to...,[],"[Video Games, Legacy Systems, PlayStation Syst...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
143500,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B002EQFXZA,0.0,1442602191000,17102,584,Video Games,PSP UMD Case,[Lightweight and durable protective case for S...,"[Video Games, Legacy Systems, PlayStation Syst...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
90667,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B00HVBPRUO,5.0,1448008098000,17102,2216,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
143501,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B00ZHQ39F0,0.0,1448008098000,17102,4515,Video Games,Just Dance 2016 - PlayStation 4,[Introducing Just Dance 2016 – the newest game...,"[Video Games, PlayStation 4, Games]",28.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
143502,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B00K5HTPR2,0.0,1448987770000,17102,3530,Video Games,NBA 2K15 - PlayStation 3,"[Nominated for 70 'Game of the Year' Awards, t...","[Video Games, PlayStation 4, Games]",26.64,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
167843,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B0BL65X86R,5.0,1613496849526,17102,32,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,"[1362.0, 366.0, 116.0, 2190.0, 886.0, 871.0, 1..."
143538,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B002I096AA,0.0,1617462149459,17102,3972,Video Games,Nintendo 3DS - Cosmo Black,"[Product Description, Play games in 3D without...","[Video Games, Legacy Systems, Nintendo Systems...",249.79,"[366.0, 116.0, 2190.0, 886.0, 871.0, 1957.0, 3..."
168523,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B00OAYHIRA,5.0,1617462149459,17102,1146,Computers,ZD-V+ USB Wired Gaming Controller Gamepad For ...,[],"[Video Games, PC, Accessories, Controllers, Ga...",19.99,"[366.0, 116.0, 2190.0, 886.0, 871.0, 1957.0, 3..."
143539,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B00O9GW8TC,0.0,1617462932353,17102,1916,Video Games,Nintendo Super Smash Bros. White Classic Gamec...,[Many Super Smash Bros. fans grew up playing S...,"[Video Games, Legacy Systems, Nintendo Systems...",213.17,"[116.0, 2190.0, 886.0, 871.0, 1957.0, 323.0, 2..."


In [27]:
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
389,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B087NMYQYG,5.0,1639132624893,17102,3763,Video Games,Super Mario Party - Nintendo Switch,[Turn the tables on opponents as you race acro...,"[Video Games, Nintendo Switch, Games]",52.33,"[2190, 886, 871, 1957, 323, 2590, 142, 32, 114..."
143540,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B087SHFL9B,0.0,1639132624893,17102,3116,Video Games,Super Mario Odyssey - Nintendo Switch,"[Embark on a captivating, globe-trotting adven...","[Video Games, Nintendo Switch, Games]",53.98,"[2190, 886, 871, 1957, 323, 2590, 142, 32, 114..."
143541,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B09KTV12WR,0.0,1639142097998,17102,1143,Video Games,Star Wars Knights of the Old Republic - Xbox,"[Product Description, Star Wars role playing e...","[Video Games, Legacy Systems, Xbox Systems, Xb...",30.0,"[886, 871, 1957, 323, 2590, 142, 32, 1146, 143..."
390,AGRHKDNSRJ3CT5ST75KGSCD4WA5A,B07V8YSBFG,5.0,1639142097998,17102,1349,Video Games,"Roblox Digital Gift Code for 1,200 Robux [Rede...",[],"[Video Games, Digital for the Holidays, Digita...",15.0,"[886, 871, 1957, 323, 2590, 142, 32, 1146, 143..."


# Persist

In [28]:
full_df.to_parquet('../data/full_features_neg_sampling_df.parquet', index=False)

In [29]:
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)