In [1]:
import os
from collections import defaultdict
from typing import Optional

import numpy as np
import pandas as pd
from loguru import logger
from pydantic import BaseModel

# Controller

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the run name, the random seed and the column names used to address
    the interaction data. Call ``init()`` once after construction to resolve
    and create the persist directory.
    """
    testing: bool = False
    run_name: str = '056-small-rich-dataset'
    # Filled in by init(); remains None until init() has been called.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    def init(self):
        """Resolve the persist directory, create it if missing, and return self.

        The directory is ``data/<run_name>`` resolved to an absolute path
        against the current working directory. Returning ``self`` allows
        ``Args().init()`` to be used as a single expression.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
# Build the config and run init() so the persist directory exists
args = Args().init()

# Show the resolved settings as indented JSON
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "056-small-rich-dataset",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/056-small-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy input: one (user, item, rating, timestamp) tuple per interaction,
# in the same order as the column names passed to pd.DataFrame below
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(interactions, columns=['user_indice', 'item_indice', args.rating_col, args.timestamp_col])

In [4]:
def generate_negative_samples(
    df,
    user_col='user_indice',
    item_col='item_indice',
    label_col='rating',
    neg_label=0,
    seed=None,
    progress_bar_type='tqdm'  # Options: 'tqdm', 'tqdm_notebook', None
):
    """
    Draw popularity-weighted negative (user, item) pairs for an interaction frame.

    For every user, items the user never interacted with are sampled without
    replacement, with probability proportional to how often each item occurs
    in ``df``. A user receives as many negatives as they have distinct
    positive items, capped by the number of unseen items; users who have seen
    every item are skipped.

    Args:
        df: Interaction DataFrame containing ``user_col`` and ``item_col``.
        user_col: Name of the user column.
        item_col: Name of the item column.
        label_col: Name of the label column added to the result.
        neg_label: Value written to ``label_col`` for every sampled row.
        seed: When not None, passed to ``np.random.seed`` (this reseeds
            NumPy's global generator).
        progress_bar_type: 'tqdm', 'tqdm_notebook', or None for no progress bar.

    Returns:
        DataFrame with columns ``[user_col, item_col, label_col]``.

    Raises:
        ImportError: If the requested tqdm flavour cannot be imported.
        ValueError: If ``progress_bar_type`` is not one of the accepted values.
    """
    if seed is not None:
        np.random.seed(seed)

    # Pick the callable that wraps the per-user iteration
    if progress_bar_type is None:
        def wrap_progress(iterable, **kwargs):
            # No-op stand-in with the same call shape as tqdm
            return iterable
    elif progress_bar_type == 'tqdm':
        try:
            from tqdm import tqdm as wrap_progress
        except ImportError:
            raise ImportError("tqdm is not installed. Please install it using 'pip install tqdm'.")
    elif progress_bar_type == 'tqdm_notebook':
        try:
            from tqdm.notebook import tqdm as wrap_progress
        except ImportError:
            raise ImportError("tqdm.notebook is not available. Please install it using 'pip install tqdm'.")
    else:
        raise ValueError("Invalid progress_bar_type. Choose 'tqdm', 'tqdm_notebook', or None.")

    # Item catalogue and interaction counts, in value_counts order
    item_counts = df[item_col].value_counts()
    catalog = item_counts.index.values
    catalog_set = set(catalog)
    counts = item_counts.values.astype(np.float64)

    # Global sampling weights: proportional to count, uniform if the counts sum to zero
    count_total = counts.sum()
    if count_total == 0:
        item_probs = np.ones(len(catalog)) / len(catalog)
    else:
        item_probs = counts / count_total

    # Position of each item inside `catalog` / `item_probs`
    position_of = dict(zip(catalog, range(len(catalog))))

    # Set of items each user has interacted with
    seen_by_user = df.groupby(user_col)[item_col].apply(set).to_dict()

    pairs = []
    user_iter = wrap_progress(
        seen_by_user.items(), total=len(seen_by_user), desc="Generating Negative Samples"
    )
    for user, seen in user_iter:
        unseen = catalog_set - seen
        if not unseen:
            # Nothing left to sample for a user who has seen the whole catalogue
            continue

        unseen_list = list(unseen)
        n_draw = min(len(seen), len(unseen_list))

        # Restrict the global weights to this user's candidates and renormalise
        weights = item_probs[[position_of[item] for item in unseen_list]]
        weights /= weights.sum()

        drawn = np.random.choice(unseen_list, size=n_draw, replace=False, p=weights)
        pairs.extend((user, item) for item in drawn)

    result = pd.DataFrame(pairs, columns=[user_col, item_col])
    result[label_col] = neg_label  # same label on every sampled row

    return result

def add_features_to_neg_df(pos_df, neg_df, user_col, timestamp_col, feature_cols=None):
    """Give each negative row the timestamp and features of one of the user's positive rows.

    Within each user, the k-th negative row (in its existing order) is paired
    with the positive row whose timestamp ranks k-th (ties broken by row
    order). The pairing is a left join, so a negative row with no positive
    row at the same rank keeps missing values in the added columns.

    Args:
        pos_df: Positive interactions; must contain ``user_col``,
            ``timestamp_col`` and every column in ``feature_cols``.
        neg_df: Negative samples; must contain ``user_col``.
        user_col: Name of the user column present in both frames.
        timestamp_col: Name of the timestamp column in ``pos_df``.
        feature_cols: Extra ``pos_df`` columns to copy onto the negatives.
            ``None`` (the default) copies only the timestamp. Entries that
            repeat ``user_col``, ``timestamp_col`` or each other are ignored
            so the column selection stays unique.

    Returns:
        A new DataFrame: ``neg_df`` plus ``timestamp_col`` and the requested
        feature columns. Neither input frame is modified.
    """
    pseudo_col = 'timestamp_pseudo'

    # Key columns are always carried over; skip repeats so the selection below
    # never yields duplicate labels (which would make the merge fail).
    carried_cols = [user_col, timestamp_col, pseudo_col]
    for col in (feature_cols or []):
        if col not in carried_cols:
            carried_cols.append(col)

    # 1-based position of each negative within its user
    neg_ranked = neg_df.assign(
        **{pseudo_col: lambda frame: frame.groupby(user_col).cumcount() + 1}
    )
    # 1-based chronological rank of each positive within its user
    pos_ranked = pos_df.assign(
        **{pseudo_col: lambda frame: frame.groupby([user_col])[timestamp_col].rank(method='first')}
    )[carried_cols]

    return (
        pd.merge(neg_ranked, pos_ranked, how='left', on=[user_col, pseudo_col])
        .drop(columns=[pseudo_col])
    )

In [5]:
# Toy interactions frame used to exercise the helpers
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [6]:
# Smoke-test both helpers on the toy frame. The draw is seeded so the sampled
# negatives are reproducible, and the label/timestamp column names come from
# `args`, matching how `df` was built.
neg_df = generate_negative_samples(
    df,
    label_col=args.rating_col,
    seed=args.random_seed,
    progress_bar_type='tqdm_notebook',
)
neg_df = add_features_to_neg_df(df, neg_df, 'user_indice', args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Show the toy negatives ordered by user; every row carries the same negative label,
# so the rating key does not reorder rows within a user
neg_df.sort_values(['user_indice', args.rating_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,105,0,1
4,2,106,0,2
5,3,103,0,1
6,3,102,0,5


# Load data

In [8]:
# Load the train / validation feature tables (paths are relative to the working directory)
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [9]:
# Sanity check: the earliest validation timestamp is strictly later than the latest training one
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
# First timestamp past the training range; used later as the train/validation cut-off
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-20 22:03:19.312[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [10]:
# Stack train and validation rows into one frame; negatives are sampled from this combined frame below.
# NOTE(review): the original row labels are kept, so index values may repeat across the two parts.
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,9784,4186,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,14535,791,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,19437,4400,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,11393,2433,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,11393,3027,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,9944,3714,"[1815, 4445, 4013, 1742, 4130, 1565, 1252, 945..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,11860,2099,"[-1, -1, -1, -1, -1, 3122, 4550, 2203, 1331, 1..."
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,18592,3726,"[-1, -1, -1, 3575, 3331, 2684, 4352, 3393, 205..."
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,2133,4298,"[4328, 4344, 3319, 2180, 2315, 3243, 1797, 979..."


In [11]:
# Columns copied from each user's positive rows onto their sampled negatives
features = ['item_sequence', 'user_id']

In [12]:
# Still the toy negatives from the smoke test; neg_df is rebound to the full-data sample in the next cell
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,105,0,1
4,2,106,0,2
5,3,103,0,1
6,3,102,0,5


In [13]:
# Sample seeded, popularity-weighted negatives over the combined frame (at most one per
# distinct positive item of each user), then copy timestamps and per-interaction
# features from the user's positive rows onto them.
neg_df = generate_negative_samples(
    full_df,
    user_col='user_indice',
    item_col='item_indice',
    label_col=args.rating_col,
    neg_label=0,
    seed=args.random_seed,
    progress_bar_type='tqdm_notebook',
)
neg_ts_df = add_features_to_neg_df(
    pos_df=full_df,
    neg_df=neg_df,
    user_col='user_indice',
    timestamp_col=args.timestamp_col,
    feature_cols=features,
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1135,0,1374237526000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEDCACWUHRUE7RWJY2HK2EH2X4OQ
1,0,207,0,1374237587000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEDCACWUHRUE7RWJY2HK2EH2X4OQ
2,0,3066,0,1379850750000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEDCACWUHRUE7RWJY2HK2EH2X4OQ
3,0,194,0,1379850880000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394...",AEDCACWUHRUE7RWJY2HK2EH2X4OQ
4,0,544,0,1379851002000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3943.0, 5...",AEDCACWUHRUE7RWJY2HK2EH2X4OQ
...,...,...,...,...,...,...
170989,20365,1958,0,1296916877000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG6BX3MK5UPMSDX4XL52AVTIDKKA
170990,20365,422,0,1298302836000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 306...",AG6BX3MK5UPMSDX4XL52AVTIDKKA
170991,20365,1516,0,1298459256000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3065.0, 2...",AG6BX3MK5UPMSDX4XL52AVTIDKKA
170992,20365,2994,0,1311155849000,"[-1.0, -1.0, -1.0, -1.0, -1.0, 3065.0, 2720.0,...",AG6BX3MK5UPMSDX4XL52AVTIDKKA


## Join with features

In [14]:
# Key, label and per-interaction columns; every other full_df column is treated as item metadata.
# args.item_col is not excluded here, so it is carried along as item metadata.
not_meta_feature_cols = (args.user_col, 'user_indice', 'item_indice', args.rating_col, args.timestamp_col, *features)
meta_features = [col for col in full_df.columns if col not in not_meta_feature_cols]
meta_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [15]:
# Attach item metadata to every sampled negative. The lookup keeps one row per
# item_indice, and validate="m:1" makes the merge raise if that uniqueness fails.
neg_ts_df = neg_ts_df.merge(
    full_df[['item_indice', *meta_features]].drop_duplicates(subset=['item_indice']),
    on=['item_indice'],
    how='left',
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1135,0,1374237526000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEDCACWUHRUE7RWJY2HK2EH2X4OQ,B0044DE88O,,Top Spin 4 - Playstation 3,"[Product Description, The latest iteration fro...","[Video Games, Legacy Systems, PlayStation Syst...",41.47
1,0,207,0,1374237587000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEDCACWUHRUE7RWJY2HK2EH2X4OQ,B000B6ML0U,Video Games,Perfect Dark Zero,"[From famed game developer Rare LTD., Perfect ...","[Video Games, Legacy Systems, Xbox Systems, Xb...",20.31
2,0,3066,0,1379850750000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AEDCACWUHRUE7RWJY2HK2EH2X4OQ,B001ELJEJM,Video Games,GameCube Console - Legend of Zelda Bundle - In...,"[Product description, The nintendo gamecube, A...","[Video Games, Legacy Systems, Nintendo Systems...",
3,0,194,0,1379850880000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 394...",AEDCACWUHRUE7RWJY2HK2EH2X4OQ,B087SLTR2B,Video Games,NBA 2K20 Legend Edition Xbox One,[NBA 2K has evolved into much more than a bask...,"[Video Games, Xbox One, Games]",18.99
4,0,544,0,1379851002000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3943.0, 5...",AEDCACWUHRUE7RWJY2HK2EH2X4OQ,B014R4KYMS,Video Games,Uncharted 4: A Thief's End - PlayStation 4,[Uncharted comes to the PlayStation 4.Uncharte...,"[Video Games, PlayStation 4, Games]",24.99
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,1958,0,1296916877000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AG6BX3MK5UPMSDX4XL52AVTIDKKA,B01I59OC92,Video Games,Dragon Ball Xenoverse 2 - Xbox One Day One Edi...,[Developed to fully utilize the power of curre...,"[Video Games, Xbox One, Downloadable Content]",16.71
170990,20365,422,0,1298302836000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 306...",AG6BX3MK5UPMSDX4XL52AVTIDKKA,B003JBHG90,Video Games,LEGO Star Wars III: The Clone Wars,[The beloved and critically acclaimed LEGO Sta...,"[Video Games, Legacy Systems, Nintendo Systems...",32.0
170991,20365,1516,0,1298459256000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3065.0, 2...",AG6BX3MK5UPMSDX4XL52AVTIDKKA,B00EN9Q8G4,,Forza Motorsport 5,"[Product description, *The DLC (Downloadable C...","[Video Games, Xbox One, Games]",64.98
170992,20365,2994,0,1311155849000,"[-1.0, -1.0, -1.0, -1.0, -1.0, 3065.0, 2720.0,...",AG6BX3MK5UPMSDX4XL52AVTIDKKA,B09CB6ZXD7,Video Games,UniKeep Game Case for Nintendo Switch Cartridg...,[The UniKeep cartridge game case for the Ninte...,"[Video Games, Nintendo Switch, Accessories, Ca...",21.99


In [16]:
# Spot check on the item of one randomly drawn negative row: it must map to exactly one
# item_indice in the negatives and in the original data, and the two indices must agree.
item = neg_ts_df.sample(n=1)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(neg_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(original_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice at original df"
assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

[32m2024-09-20 22:03:28.616[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B00KVP76G0...[0m


In [17]:
# Append the negatives to the positives and shuffle all rows with the run seed.
# NOTE(review): this rebinds full_df, so re-running the cell appends the negatives again.
full_df = pd.concat([full_df, neg_ts_df], axis=0).sample(frac=1, replace=False, random_state=args.random_seed)

In [18]:
# Combined positives and negatives after shuffling
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
111519,AFSZEZS5KHRDHLP5ZD4IDCCBMJYQ,B07DPK5NPD,0.0,1320083109000,Video Games,Super Smash Bros. - Nintendo Wii U,[Battle it out as Nintendo's greatest heroes o...,"[Video Games, Legacy Systems, Nintendo Systems...",39.85,13195,1901,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
65458,AGREN2RJTIPXMXHWAOMIEGDFHZPA,B01FDA42PU,0.0,1552970538339,Video Games,"Carry Case for Nintendo New 2DS XL/New 3DS XL,...",[],"[Video Games, Legacy Systems, Nintendo Systems...",,7792,1647,"[1519.0, 1564.0, 4468.0, 2958.0, 252.0, 3867.0..."
79874,AHKR6VDHTSAFNLSYN7SSKMSUSAXQ,B001EYUX1W,0.0,1445897840000,Video Games,Virtua Fighter 5 - Playstation 3,[Virtua Fighter returns with the next release ...,"[Video Games, Legacy Systems, PlayStation Syst...",37.0,9537,3221,"[1206.0, 4305.0, 4192.0, 3503.0, 452.0, 2396.0..."
102744,AF2766AORO3SS27FM3KBI543BAUQ,B07RQP5S8C,0.0,1241381283000,Cell Phones & Accessories,"Lamicall Adjustable Phone Tablet Stand, Playst...",[],"[Video Games, Nintendo Switch, Accessories, Mo...",15.99,12146,2976,"[-1.0, -1.0, -1.0, -1.0, 2875.0, 4046.0, 4036...."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,15572,2578,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3957.0, 3..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AFLEWAWJTWCBAWSW4Q2Y2JD3NS4A,B002BS47TE,0.0,1476584201000,Video Games,Metal Gear Solid Peace Walker,"[Product Description, Stories tell of a legend...","[Video Games, Legacy Systems, PlayStation Syst...",55.96,9548,140,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
46064,AHAZRPFU3P6RE6CK2CB5X6A2AQGA,B07N5LL4YW,0.0,1480928712000,Video Games,Knack (PlayStation 4),"[From the Manufacturer, An Unlikely Hero, Mank...","[Video Games, PlayStation 4, Games]",22.68,5565,1396,"[-1.0, -1.0, -1.0, -1.0, 1104.0, 1513.0, 1228...."
152474,AEE5TEYNQCMI62LBMOP2AIKXPCRA,B007YZCE94,0.0,1358478392000,Video Games,LEGO Batman 2: DC Super Heroes - Nintendo 3DS,"[Product Description, The Dynamic Duo returns ...","[Video Games, Legacy Systems, Nintendo Systems...",28.05,18137,2338,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1534.0, 996.0, ..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,4979,2263,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 190..."


In [19]:
# Columns that must contain no nulls in the combined frame
key_cols = [args.user_col, args.item_col, 'user_indice', 'item_indice', 'item_sequence', args.rating_col, args.timestamp_col]
assert full_df[key_cols].isna().sum().sum() == 0, "Null values found at key colums"

In [20]:
# Train/validation cut-off derived earlier from the latest training timestamp
val_timestamp

np.int64(1628641464793)

In [21]:
# Time-based split of the combined frame: rows before the cut-off go to train,
# rows at or after it go to validation
train_neg_df = full_df[full_df[args.timestamp_col] < val_timestamp]
val_neg_df = full_df[full_df[args.timestamp_col] >= val_timestamp]

In [22]:
# Training split (positives and negatives before the cut-off)
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
111519,AFSZEZS5KHRDHLP5ZD4IDCCBMJYQ,B07DPK5NPD,0.0,1320083109000,Video Games,Super Smash Bros. - Nintendo Wii U,[Battle it out as Nintendo's greatest heroes o...,"[Video Games, Legacy Systems, Nintendo Systems...",39.85,13195,1901,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
65458,AGREN2RJTIPXMXHWAOMIEGDFHZPA,B01FDA42PU,0.0,1552970538339,Video Games,"Carry Case for Nintendo New 2DS XL/New 3DS XL,...",[],"[Video Games, Legacy Systems, Nintendo Systems...",,7792,1647,"[1519.0, 1564.0, 4468.0, 2958.0, 252.0, 3867.0..."
79874,AHKR6VDHTSAFNLSYN7SSKMSUSAXQ,B001EYUX1W,0.0,1445897840000,Video Games,Virtua Fighter 5 - Playstation 3,[Virtua Fighter returns with the next release ...,"[Video Games, Legacy Systems, PlayStation Syst...",37.0,9537,3221,"[1206.0, 4305.0, 4192.0, 3503.0, 452.0, 2396.0..."
102744,AF2766AORO3SS27FM3KBI543BAUQ,B07RQP5S8C,0.0,1241381283000,Cell Phones & Accessories,"Lamicall Adjustable Phone Tablet Stand, Playst...",[],"[Video Games, Nintendo Switch, Accessories, Mo...",15.99,12146,2976,"[-1.0, -1.0, -1.0, -1.0, 2875.0, 4046.0, 4036...."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,15572,2578,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3957.0, 3..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AFLEWAWJTWCBAWSW4Q2Y2JD3NS4A,B002BS47TE,0.0,1476584201000,Video Games,Metal Gear Solid Peace Walker,"[Product Description, Stories tell of a legend...","[Video Games, Legacy Systems, PlayStation Syst...",55.96,9548,140,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
46064,AHAZRPFU3P6RE6CK2CB5X6A2AQGA,B07N5LL4YW,0.0,1480928712000,Video Games,Knack (PlayStation 4),"[From the Manufacturer, An Unlikely Hero, Mank...","[Video Games, PlayStation 4, Games]",22.68,5565,1396,"[-1.0, -1.0, -1.0, -1.0, 1104.0, 1513.0, 1228...."
152474,AEE5TEYNQCMI62LBMOP2AIKXPCRA,B007YZCE94,0.0,1358478392000,Video Games,LEGO Batman 2: DC Super Heroes - Nintendo 3DS,"[Product Description, The Dynamic Duo returns ...","[Video Games, Legacy Systems, Nintendo Systems...",28.05,18137,2338,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1534.0, 996.0, ..."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,4979,2263,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 190..."


In [23]:
# Validation split (positives and negatives at or after the cut-off)
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
74,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,8046,2332,"[1614, 3373, 970, 912, 455, 572, 1904, 4, 3467..."
892,AFFPVZ3JNCTQIKAK4XK37E2ENWWA,B00HVBPRUO,4.0,1655428133046,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,19677,621,"[-1, -1, 4649, 2405, 2655, 579, 3089, 3955, 44..."
26663,AG7A42N537XQMGC5URJ6VZ2JO2FA,B00J48C36S,0.0,1634619046379,Video Games,Assassin's Creed Unity - Xbox One,"[Paris, 1789. The French Revolution turns a on...","[Video Games, Xbox One, Games]",23.5,3240,662,"[-1, -1, 2006, 3075, 2589, 1270, 421, 1524, 15..."
285,AFBRTNVOROW7UVA66UPX5YCFC6MQ,B07YBXFDYK,3.0,1636189764550,Video Games,The Evil Within 2 - PlayStation 4,"[From Shinji Mikami, The Evil Within 2 takes t...","[Video Games, PlayStation 4, Games]",20.98,12758,4410,"[-1, -1, -1, -1, 4407, 378, 3920, 831, 1594, 2..."
7221,AEIS45LEWNPTLJCOLT2NPJ4NFZWQ,B081243BT6,0.0,1637350483061,Cell Phones & Accessories,Orzly Carrying case for Nintendo Switch OLED a...,[],"[Video Games, Nintendo Switch, Accessories, Ca...",29.99,846,2216,"[991, 1867, 3847, 635, 933, 4635, 3781, 154, 4..."
...,...,...,...,...,...,...,...,...,...,...,...,...
532,AFUWPAK6VCGEL2OVIL2YGZNFQJZQ,B08N6NCR3Q,4.0,1642699950266,Video Games,Thrustmaster T 16000M SPACE SIM DUO STICK (PC),[The THRUSTMASTER T.16000M FCS Space Sim Duo c...,"[Video Games, PC, Accessories, Controllers, Fl...",119.51,19807,1968,"[-1, -1, -1, -1, 628, 2575, 892, 3726, 1213, 249]"
17892,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B0C3KJJ6XS,0.0,1657945454164,Computers,Razer Nari Ultimate Wireless 7.1 Surround Soun...,[Razer Nari Ultimate: 2.4GHz wireless PC gamin...,"[Video Games, PC, Accessories, Headsets]",100.42,2133,3530,"[4328, 4344, 3319, 2180, 2315, 3243, 1797, 979..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,11918,1339,"[-1, 1127, 575, 2328, 2637, 4378, 3947, 4223, ..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,11328,1289,"[-1, -1, -1, -1, -1, 3308, 3848, 2264, 2084, 1..."


# Checks

In [24]:
# Pick one user who appears in the validation split and pull their training rows in time order
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
# Positives (rating > 0) must make up exactly half of that user's training rows
assert check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2, "Number of pos and neg samples are not equal"

[32m2024-09-20 22:03:28.751[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AECVJHLMDSTXBDMDZWEYHNWOGDTQ...[0m


In [25]:
# Same user: take the first positive item from their time-ordered validation rows
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
# Require more than 5 training rows (positive or negative) for that item
assert train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5, f"Item {item} does not appear much in training data"

[32m2024-09-20 22:03:28.763[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mChecking item B09GM4283G...[0m


## Random eyeball check

In [31]:
# Training rows of the sampled user, in time order
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
24194,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B000038IFX,5.0,1336153255000,Video Games,Xenogears - PlayStation,"[Product description, A mysterious organizatio...","[Video Games, Legacy Systems, PlayStation Syst...",239.98,17569,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147972,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B00269QLI8,0.0,1336153255000,Video Games,Call of Duty Modern Warfare 2 - Xbox 360,"[Clear your schedule, the most anticipated gam...","[Video Games, Legacy Systems, Xbox Systems, Xb...",31.0,17569,3457,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
38320,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B0045U01OG,5.0,1376332282000,Video Games,Grand Theft Auto IV & Episodes from Liberty Ci...,[What does the American dream mean today? For ...,"[Video Games, Legacy Systems, PlayStation Syst...",49.95,17569,3988,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147973,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B00I5LAGM8,0.0,1376332282000,Video Games,Metal Gear Solid V: Ground Zeroes - PlayStatio...,[World-renowned Kojima Productions showcases t...,"[Video Games, Legacy Systems, PlayStation Syst...",34.93,17569,3020,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147974,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B00CXTX2YW,0.0,1376332349000,Video Games,Xbox 360 Wireless Controller - Camouflage,[Ambush your opponents with the Xbox 360 Speci...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",40.99,17569,1935,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
38321,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B005OGPTSS,5.0,1376332349000,Video Games,Red Dead Redemption: Game of the Year Edition ...,"[Red Dead Redemption is a Western epic, set at...","[Video Games, Legacy Systems, Xbox Systems, Xb...",28.99,17569,2196,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147975,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B07MGMKCS8,0.0,1377702950000,Video Games,Nintendo Selects: Yoshi's New Island,[Nintendo Selects highlights a variety of grea...,"[Video Games, Legacy Systems, Nintendo Systems...",34.99,17569,3645,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 217..."
38841,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B008CP6RWU,5.0,1377702950000,Video Games,PS3 Journey Collection,"[Product Description, The Journey Collector’s ...","[Video Games, Legacy Systems, PlayStation Syst...",28.99,17569,4188,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 217..."
79751,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B00L4SD1F8,5.0,1433276290000,Video Games,Witcher 3: Wild Hunt Complete Edition - PC,[Become a professional monster slayer and emba...,"[Video Games, PC, Games]",39.97,17569,2966,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2170.0, 3..."
147976,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B00CTKHXFO,0.0,1433276290000,Video Games,Metal Gear Solid: The Legacy Collection,[From critically acclaimed director Hideo Koji...,"[Video Games, Legacy Systems, PlayStation Syst...",119.97,17569,1103,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2170.0, 3..."


In [32]:
# Validation rows of the same user, in time order
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
250,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B09GM4283G,5.0,1634842797111,Video Games,PlayStation PULSE 3D Wireless Headset – Midnig...,[Ignite your gaming nights with the ultra-slee...,"[Video Games, PlayStation 5, Accessories, Gami...",99.0,17569,1146,"[-1, -1, -1, -1, 2170, 3988, 2196, 4188, 2966,..."
147978,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B06XSZDFKX,0.0,1634842797111,Computers,MAYFLASH W010 Wireless Sensor Dolphinbar for P...,[],"[Video Games, Legacy Systems, Nintendo Systems...",29.99,17569,4219,"[-1, -1, -1, -1, 2170, 3988, 2196, 4188, 2966,..."
147979,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B073XJX6GW,0.0,1634842858472,Video Games,Pillars of Eternity: Complete Edition - PlaySt...,[Every action has a consequence! Pillars of E...,"[Video Games, PlayStation 4, Games]",41.97,17569,2423,"[-1, -1, -1, 2170, 3988, 2196, 4188, 2966, 829..."
251,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B094YHB1QK,5.0,1634842858472,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,17569,1339,"[-1, -1, -1, 2170, 3988, 2196, 4188, 2966, 829..."
252,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B08FC6Y4VG,5.0,1634842902687,Video Games,Playstation DualSense Charging Station,[Charge up to two DualSense wireless controlle...,"[Video Games, PlayStation 5, Accessories, Batt...",29.99,17569,1005,"[-1, -1, 2170, 3988, 2196, 4188, 2966, 829, 11..."
147980,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B001E91OP6,0.0,1634842902687,Video Games,Rogue Warrior - PC,[Rogue Warrior is a story-driven shooter that ...,"[Video Games, PC, Games]",1.49,17569,1303,"[-1, -1, 2170, 3988, 2196, 4188, 2966, 829, 11..."
253,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B0872XQSBK,5.0,1634842936892,Video Games,Persona 5 Royal: Phantom Thieves,[Prepare for an all-new RPG experience in Pers...,"[Video Games, PlayStation 4, Games]",249.49,17569,1447,"[-1, 2170, 3988, 2196, 4188, 2966, 829, 1146, ..."
147981,AECVJHLMDSTXBDMDZWEYHNWOGDTQ,B004L4AZ7Y,0.0,1634842936892,Video Games,Rayman 3D - Nintendo 3DS,"[Product Description, Rayman, one of Ubisoft's...","[Video Games, Legacy Systems, Nintendo Systems...",,17569,2724,"[-1, 2170, 3988, 2196, 4188, 2966, 829, 1146, ..."


# Persist

In [27]:
# Persist the combined, shuffled frame; the row index is not written
full_df.to_parquet('../data/full_features_neg_sampling_df.parquet', index=False)

In [28]:
# Persist the train and validation splits; the row index is not written
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)