In [1]:
import os
import pandas as pd
import numpy as np
from loguru import logger
from collections import defaultdict
from pydantic import BaseModel

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = '062-medium-rich-dataset'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "062-medium-rich-dataset",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/062-medium-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(interactions, columns=['user_indice', 'item_indice', args.rating_col, args.timestamp_col])

In [4]:
def generate_negative_samples(
    df,
    user_col='user_indice',
    item_col='item_indice',
    label_col='rating',
    neg_label=0,
    seed=None,
    progress_bar_type='tqdm'  # Options: 'tqdm', 'tqdm_notebook', None
):
    """
    Optimized function to generate negative samples for a user-item interaction DataFrame.
    """
    
    # Handle random seed
    if seed is not None:
        np.random.seed(seed)
    
    # Import tqdm based on the progress_bar_type
    if progress_bar_type == 'tqdm':
        try:
            from tqdm import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm is not installed. Please install it using 'pip install tqdm'.")
    elif progress_bar_type == 'tqdm_notebook':
        try:
            from tqdm.notebook import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm.notebook is not available. Please install it using 'pip install tqdm'.")
    elif progress_bar_type is None:
        # Define a dummy tqdm function that does nothing
        def tqdm_bar(iterable, **kwargs):
            return iterable
    else:
        raise ValueError("Invalid progress_bar_type. Choose 'tqdm', 'tqdm_notebook', or None.")
    
    # Calculate item popularity based on the number of interactions
    item_popularity = df[item_col].value_counts()
    
    # Define all unique items from the DataFrame
    items = item_popularity.index.values
    all_items_set = set(items)
    
    # Create a user-item interaction dictionary
    user_item_dict = df.groupby(user_col)[item_col].apply(set).to_dict()
    
    # Prepare items list and corresponding popularity array
    popularity = item_popularity.values.astype(np.float64)
    
    # Calculate sampling probabilities based on item popularity
    total_popularity = popularity.sum()
    if total_popularity == 0:
        sampling_probs = np.ones(len(items)) / len(items)
    else:
        sampling_probs = popularity / total_popularity
    
    # Create item to index mapping for quick access
    item_to_index = {item: idx for idx, item in enumerate(items)}
    
    # Initialize a list to store negative samples
    negative_samples = []
    
    # Initialize the progress bar
    total_users = len(user_item_dict)
    progress_bar = tqdm_bar(user_item_dict.items(), total=total_users, desc="Generating Negative Samples")
    
    for user, pos_items in progress_bar:
        num_pos = len(pos_items)
        
        # Identify items not interacted with by the user
        negative_candidates = all_items_set - pos_items
        num_neg_candidates = len(negative_candidates)
        
        if num_neg_candidates == 0:
            # User has interacted with all items, skip negative sampling
            continue
        
        # Determine the number of negative samples to generate
        num_neg = min(num_pos, num_neg_candidates)
        
        # Convert set to list for indexing
        negative_candidates_list = list(negative_candidates)
        
        # Get the indices and probabilities of negative candidates
        candidate_indices = [item_to_index[item] for item in negative_candidates_list]
        candidate_probs = sampling_probs[candidate_indices]
        candidate_probs /= candidate_probs.sum()
        
        # Sample negative items without replacement
        sampled_items = np.random.choice(
            negative_candidates_list, size=num_neg, replace=False, p=candidate_probs
        )
        
        # Append the sampled negative items to the list
        negative_samples.extend([(user, item) for item in sampled_items])
    
    # Convert negative samples to a DataFrame
    df_negative = pd.DataFrame(negative_samples, columns=[user_col, item_col])
    df_negative[label_col] = neg_label  # Assign label for negative samples
    
    return df_negative

def add_features_to_neg_df(pos_df, neg_df, user_col, timestamp_col, feature_cols=[]):
    neg_df = neg_df.assign(
        timestamp_pseudo=lambda df: df.groupby(user_col).cumcount() + 1
    )
    neg_df = (
        pd.merge(
            neg_df,
            pos_df.assign(
                timestamp_pseudo=lambda df: df.groupby([user_col])[timestamp_col].rank(method='first')   
            )[[user_col, timestamp_col, 'timestamp_pseudo', *feature_cols]],
            how='left',
            on=[user_col, 'timestamp_pseudo']
        )
        .drop(columns=['timestamp_pseudo'])
    )
    return neg_df

In [5]:
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [6]:
neg_df = generate_negative_samples(df, progress_bar_type='tqdm_notebook')
neg_df = add_features_to_neg_df(df, neg_df, 'user_indice', 'timestamp')

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
neg_df.sort_values(['user_indice', args.rating_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,106,0,2
2,1,105,0,4
3,2,102,0,1
4,2,105,0,2
5,3,102,0,1
6,3,103,0,5


# Load data

In [8]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [9]:
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-21 08:53:33.414[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628641464793)[0m


In [10]:
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
0,AFSP4K4T5WBMQ3ZMDZ46QWLVQVBQ,B00001IVB4,5.0,942965209000,9372,1627,Video Games,Sim Theme Park - PC,[],"[Video Games, PC, Games]",35.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGR5GFWFLYXKD7ZQS5AEAZUQR5QA,B00002NDRY,3.0,947856017000,10047,4574,Video Games,Age of Empires 2: Age of Kings - PC,"[Product description, Age of Empires II: Age o...","[Video Games, PC, Games]",64.88,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEXEI37RJQEQDQLYNH3QCJTF6A7Q,B001E91OQA,5.0,951150553000,18020,2452,Video Games,Roller Coaster Tycoon - PC,"[Amazon.com, Design your own roller coaster. S...","[Video Games, PC, Games]",40.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B001EYUWY0,5.0,952016747000,10065,1214,Video Games,Unreal Tournament - PlayStation 2,"[Product Description, For the first time ever,...","[Video Games, Legacy Systems, PlayStation Syst...",41.53,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEDPPLOVCQATUO5VTZDQSAD6MBJQ,B00001KUII,5.0,952143204000,10065,847,Video Games,Half-Life: Game of the Year Edition - PC,"[Product description, The critics agree. Half-...","[Video Games, PC, Games]",41.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
944,AEKYV77UMZZGHT4PZIETDQ6ELJBQ,B08F4C6HCD,5.0,1657816667680,14453,1618,Video Games,Legend of Zelda Link's Awakening - Nintendo Sw...,"[“Castaway, you should know the truth!” As Lin...","[Video Games, Nintendo Switch, Games]",59.88,"[1137, 2285, 1696, 329, 3287, 1434, 3235, 3649..."
945,AGUFCRCH7HOUQ5FQYSJETEEFAYOA,B00DBDPOZ4,5.0,1657855227062,19653,116,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99,"[-1, -1, -1, -1, -1, 320, 3342, 4354, 3523, 1377]"
946,AHJUZFMUESAEQBPC2QQMBDVUBYFQ,B0B1PB5L93,4.0,1657883331431,2970,1750,Computers,Razer Viper Ultimate Lightweight Wireless Gami...,[Forget about average and claim the unfair adv...,"[Video Games, PC, Accessories, Gaming Mice]",89.99,"[-1, -1, -1, 836, 1252, 364, 566, 483, 1590, 371]"
947,AE5UUBPDQX4MRFFDW7D3IKHQYIEQ,B00ZJBSBD8,5.0,1657945454164,852,689,Video Games,Trackmania Turbo-Nla,[Step into the wild car fantasy world of Track...,"[Video Games, PlayStation 4, Games]",13.68,"[1623, 4019, 1097, 1481, 4197, 4292, 1602, 130..."


In [11]:
features = ['item_sequence', 'user_id']

In [12]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,106,0,2
2,1,105,0,4
3,2,102,0,1
4,2,105,0,2
5,3,102,0,1
6,3,103,0,5


In [13]:
neg_df = generate_negative_samples(full_df, 'user_indice', 'item_indice', args.rating_col, neg_label=0, seed=args.random_seed, progress_bar_type='tqdm_notebook')
neg_ts_df = add_features_to_neg_df(full_df, neg_df, 'user_indice', args.timestamp_col, features)
neg_ts_df

Generating Negative Samples:   0%|          | 0/20366 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1116,0,1361438564000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGRCOIBXZHTAJQUVKJQMVJPSK6JA
1,0,214,0,1361438952000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGRCOIBXZHTAJQUVKJQMVJPSK6JA
2,0,3147,0,1361439949000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGRCOIBXZHTAJQUVKJQMVJPSK6JA
3,0,206,0,1361440181000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 371...",AGRCOIBXZHTAJQUVKJQMVJPSK6JA
4,0,513,0,1361440412000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3711.0, 4...",AGRCOIBXZHTAJQUVKJQMVJPSK6JA
...,...,...,...,...,...,...
170989,20365,1450,0,1356195631000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHAMGTQVOIWTGBKSXNDICWLLYVJQ
170990,20365,4589,0,1356195680000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 541...",AHAMGTQVOIWTGBKSXNDICWLLYVJQ
170991,20365,116,0,1356195807000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 541.0, 40...",AHAMGTQVOIWTGBKSXNDICWLLYVJQ
170992,20365,2054,0,1373669628000,"[-1.0, -1.0, -1.0, -1.0, -1.0, 541.0, 4085.0, ...",AHAMGTQVOIWTGBKSXNDICWLLYVJQ


#### Join with features

In [14]:
not_meta_feature_cols = (args.user_col, 'user_indice', 'item_indice', args.rating_col, args.timestamp_col, *features)
meta_features = [col for col in full_df.columns if col not in not_meta_feature_cols]
meta_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [15]:
neg_ts_df = (
    pd.merge(
        neg_ts_df,
        full_df[['item_indice', *meta_features]].drop_duplicates(subset=['item_indice']),
        how='left',
        on=['item_indice'],
        validate="m:1"
    )   
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,1116,0,1361438564000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGRCOIBXZHTAJQUVKJQMVJPSK6JA,B000P46NKC,Video Games,Resident Evil 4,[In Resident Evil 4 you'll know a new type of ...,"[Video Games, Legacy Systems, Nintendo Systems...",44.98
1,0,214,0,1361438952000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGRCOIBXZHTAJQUVKJQMVJPSK6JA,B07PZ8NZSZ,All Electronics,PlayStation 4 Pro 1TB Console,"[PS4 Pro, 4K TV GAMING & MORE, The most advanc...","[Video Games, PlayStation 4, Consoles]",
2,0,3147,0,1361439949000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGRCOIBXZHTAJQUVKJQMVJPSK6JA,B002BSC54I,Video Games,The Legend of Zelda: Skyward Sword with Music CD,"[A Link to the Future, One of the most storied...","[Video Games, Legacy Systems, Nintendo Systems...",21.2
3,0,206,0,1361440181000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 371...",AGRCOIBXZHTAJQUVKJQMVJPSK6JA,B06XBN6NCH,Video Games,Middle-Earth: Shadow Of War - PlayStation 4,[The sequel to the critically-acclaimed Middle...,"[Video Games, PlayStation 4, Games]",
4,0,513,0,1361440412000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3711.0, 4...",AGRCOIBXZHTAJQUVKJQMVJPSK6JA,B002I0J51U,Video Games,Playstation 3 Move Motion Controller,"[Product Description, PlayStation Move redefin...","[Video Games, Legacy Systems, PlayStation Syst...",43.5
...,...,...,...,...,...,...,...,...,...,...,...,...
170989,20365,1450,0,1356195631000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AHAMGTQVOIWTGBKSXNDICWLLYVJQ,B07YBWT3PK,Video Games,Fallout 4 - Xbox One,"[Bethesda Game Studios, the award-winning crea...","[Video Games, Xbox One, Games]",8.16
170990,20365,4589,0,1356195680000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 541...",AHAMGTQVOIWTGBKSXNDICWLLYVJQ,B002JTX7HI,Video Games,Lord of the Rings: Aragorn's Quest - PlayStati...,"[Product Description, Play as the legendary Ar...","[Video Games, Legacy Systems, PlayStation Syst...",13.97
170991,20365,116,0,1356195807000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 541.0, 40...",AHAMGTQVOIWTGBKSXNDICWLLYVJQ,B00DBDPOZ4,Video Games,Xbox One Play and Charge Kit,[Keep the action going with the Xbox One Play ...,"[Video Games, Xbox One, Accessories]",34.99
170992,20365,2054,0,1373669628000,"[-1.0, -1.0, -1.0, -1.0, -1.0, 541.0, 4085.0, ...",AHAMGTQVOIWTGBKSXNDICWLLYVJQ,B07VLCRZ21,Video Games,Sniper: Ghost Warrior - Xbox 360,[The democratic government of Isla Trueno has ...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",14.48


In [16]:
item = neg_ts_df.sample(n=1)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(neg_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(original_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice at original df"
assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

[32m2024-09-21 08:53:42.787[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B015Z31TVA...[0m


In [17]:
full_df = pd.concat([full_df, neg_ts_df], axis=0).sample(frac=1, replace=False, random_state=args.random_seed)

In [18]:
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AHCEA6WBESM5FK4G5CDKDEPKKGSQ,B007WPGT2Y,0.0,1569331495325,13244,3,Video Games,Lost Planet 3 - Playstation 3,"[Product Description, The latest chapter in th...","[Video Games, Legacy Systems, PlayStation Syst...",22.5,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2524.0, 2..."
65458,AG4PSRFHA4J4I52AMJETLGJ4HW2A,B005QP3MGI,0.0,1491364057000,7728,928,Video Games,PS3 New Owner's Kit,[Official New Owner’s Kit – Essential accessor...,[],,"[1818.0, 2213.0, 1450.0, 2325.0, 1285.0, 2565...."
79874,AGMRJJEY7OS6ULDZY25CTTPIWLQQ,B001EYUPSS,0.0,1288372826000,9439,1979,Video Games,Dr. Mario,[CARTRIDGE ONLY. cartridge is in good shape as...,"[Video Games, Legacy Systems, Nintendo Systems...",20.35,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
102744,AG5QFNBLUBNZPIC7ZY5VXBBYBU7Q,B00MUMVV3G,0.0,1385142362000,12206,2549,Video Games,Tales of Xillia 2 - PlayStation 3,[TALES OF XILLIA 2 IS SET A YEAR AFTER TALES O...,"[Video Games, Legacy Systems, PlayStation Syst...",39.95,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 349..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,15823,2833,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 88.0, 280..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AHEMMMLBCKH56UMHL4JOLPNVM7OQ,B00O3JSRHW,0.0,1527498673890,9449,3323,Video Games,Freedom Wars - PS Vita [Digital Code],"[In the not-too-distant future, atmospheric po...","[Video Games, Legacy Systems, PlayStation Syst...",,"[4117.0, 3855.0, 4583.0, 1176.0, 1050.0, 4180...."
46064,AFZVQS3TKOPG4UE2XC6VS4ASQJUQ,B01GY35QPU,0.0,1474906869000,5435,2649,Video Games,Detroit Become Human - PlayStation 4,[Detroit: Become Human is the latest title in ...,"[Video Games, PlayStation 4, Games]",21.4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 245..."
152474,AGIYR2AKHRPWYSJIJARKK6X3GTPA,B001E8WQI6,0.0,1340164465000,18160,3844,Video Games,Red Faction Guerrilla - Xbox 360,"[Product Description, Set 50 years after the c...","[Video Games, Legacy Systems, Xbox Systems, Xb...",17.82,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,8288,2708,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 137..."


In [19]:
key_cols = [args.user_col, args.item_col, 'user_indice', 'item_indice', 'item_sequence', args.rating_col, args.timestamp_col]
assert full_df[key_cols].isna().sum().sum() == 0, "Null values found at key colums"

In [20]:
val_timestamp

np.int64(1628641464793)

In [21]:
train_neg_df = full_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
val_neg_df = full_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]

In [22]:
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
111519,AHCEA6WBESM5FK4G5CDKDEPKKGSQ,B007WPGT2Y,0.0,1569331495325,13244,3,Video Games,Lost Planet 3 - Playstation 3,"[Product Description, The latest chapter in th...","[Video Games, Legacy Systems, PlayStation Syst...",22.5,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2524.0, 2..."
65458,AG4PSRFHA4J4I52AMJETLGJ4HW2A,B005QP3MGI,0.0,1491364057000,7728,928,Video Games,PS3 New Owner's Kit,[Official New Owner’s Kit – Essential accessor...,[],,"[1818.0, 2213.0, 1450.0, 2325.0, 1285.0, 2565...."
79874,AGMRJJEY7OS6ULDZY25CTTPIWLQQ,B001EYUPSS,0.0,1288372826000,9439,1979,Video Games,Dr. Mario,[CARTRIDGE ONLY. cartridge is in good shape as...,"[Video Games, Legacy Systems, Nintendo Systems...",20.35,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
102744,AG5QFNBLUBNZPIC7ZY5VXBBYBU7Q,B00MUMVV3G,0.0,1385142362000,12206,2549,Video Games,Tales of Xillia 2 - PlayStation 3,[TALES OF XILLIA 2 IS SET A YEAR AFTER TALES O...,"[Video Games, Legacy Systems, PlayStation Syst...",39.95,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 349..."
56463,AFAUFOADANHRMH3GR2FA73A6NHIA,B08VFQ3XJX,5.0,1408655680000,15823,2833,Video Games,Final Fantasy X,"[Product Description, Final Fantasy X finally ...","[Video Games, Legacy Systems, PlayStation Syst...",20.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 88.0, 280..."
...,...,...,...,...,...,...,...,...,...,...,...,...
79966,AHEMMMLBCKH56UMHL4JOLPNVM7OQ,B00O3JSRHW,0.0,1527498673890,9449,3323,Video Games,Freedom Wars - PS Vita [Digital Code],"[In the not-too-distant future, atmospheric po...","[Video Games, Legacy Systems, PlayStation Syst...",,"[4117.0, 3855.0, 4583.0, 1176.0, 1050.0, 4180...."
46064,AFZVQS3TKOPG4UE2XC6VS4ASQJUQ,B01GY35QPU,0.0,1474906869000,5435,2649,Video Games,Detroit Become Human - PlayStation 4,[Detroit: Become Human is the latest title in ...,"[Video Games, PlayStation 4, Games]",21.4,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 245..."
152474,AGIYR2AKHRPWYSJIJARKK6X3GTPA,B001E8WQI6,0.0,1340164465000,18160,3844,Video Games,Red Faction Guerrilla - Xbox 360,"[Product Description, Set 50 years after the c...","[Video Games, Legacy Systems, Xbox Systems, Xb...",17.82,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132003,AEFJKBRX2TPPMMJ53DKHANKRYDXQ,B08N6NMGNB,5.0,1510350967758,8288,2708,Video Games,Thrustmaster T300 RS - Gran Turismo Edition Ra...,[Works with PS5 games (PS5 games compatibility...,"[Video Games, Legacy Systems, PlayStation Syst...",449.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 137..."


In [23]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
94632,AFKTALCJYFHSQUWRNFWPZKGCCXRQ,B00PGLF7G0,0.0,1633973559440,11207,1536,Video Games,DEAD OR ALIVE 5 Last Round - Xbox One,[DEAD OR ALIVE 5 Last Round will take to the r...,"[Video Games, Xbox One, Games]",59.91,"[-1, -1, 2691, 1450, 1775, 2796, 3327, 4451, 4..."
74,AEWCUX5UKUYPDZJIOB6XMLCBJ3KA,B0BLFYF8K2,4.0,1630263342566,8653,3753,Computers,"Logitech G600 MMO Gaming Mouse, RGB Backlit, 2...","[With 20 buttons, the Logitech G600 MMO Gaming...","[Video Games, PC, Accessories, Gaming Mice]",37.99,"[2750, 4104, 1828, 584, 3554, 2394, 3319, 1910..."
892,AFFPVZ3JNCTQIKAK4XK37E2ENWWA,B00HVBPRUO,4.0,1655428133046,18655,841,Video Games,Gold Wireless Stereo Headset - PlayStation 4,[A Headset for Gamers: Experience everything f...,"[Video Games, PlayStation 4, Accessories, Head...",,"[-1, -1, 2511, 4153, 1222, 3498, 2502, 341, 45..."
285,AFBRTNVOROW7UVA66UPX5YCFC6MQ,B07YBXFDYK,3.0,1636189764550,4825,3017,Video Games,The Evil Within 2 - PlayStation 4,"[From Shinji Mikami, The Evil Within 2 takes t...","[Video Games, PlayStation 4, Games]",20.98,"[-1, -1, -1, -1, 1863, 3224, 1344, 2044, 1092,..."
169632,AFGK4NQ4PAGXYAXVYEVKTOJRBP7Q,B01JJFZ3LC,0.0,1633408008844,20199,4210,Video Games,Nintendo 2DS - Electric Blue with Mario Kart 7,[The Best of Two Worlds. The Nintendo 2DS syst...,"[Video Games, Legacy Systems, Nintendo Systems...",168.98,"[-1, -1, -1, -1, -1, 3266, 1794, 1681, 3646, 1..."
...,...,...,...,...,...,...,...,...,...,...,...,...
532,AFUWPAK6VCGEL2OVIL2YGZNFQJZQ,B08N6NCR3Q,4.0,1642699950266,631,4313,Video Games,Thrustmaster T 16000M SPACE SIM DUO STICK (PC),[The THRUSTMASTER T.16000M FCS Space Sim Duo c...,"[Video Games, PC, Accessories, Controllers, Fl...",119.51,"[-1, -1, -1, -1, 4624, 4653, 3707, 1750, 4544,..."
815,AFH63KLSVQQYRNFS7NLQGD3GSP3A,B094YHB1QK,5.0,1652564728981,2779,2376,Video Games,PlayStation DualSense Wireless Controller – Ga...,[Plot a course for astronomical adventures on ...,"[Video Games, PlayStation 5, Accessories, Cont...",74.99,"[-1, 1575, 1646, 3404, 4509, 2412, 4415, 238, ..."
138177,AFZZ5GK3C5W3QWHT6AGHDMEVJ24A,B008A27UMG,0.0,1642015738896,16457,4110,Video Games,Harvest Moon 3D: A New Beginning - Nintendo 3DS,[Echo Village has seen better days... Just as ...,"[Video Games, Legacy Systems, Nintendo Systems...",28.99,"[1433, 0, 567, 403, 4243, 4127, 4483, 850, 115..."
280,AFPPTJOEUPVXA5C63SNRGID3EQNA,B0BVVTQ5JP,4.0,1635968491390,15461,757,Computers,Logitech G502 HERO High Performance Wired Gami...,[Logitech updated its iconic G502 gaming mouse...,"[Video Games, PC, Accessories, Gaming Mice]",45.87,"[-1, -1, -1, -1, -1, 3837, 4230, 165, 4063, 3917]"


# Checks

In [24]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
assert check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2, "Number of pos and neg samples are not equal"

[32m2024-09-21 08:53:42.930[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AH4LDGDVSHW3XVGRQSK2H7X6H6GA...[0m


In [25]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5, f"Item {item} does not appear much in training data"

[32m2024-09-21 08:53:42.943[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mChecking item B07Z9Z39ZW...[0m


## Random eye-ball

In [26]:
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
55709,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B009M72E5Q,0.0,1392130361000,6619,163,Video Games,Nintendo Wii Console Black with Wii Sports and...,"[Includes Black Wii Console, Black Wii Remote ...","[Video Games, Legacy Systems, Nintendo Systems...",899.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
47050,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B001EYURCC,5.0,1392130361000,6619,2579,Video Games,Dynasty Warriors 6 - Playstation 3,"[Based on the epic novel, Romance of the Three...","[Video Games, Legacy Systems, PlayStation Syst...",50.39,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
67491,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B008KGN9DG,3.0,1420552659000,6619,196,Video Games,Angry Birds Trilogy - Nintendo Wii,"[Gather your flock and sling into action, Angr...","[Video Games, Legacy Systems, Nintendo Systems...",39.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
55710,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B000IMWK2G,0.0,1420552659000,6619,1742,Video Games,Wii Remote Controller,"[Product Description, Wii Remote Controller, A...","[Video Games, Legacy Systems, Nintendo Systems...",61.0,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132713,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B06XBN6NCH,5.0,1511804361580,6619,206,Video Games,Middle-Earth: Shadow Of War - PlayStation 4,[The sequel to the critically-acclaimed Middle...,"[Video Games, PlayStation 4, Games]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
55711,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B06XS4XXFT,0.0,1511804361580,6619,2135,Video Games,Super Bomberman R - Nintendo Switch,[A fast paced 1 to 8 player party action game ...,"[Video Games, Nintendo Switch, Accessories]",18.7,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
132714,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B0767Y2Z49,5.0,1511804377861,6619,2415,Video Games,Fire Emblem Warriors + Season Pass Bundle - Ni...,[Bundle Includes: Fire Emblem Warriors Full Ga...,"[Video Games, Nintendo Switch, Games]",79.98,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 257..."
55712,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B00Z9TIGCG,0.0,1511804377861,6619,3380,Video Games,Tom Clancy's Ghost Recon Wildlands - Season Pa...,[Extend your Ghost’s experience with the Seaso...,"[Video Games, PC]",,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 257..."
158438,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B07STWQ38X,5.0,1573002727067,6619,1297,Video Games,Horizon Zero Dawn Complete Edition Hits - Play...,[Horizon Zero Dawn is an exhilarating action r...,"[Video Games, PlayStation 4, Games]",17.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2579.0, 1..."
55713,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B07R9PBHP2,0.0,1573002727067,6619,2599,Video Games,Ace Combat 7: Skies Unknown - Xbox One,[Putting gamers in the cockpit of the most adv...,"[Video Games, Xbox One, Games]",19.99,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 2579.0, 1..."


In [27]:
val_check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,main_category,title,description,categories,price,item_sequence
170,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B07Z9Z39ZW,5.0,1632602577680,6619,2074,Video Games,Witcher 3: Wild Hunt Complete Edition - Ninten...,[],"[Video Games, Nintendo Switch, Games]",41.54,"[-1, -1, -1, 2579, 196, 206, 2415, 1297, 398, ..."
55716,AH4LDGDVSHW3XVGRQSK2H7X6H6GA,B006VR65D2,0.0,1632602577680,6619,1747,Video Games,Transformers: Fall of Cybertron [Download],"[From the Manufacturer, Transformers: Fall of ...","[Video Games, PC, Games]",49.99,"[-1, -1, -1, 2579, 196, 206, 2415, 1297, 398, ..."


# Persist

In [28]:
full_df.to_parquet('../data/full_features_neg_sampling_df.parquet', index=False)

In [29]:
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)