In [1]:
import os
import pandas as pd
import numpy as np
from loguru import logger
from collections import defaultdict
from pydantic import BaseModel

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = '056-small-rich-dataset'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "056-small-rich-dataset",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/056-small-rich-dataset",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(interactions, columns=['user_indice', 'item_indice', args.rating_col, args.timestamp_col])

In [4]:
def generate_negative_samples(
    df,
    user_col='user_indice',
    item_col='item_indice',
    label_col='rating',
    neg_label=0,
    seed=None,
    progress_bar_type='tqdm'  # Options: 'tqdm', 'tqdm_notebook', None
):
    """
    Optimized function to generate negative samples for a user-item interaction DataFrame.
    """
    
    # Handle random seed
    if seed is not None:
        np.random.seed(seed)
    
    # Import tqdm based on the progress_bar_type
    if progress_bar_type == 'tqdm':
        try:
            from tqdm import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm is not installed. Please install it using 'pip install tqdm'.")
    elif progress_bar_type == 'tqdm_notebook':
        try:
            from tqdm.notebook import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm.notebook is not available. Please install it using 'pip install tqdm'.")
    elif progress_bar_type is None:
        # Define a dummy tqdm function that does nothing
        def tqdm_bar(iterable, **kwargs):
            return iterable
    else:
        raise ValueError("Invalid progress_bar_type. Choose 'tqdm', 'tqdm_notebook', or None.")
    
    # Calculate item popularity based on the number of interactions
    item_popularity = df[item_col].value_counts()
    
    # Define all unique items from the DataFrame
    items = item_popularity.index.values
    all_items_set = set(items)
    
    # Create a user-item interaction dictionary
    user_item_dict = df.groupby(user_col)[item_col].apply(set).to_dict()
    
    # Prepare items list and corresponding popularity array
    popularity = item_popularity.values.astype(np.float64)
    
    # Calculate sampling probabilities based on item popularity
    total_popularity = popularity.sum()
    if total_popularity == 0:
        sampling_probs = np.ones(len(items)) / len(items)
    else:
        sampling_probs = popularity / total_popularity
    
    # Create item to index mapping for quick access
    item_to_index = {item: idx for idx, item in enumerate(items)}
    
    # Initialize a list to store negative samples
    negative_samples = []
    
    # Initialize the progress bar
    total_users = len(user_item_dict)
    progress_bar = tqdm_bar(user_item_dict.items(), total=total_users, desc="Generating Negative Samples")
    
    for user, pos_items in progress_bar:
        num_pos = len(pos_items)
        
        # Identify items not interacted with by the user
        negative_candidates = all_items_set - pos_items
        num_neg_candidates = len(negative_candidates)
        
        if num_neg_candidates == 0:
            # User has interacted with all items, skip negative sampling
            continue
        
        # Determine the number of negative samples to generate
        num_neg = min(num_pos, num_neg_candidates)
        
        # Convert set to list for indexing
        negative_candidates_list = list(negative_candidates)
        
        # Get the indices and probabilities of negative candidates
        candidate_indices = [item_to_index[item] for item in negative_candidates_list]
        candidate_probs = sampling_probs[candidate_indices]
        candidate_probs /= candidate_probs.sum()
        
        # Sample negative items without replacement
        sampled_items = np.random.choice(
            negative_candidates_list, size=num_neg, replace=False, p=candidate_probs
        )
        
        # Append the sampled negative items to the list
        negative_samples.extend([(user, item) for item in sampled_items])
    
    # Convert negative samples to a DataFrame
    df_negative = pd.DataFrame(negative_samples, columns=[user_col, item_col])
    df_negative[label_col] = neg_label  # Assign label for negative samples
    
    return df_negative

def add_features_to_neg_df(pos_df, neg_df, user_col, timestamp_col, feature_cols=[]):
    neg_df = neg_df.assign(
        timestamp_pseudo=lambda df: df.groupby(user_col).cumcount() + 1
    )
    neg_df = (
        pd.merge(
            neg_df,
            pos_df.assign(
                timestamp_pseudo=lambda df: df.groupby([user_col])[timestamp_col].rank(method='first')   
            )[[user_col, timestamp_col, 'timestamp_pseudo', *feature_cols]],
            how='left',
            on=[user_col, 'timestamp_pseudo']
        )
        .drop(columns=['timestamp_pseudo'])
    )
    return neg_df

In [5]:
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [6]:
neg_df = generate_negative_samples(df, progress_bar_type='tqdm_notebook')
neg_df = add_features_to_neg_df(df, neg_df, 'user_indice', 'timestamp')

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
neg_df.sort_values(['user_indice', args.rating_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,106,0,1
1,1,105,0,2
2,1,104,0,4
3,2,103,0,1
4,2,106,0,2
5,3,101,0,1
6,3,102,0,5


# Load data

In [8]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [9]:
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-20 18:59:08.087[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628623121512)[0m


In [10]:
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
0,AHBOFFMSOTEBSVHOOD232ZHFCEWQ,B00001IVRD,4.0,961125766000,Video Games,StarCraft Battle Chest - PC/Mac,"[Product description, StarCraft Battle Chest, ...","[Video Games, PC, Games]",81.88,5347,2587,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AGSZX3YVKUDHRIBKZUMYWM5KNXLQ,B001EYUPLK,2.0,963407426000,Video Games,Diablo 2: Collector's Edition,"[Amazon.com, In, Diablo II,, players return to...","[Video Games, PC, Games]",,4249,2083,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AGAHLNZNWZMCGFM552XFS6XRAJ5Q,B00000J9J9,5.0,963523537000,Video Games,Super Mario Bros. Deluxe,"[Product Description, Recapture the magic of t...","[Video Games, Legacy Systems, Nintendo Systems...",51.49,6257,188,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AGAHLNZNWZMCGFM552XFS6XRAJ5Q,B00002STEZ,4.0,963524433000,Video Games,Donkey Kong 64 - Nintendo 64,"[Product Description, Every so often a game en...","[Video Games, Legacy Systems, Nintendo Systems...",,6257,8,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AEUJQYSQIJJ2RVKNTC7J5NLR2JPA,B00000DMAX,4.0,965368820000,Video Games,Mario Kart 64,"[Product description, Three... Two... One... G...","[Video Games, Legacy Systems, Nintendo Systems...",68.99,8921,118,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
392,AG6MENO5OO7LRCH27J47ZODEZN6Q,B0728H6KPV,5.0,1657815635870,Cell Phones & Accessories,"Charger for Nintendo Switch, YCCTEAM Charger A...",[],"[Video Games, Legacy Systems, Nintendo Systems...",13.99,50,2203,"[-1, -1, 2569, 1380, 1344, 1342, 2224, 1328, 5..."
393,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B0716CXJ1R,5.0,1657835528863,Video Games,Darksiders III - Collector's Edition - Xbox One,[],"[Video Games, Xbox One, Games]",149.99,4352,1347,"[691, 1944, 2013, 456, 395, 863, 2152, 2439, 2..."
394,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B07D13QGXM,5.0,1657836034795,Video Games,Minecraft - Nintendo Switch,[Minecraft is a game about placing blocks and ...,"[Video Games, Nintendo Switch, Games]",29.99,4352,2469,"[1944, 2013, 456, 395, 863, 2152, 2439, 2630, ..."
395,AFDL3ZQE4ARYEEBBH2KAPMP4NSHQ,B0795GHTBC,5.0,1657910674213,All Electronics,ivoler [3 Pack Screen Protector Tempered Glass...,[],"[Video Games, Nintendo Switch, Accessories, Fa...",9.39,4522,1580,"[-1, -1, -1, -1, -1, 1778, 745, 598, 1358, 684]"


In [11]:
features = ['item_sequence', 'user_id']

In [12]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,106,0,1
1,1,105,0,2
2,1,104,0,4
3,2,103,0,1
4,2,106,0,2
5,3,101,0,1
6,3,102,0,5


In [13]:
neg_df = generate_negative_samples(full_df, 'user_indice', 'item_indice', args.rating_col, neg_label=0, seed=args.random_seed, progress_bar_type='tqdm_notebook')
neg_ts_df = add_features_to_neg_df(full_df, neg_df, 'user_indice', args.timestamp_col, features)
neg_ts_df

Generating Negative Samples:   0%|          | 0/10371 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,687,0,1407185599000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF2PANZKQINF243GB7ZACW2EEEIA
1,0,122,0,1427223938000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF2PANZKQINF243GB7ZACW2EEEIA
2,0,1896,0,1427403346000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF2PANZKQINF243GB7ZACW2EEEIA
3,0,115,0,1427403360000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 116...",AF2PANZKQINF243GB7ZACW2EEEIA
4,0,318,0,1427403377000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 116.0, 28...",AF2PANZKQINF243GB7ZACW2EEEIA
...,...,...,...,...,...,...
85211,10370,1209,0,1399146320000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF5PTKA5SOJWSJ7C36SOEQJIZ25A
85212,10370,176,0,1399155050000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF5PTKA5SOJWSJ7C36SOEQJIZ25A
85213,10370,1321,0,1399155128000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF5PTKA5SOJWSJ7C36SOEQJIZ25A
85214,10370,1687,0,1399155276000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 803...",AF5PTKA5SOJWSJ7C36SOEQJIZ25A


#### Join with features

In [14]:
not_meta_feature_cols = (args.user_col, 'user_indice', 'item_indice', args.rating_col, args.timestamp_col, *features)
meta_features = [col for col in full_df.columns if col not in not_meta_feature_cols]
meta_features

['parent_asin', 'main_category', 'title', 'description', 'categories', 'price']

In [15]:
neg_ts_df = (
    pd.merge(
        neg_ts_df,
        full_df[['item_indice', *meta_features]].drop_duplicates(subset=['item_indice']),
        how='left',
        on=['item_indice'],
        validate="m:1"
    )   
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin,main_category,title,description,categories,price
0,0,687,0,1407185599000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF2PANZKQINF243GB7ZACW2EEEIA,B07WT56LHX,Video Games,WB Games Mortal Kombat XL - Playstation 4,"[Includes main game, and new playable characte...","[Video Games, PlayStation 4, Games]",19.99
1,0,122,0,1427223938000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF2PANZKQINF243GB7ZACW2EEEIA,B07WS18ZS3,Video Games,Batman: Arkham Knight - Playstation 4,"[For PlayStation owners, Batman Arkham Knight ...","[Video Games, PlayStation 4, Games]",26.97
2,0,1896,0,1427403346000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF2PANZKQINF243GB7ZACW2EEEIA,B003XWD1G2,Computers,HDE (TM) Ethernet LAN Adapter for Nintendo Wii...,"[Do you like the Wii network, online gaming, a...","[Video Games, Legacy Systems, Nintendo Systems...",
3,0,115,0,1427403360000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 116...",AF2PANZKQINF243GB7ZACW2EEEIA,B00EN9Q8G4,,Forza Motorsport 5,"[Product description, *The DLC (Downloadable C...","[Video Games, Xbox One, Games]",64.98
4,0,318,0,1427403377000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 116.0, 28...",AF2PANZKQINF243GB7ZACW2EEEIA,B07SM7G9CN,Video Games,Donkey Kong Country: Tropical Freeze - Nintend...,[Barrel-blast into a critically acclaimed Donk...,"[Video Games, Nintendo Switch, Games]",52.49
...,...,...,...,...,...,...,...,...,...,...,...,...
85211,10370,1209,0,1399146320000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF5PTKA5SOJWSJ7C36SOEQJIZ25A,B07YBXFF99,Video Games,Dishonored - PlayStation 3 Game of the Year Ed...,[Experience the definitive Dishonored collecti...,"[Video Games, Legacy Systems, PlayStation Syst...",19.99
85212,10370,176,0,1399155050000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF5PTKA5SOJWSJ7C36SOEQJIZ25A,B01MG8P418,Video Games,Steam Controller,[The Steam Controller frees you from your keyb...,"[Video Games, PC, Accessories, Controllers, Ga...",268.01
85213,10370,1321,0,1399155128000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AF5PTKA5SOJWSJ7C36SOEQJIZ25A,B00ZQB28XK,Video Games,No Man's Sky - PlayStation 4,[Inspired by classic science-fiction and its o...,"[Video Games, PlayStation 4, Games]",39.89
85214,10370,1687,0,1399155276000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 803...",AF5PTKA5SOJWSJ7C36SOEQJIZ25A,B00O9GW8TC,Video Games,Nintendo Super Smash Bros. White Classic Gamec...,[Many Super Smash Bros. fans grew up playing S...,"[Video Games, Legacy Systems, Nintendo Systems...",213.17


In [16]:
item = neg_ts_df.sample(n=1)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(neg_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[lambda df: df[args.item_col].eq(item)]['item_indice']
assert len(set(original_item_indices)) == 1, f"Mismatch {args.item_col} and item_indice at original df"
assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

[32m2024-09-20 18:59:11.045[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B00005R5PO...[0m


In [17]:
full_df = pd.concat([full_df, neg_ts_df], axis=0).sample(frac=1, replace=False, random_state=args.random_seed)

In [18]:
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
44784,AFFGHZYBGYGBP2LCBEPESGMSZOOA,B0BS9YCBYY,0.0,1377212339000,Computers,"HyperX Cloud Flight - Wireless Gaming Headset,...",[Escape the limits of cable connections and ro...,"[Video Games, PC, Accessories, Headsets]",69.99,5455,2573,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
24046,AF55DDBEKIR5VLK7RNJFMCKDUETA,B07N5LL4YW,5.0,1395215941000,Video Games,Knack (PlayStation 4),"[From the Manufacturer, An Unlikely Hero, Mank...","[Video Games, PlayStation 4, Games]",22.68,9521,1707,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
8978,AGMWNTWGRZRHGYDFYHCCAZIPHQAQ,B07YBX6T95,0.0,1352431054000,Video Games,Fallout: New Vegas - Xbox 360 Ultimate Edition,[Welcome Back to New Vegas!With the introducti...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",27.99,1088,50,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 169.0, 21..."
55296,AHHVS66MR6ZY6AX3WPHFD7CD7LYQ,B00IAVDOS6,5.0,1471206357000,Video Games,Xbox One Stereo Headset Adapter,[Plug your favorite compatible headset into th...,"[Video Games, Xbox One, Accessories, Cables & ...",36.97,5495,472,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
8709,AFORDMXANEXLNMRZHCEWDIKYIBPA,B001EHD9JC,0.0,1451855656000,Video Games,LEGO Batman [Download],[When all the villains in Arkham Asylum team u...,"[Video Games, PC]",,1059,2669,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 125..."
...,...,...,...,...,...,...,...,...,...,...,...,...
53491,AHU62SIRCWHUHRXGAVRLT4JSG46A,B00KMPLXH2,3.0,1465564166000,Video Games,Battlefield Hardline - PlayStation 4,[Get a piece of the action in Battlefield Hard...,"[Video Games, PlayStation 4, Games]",15.99,6540,1757,"[2181.0, 537.0, 2595.0, 1952.0, 272.0, 1922.0,..."
21601,AHX3RM25T2E5C3NKLREKCGGBMWKA,B00ZHQ39F0,0.0,1238995424000,Video Games,Just Dance 2016 - PlayStation 4,[Introducing Just Dance 2016 – the newest game...,"[Video Games, PlayStation 4, Games]",28.0,2639,2024,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 123.0, 10..."
61324,AH4HDQMK53FZAIZMARDNFCMVPDUQ,B01IC2A28C,5.0,1488194177000,Video Games,Pokémon Sun and Pokémon Moon Steelbook Dual Pa...,[Pokémon Sun and Pokémon Moon will launch on N...,"[Video Games, Legacy Systems, Nintendo Systems...",,288,37,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
46787,AFCZARGCL55326X7VAJYZYTZHQ6A,B01574SORE,0.0,1277400268000,Video Games,PlayStation 4 500GB Console - Star Wars Battle...,[Immerse yourself in your Star Wars battle fan...,"[Video Games, PlayStation 4, Consoles]",224.99,5691,1417,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [20]:
key_cols = [args.user_col, args.item_col, 'user_indice', 'item_indice', 'item_sequence', args.rating_col, args.timestamp_col]
assert full_df[key_cols].isna().sum().sum() == 0, "Null values found at key colums"

In [21]:
val_timestamp

np.int64(1628623121512)

In [22]:
train_neg_df = full_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
val_neg_df = full_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]

In [29]:
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
44784,AFFGHZYBGYGBP2LCBEPESGMSZOOA,B0BS9YCBYY,0.0,1377212339000,Computers,"HyperX Cloud Flight - Wireless Gaming Headset,...",[Escape the limits of cable connections and ro...,"[Video Games, PC, Accessories, Headsets]",69.99,5455,2573,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
24046,AF55DDBEKIR5VLK7RNJFMCKDUETA,B07N5LL4YW,5.0,1395215941000,Video Games,Knack (PlayStation 4),"[From the Manufacturer, An Unlikely Hero, Mank...","[Video Games, PlayStation 4, Games]",22.68,9521,1707,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
8978,AGMWNTWGRZRHGYDFYHCCAZIPHQAQ,B07YBX6T95,0.0,1352431054000,Video Games,Fallout: New Vegas - Xbox 360 Ultimate Edition,[Welcome Back to New Vegas!With the introducti...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",27.99,1088,50,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 169.0, 21..."
55296,AHHVS66MR6ZY6AX3WPHFD7CD7LYQ,B00IAVDOS6,5.0,1471206357000,Video Games,Xbox One Stereo Headset Adapter,[Plug your favorite compatible headset into th...,"[Video Games, Xbox One, Accessories, Cables & ...",36.97,5495,472,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
8709,AFORDMXANEXLNMRZHCEWDIKYIBPA,B001EHD9JC,0.0,1451855656000,Video Games,LEGO Batman [Download],[When all the villains in Arkham Asylum team u...,"[Video Games, PC]",,1059,2669,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 125..."
...,...,...,...,...,...,...,...,...,...,...,...,...
53491,AHU62SIRCWHUHRXGAVRLT4JSG46A,B00KMPLXH2,3.0,1465564166000,Video Games,Battlefield Hardline - PlayStation 4,[Get a piece of the action in Battlefield Hard...,"[Video Games, PlayStation 4, Games]",15.99,6540,1757,"[2181.0, 537.0, 2595.0, 1952.0, 272.0, 1922.0,..."
21601,AHX3RM25T2E5C3NKLREKCGGBMWKA,B00ZHQ39F0,0.0,1238995424000,Video Games,Just Dance 2016 - PlayStation 4,[Introducing Just Dance 2016 – the newest game...,"[Video Games, PlayStation 4, Games]",28.0,2639,2024,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 123.0, 10..."
61324,AH4HDQMK53FZAIZMARDNFCMVPDUQ,B01IC2A28C,5.0,1488194177000,Video Games,Pokémon Sun and Pokémon Moon Steelbook Dual Pa...,[Pokémon Sun and Pokémon Moon will launch on N...,"[Video Games, Legacy Systems, Nintendo Systems...",,288,37,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
46787,AFCZARGCL55326X7VAJYZYTZHQ6A,B01574SORE,0.0,1277400268000,Video Games,PlayStation 4 500GB Console - Star Wars Battle...,[Immerse yourself in your Star Wars battle fan...,"[Video Games, PlayStation 4, Consoles]",224.99,5691,1417,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [30]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
326,AEGX7X6YKSB53B57U4RDNZMHQB2A,B0BL65X86R,5.0,1651558065496,Video Games,$25 PlayStation Store Gift Card [Digital Code],[Redeem against anything on PlayStation Store....,"[Video Games, Online Game Services, PlayStatio...",25.0,2610,1220,"[-1, -1, -1, -1, 205, 1291, 494, 1144, 3, 579]"
396,AG6MENO5OO7LRCH27J47ZODEZN6Q,B00EQNP8F4,0.0,1657815635870,Video Games,Microsoft Xbox LIVE 12 Month Gold Membership (...,[Gaming is better with Xbox Live Gold. Join th...,"[Video Games, Online Game Services, Xbox Live,...",,50,147,"[-1, -1, 2569, 1380, 1344, 1342, 2224, 1328, 5..."
259,AGSNYY5XOKFLVXUZMGXRS7DVL7EQ,B077GG9D5D,4.0,1645878084196,Video Games,DualShock 4 Wireless Controller for PlayStatio...,[The DualShock 4 Wireless Controller features ...,"[Video Games, PlayStation 4, Accessories, Cont...",57.0,377,684,"[2365, 1381, 2109, 1877, 2792, 2754, 2558, 147..."
69517,AGXKJPHUFNOZGCDLXQRKW4OSY7JA,B012F20ZY6,0.0,1644874447517,Video Games,New Super Mario Bros. U + New Super Luigi U - ...,[(2 Games on 1 Disc) New Super Mario Bros. U: ...,"[Video Games, Legacy Systems, Nintendo Systems...",60.94,8451,170,"[-1, -1, 1762, 2792, 164, 1144, 772, 923, 266,..."
10,AHRPODFXDQKRO3OUDXTYZZZNGPTQ,B07WZ78VRN,1.0,1629011678308,Computers,8Bitdo SN30 Pro Wireless Bluetooth Controller ...,[],"[Video Games, Mac, Accessories, Controllers, G...",44.99,8412,2055,"[-1, -1, -1, -1, -1, 2076, 1204, 674, 63, 1668]"
...,...,...,...,...,...,...,...,...,...,...,...,...
294,AHY7NSZXW4IUPQ2E4BPUOXUVP3UQ,B07NQCDWWN,4.0,1648662318824,Video Games,The Legend of Zelda: TriForce Heroes - 3DS,[Three players take on the role of individual ...,"[Video Games, Legacy Systems, Nintendo Systems...",39.49,4352,1701,"[1313, 691, 1944, 2013, 456, 395, 863, 2152, 2..."
298,AH62RYWZBOQXAIHUU2FNRZNMRDGA,B07H3F94ZN,5.0,1649192892568,All Electronics,HD Retrovision PlayStation 2/3 (PS2/PS3) Premi...,[Use these high-quality cables to connect your...,"[Video Games, Legacy Systems, PlayStation Syst...",29.99,4482,2205,"[-1, 2334, 2632, 1798, 226, 2728, 2541, 2777, ..."
28768,AGNUHVWLW65C3UFEKDTHNVOFTLVQ,B008UTF3W8,0.0,1643076728682,Video Games,Official Sony Playstation 3 Vertical Stand for...,[The official Vertical Stand is designed to ke...,"[Video Games, Legacy Systems, PlayStation Syst...",93.79,3514,1948,"[-1, -1, -1, 1969, 451, 1778, 1813, 1694, 1841..."
19603,AG7ULZ7GACZ675QL2YVIG5XUQWIA,B07X5X5KF9,0.0,1637531154519,Video Games,WB Games Middle Earth: Shadow of Mordor - Play...,"[Exploit the individual fears, weakness and me...","[Video Games, PlayStation 4, Games]",20.6,2378,1402,"[-1, -1, -1, -1, -1, 1433, 1054, 145, 1941, 684]"


# Checks

In [23]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
assert check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2, "Number of pos and neg samples are not equal"

[32m2024-09-20 18:59:11.119[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AE2KSKDHIBIBGNZNOUPVPZI4DEOQ...[0m


In [24]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(args.timestamp_col)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5, f"Item {item} does not appear much in training data"

[32m2024-09-20 18:59:11.127[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mChecking item B0C1K1R6HK...[0m


In [25]:
 train_neg_df.loc[lambda df: df[args.item_col].eq(item)]

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,user_indice,item_indice,item_sequence
84456,AHJ243BPWP65F24SQX3IQMZPAFOA,B0C1K1R6HK,5.0,1621630173349,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,859,2170,"[-1.0, -1.0, 2276.0, 2001.0, 1199.0, 1889.0, 5..."
83159,AEXACKKK7UNHFDS62LE3WDKVA4FA,B0C1K1R6HK,5.0,1605450248513,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,10226,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, 725.0, 846.0, 9..."
19444,AHNOPRT3BALKSHWPMSPHUHFOHB3Q,B0C1K1R6HK,0.0,1445796138000,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,2359,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
84783,AFCWL7J5R6VS4GBYURIXIU23XX4A,B0C1K1R6HK,5.0,1627587111285,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,9113,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, 1144.0, 1883.0,..."
46135,AHBDPKG2DEZVHHYUCNOVIT6KVMOA,B0C1K1R6HK,0.0,1423797587000,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,5614,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
25706,AEE6B6LIXEASMXHKF3N3KRUVLT2Q,B0C1K1R6HK,0.0,1484201153000,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,3146,2170,"[-1.0, -1.0, -1.0, -1.0, 2612.0, 622.0, 1765.0..."
83135,AHWOOMMH6ANSCGGE6DBXA6UNSFQA,B0C1K1R6HK,5.0,1605171470409,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,762,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, 2747.0, 1609.0,..."
83254,AFDVIJEEJDZUCI4RXDZG7TW26PSQ,B0C1K1R6HK,5.0,1606531743236,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,4009,2170,"[1299.0, 1778.0, 837.0, 2074.0, 288.0, 2240.0,..."
84067,AG2ZJGDNF5DR2KOL6BSGP53ROV6A,B0C1K1R6HK,0.0,1422330707000,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,10233,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
63301,AFVE4QDLV4RN4I3M34KM7MBDRPLA,B0C1K1R6HK,0.0,1477910501000,Video Games,Xbox Series X,"[Xbox Series X, the fastest, most powerful Xbo...","[Video Games, Legacy Systems, Xbox Systems, Xb...",499.99,7688,2170,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


# Persist

In [26]:
full_df.to_parquet('../data/full_features_neg_sampling_df.parquet', index=False)

In [27]:
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)