In [1]:
import os
from collections import defaultdict
from typing import Optional

import numpy as np
import pandas as pd
from loguru import logger
from pydantic import BaseModel

# Controller

In [2]:
class Args(BaseModel):
    """Run-level configuration for this notebook.

    Construct the model and then call :meth:`init` to resolve (and create on
    disk) the directory in which this run persists its artifacts.
    """

    # Not read anywhere in this notebook; presumably toggles a reduced test run — TODO confirm.
    testing: bool = False
    # Not read anywhere in this notebook; presumably an experiment-tracking label — TODO confirm.
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    # Identifies the run; also used as the name of the persist directory.
    run_name: str = '041-offline-negative-sampling-rating-prediction'
    # Absolute artifact directory. It is legitimately None until init() runs,
    # so the annotation must be Optional rather than a plain str.
    notebook_persist_dp: Optional[str] = None
    # Seed passed to the sampling and shuffling steps.
    random_seed: int = 41

    # Column names of the interaction tables.
    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    def init(self):
        """Resolve ``notebook_persist_dp`` from ``run_name`` and create it on disk.

        The directory is ``data/<run_name>`` relative to the current working
        directory, stored as an absolute path. Creating it is idempotent.

        Returns:
            Args: ``self``, so construction and initialization can be chained.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
# Build the configuration, then resolve/create its persist directory.
args = Args()
args.init()

# Echo the resolved configuration so the run is self-documenting.
print(args.model_dump_json(indent=2))

{
  "testing": false,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "041-offline-negative-sampling-rating-prediction",
  "notebook_persist_dp": "/home/jupyter/frostmourne/reco-algo/notebooks/data/041-offline-negative-sampling-rating-prediction",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy interaction log used to smoke-test the helpers below.
# Each record is a (user_id, item_id, rating, timestamp) tuple.
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Load the records into a DataFrame with named columns.
df = pd.DataFrame.from_records(
    interactions,
    columns=['user_id', 'item_id', 'rating', 'timestamp'],
)

In [4]:
def generate_negative_samples(
    df,
    user_col='user_id',
    item_col='item_id',
    label_col='rating',
    neg_label=0,
    seed=None,
    progress_bar_type='tqdm'  # Options: 'tqdm', 'tqdm_notebook', None
):
    """
    Generate popularity-weighted negative samples for a user-item interaction DataFrame.

    For every user, items the user has not interacted with are drawn without
    replacement, with probability proportional to how often each item occurs
    in ``df``. A user receives as many negatives as they have distinct
    interacted items, capped by the number of unseen items; users who have
    interacted with every item receive none.

    Args:
        df: Interaction table containing at least ``user_col`` and ``item_col``.
        user_col: Name of the user identifier column.
        item_col: Name of the item identifier column.
        label_col: Name of the label column added to the result.
        neg_label: Value written into ``label_col`` for every negative row.
        seed: Optional seed. When given, a private generator is used, so the
            draw is reproducible and the global NumPy RNG is left untouched.
            When ``None``, the global NumPy RNG is used.
        progress_bar_type: ``'tqdm'``, ``'tqdm_notebook'`` or ``None`` (no bar).

    Returns:
        pandas.DataFrame with columns ``[user_col, item_col, label_col]``,
        one row per sampled negative.

    Raises:
        ImportError: If a tqdm progress bar is requested but tqdm is missing.
        ValueError: If ``progress_bar_type`` is not a supported option.
    """

    # A private RandomState yields the same stream as np.random.seed(seed)
    # followed by np.random.choice, without reseeding the process-wide RNG.
    rng = np.random.RandomState(seed) if seed is not None else np.random

    # Import tqdm lazily so that it is only required when a bar is requested
    if progress_bar_type == 'tqdm':
        try:
            from tqdm import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm is not installed. Please install it using 'pip install tqdm'.")
    elif progress_bar_type == 'tqdm_notebook':
        try:
            from tqdm.notebook import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm.notebook is not available. Please install it using 'pip install tqdm'.")
    elif progress_bar_type is None:
        # Define a dummy tqdm function that does nothing
        def tqdm_bar(iterable, **kwargs):
            return iterable
    else:
        raise ValueError("Invalid progress_bar_type. Choose 'tqdm', 'tqdm_notebook', or None.")

    # Item popularity = number of interactions per item (missing items are dropped)
    item_popularity = df[item_col].value_counts()
    items = item_popularity.index.values
    n_items = len(items)
    popularity = item_popularity.values.astype(np.float64)

    # Sampling probabilities proportional to popularity; the total is only
    # zero when there are no items at all, in which case the array is empty.
    total_popularity = popularity.sum()
    if total_popularity > 0:
        sampling_probs = popularity / total_popularity
    else:
        sampling_probs = popularity

    # Position of every item inside `items` / `sampling_probs`
    item_to_index = {item: idx for idx, item in enumerate(items)}

    # Distinct items each user interacted with
    user_item_dict = df.groupby(user_col)[item_col].apply(set).to_dict()

    negative_samples = []

    progress_bar = tqdm_bar(
        user_item_dict.items(),
        total=len(user_item_dict),
        desc="Generating Negative Samples",
    )

    for user, pos_items in progress_bar:
        # Mask out the user's own items. Candidates are kept in the fixed
        # popularity order of `items` rather than in set-iteration order,
        # which for string ids changes with the interpreter's hash seed and
        # would make a seeded draw irreproducible across kernels.
        seen_positions = np.asarray(
            [item_to_index[item] for item in pos_items if item in item_to_index],
            dtype=np.intp,
        )
        candidate_mask = np.ones(n_items, dtype=bool)
        candidate_mask[seen_positions] = False
        candidate_positions = np.flatnonzero(candidate_mask)

        if candidate_positions.size == 0:
            # User has interacted with all items, skip negative sampling
            continue

        # As many negatives as positives, limited by what is available
        num_neg = min(len(pos_items), candidate_positions.size)

        # Renormalise the popularity weights over the candidates only
        candidate_probs = sampling_probs[candidate_positions]
        candidate_probs = candidate_probs / candidate_probs.sum()

        # Sample positions (not item values) so NumPy never coerces the ids
        sampled_positions = rng.choice(
            candidate_positions, size=num_neg, replace=False, p=candidate_probs
        )

        negative_samples.extend((user, items[pos]) for pos in sampled_positions)

    # Convert negative samples to a DataFrame and attach the negative label
    df_negative = pd.DataFrame(negative_samples, columns=[user_col, item_col])
    df_negative[label_col] = neg_label

    return df_negative

def add_timestamp_to_neg_df(pos_df, neg_df, user_col, timestamp_col):
    """Attach timestamps to negative samples by borrowing them from positives.

    Within each user, the k-th negative row (in its current order) receives
    the k-th smallest timestamp of that user's rows in ``pos_df``; ties are
    broken by row order. Negatives beyond the number of positive rows of the
    user get a missing timestamp (left join).

    Args:
        pos_df: Positive interactions containing ``user_col`` and ``timestamp_col``.
        neg_df: Negative samples containing ``user_col``.
        user_col: Name of the user identifier column in both frames.
        timestamp_col: Name of the timestamp column in ``pos_df``; also the
            name of the column added to the result.

    Returns:
        pandas.DataFrame: ``neg_df`` rows, in their original order, with an
        added ``timestamp_col`` column.
    """
    pseudo_col = 'timestamp_pseudo'

    # Number each user's negatives 1..n. Group by `user_col` (not a
    # hard-coded 'user_id') so frames with other column names work too.
    neg_keyed = neg_df.assign(
        **{pseudo_col: neg_df.groupby(user_col).cumcount() + 1}
    )

    # Rank each user's positive timestamps 1..m in ascending order.
    pos_keyed = pos_df[[user_col, timestamp_col]].assign(
        **{pseudo_col: pos_df.groupby(user_col)[timestamp_col].rank(method='first')}
    )

    # Pair the k-th negative with the k-th timestamp, then drop the helper key.
    return (
        pd.merge(neg_keyed, pos_keyed, how='left', on=[user_col, pseudo_col])
        .drop(columns=[pseudo_col])
    )

In [5]:
# Smoke test on the toy data: sample negatives, then borrow timestamps from the positives.
neg_df = add_timestamp_to_neg_df(
    df,
    generate_negative_samples(df, progress_bar_type='tqdm_notebook'),
    'user_id',
    'timestamp',
)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Inspect the toy negatives ordered by user, then by label.
neg_df.sort_values(by=['user_id', 'rating'])

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,104,0,1
1,1,106,0,2
2,1,105,0,4
3,2,105,0,1
4,2,103,0,2
5,3,104,0,1
6,3,102,0,5


# Load data

In [7]:
# Read the train and validation interaction tables from disk.
train_df, val_df = map(
    pd.read_parquet,
    ("../data/train.parquet", "../data/val.parquet"),
)

In [8]:
assert val_df[args.timestamp_col].min() > train_df[args.timestamp_col].max()  # validation must start strictly after training ends
val_timestamp = train_df[args.timestamp_col].max() + 1  # cutoff: one tick after the last training timestamp
logger.info(f"{val_timestamp=}")

[32m2024-09-19 10:03:04.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628642557238)[0m


In [19]:
# Stack train on top of validation so negatives are sampled over all interactions.
full_df = pd.concat(objs=[train_df, val_df], axis="index")
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
23,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0920668372,5.0,1430056169000
24,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1589255208,5.0,1443926150000
25,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764322836,5.0,1463967052000
26,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,2764330898,5.0,1489085694000
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,0062380761,5.0,1526591330983
...,...,...,...,...
424800,AHXZ66ATLSPVIW5HC5OTNLYGBDTQ,1416542744,4.0,1645198331443
424884,AHWBSG5WTNDC47SPUMJTWPIDZ7HQ,B08MQLJ99B,5.0,1629558239986
425194,AE5AXNZSQK6R5J2EXFUCFPDPSA6A,1643260448,2.0,1637475668742
425440,AFM4K7CAFB2KE6BHWQSS7KEHTWLA,0452282314,5.0,1643339582810


In [10]:
# Sample one negative per positive over the full interaction table (seeded),
# then give each negative a timestamp borrowed from the same user's positives.
neg_df = generate_negative_samples(
    df=full_df,
    user_col=args.user_col,
    item_col=args.item_col,
    label_col=args.rating_col,
    neg_label=0,
    seed=args.random_seed,
    progress_bar_type='tqdm_notebook',
)
neg_ts_df = add_timestamp_to_neg_df(
    pos_df=full_df,
    neg_df=neg_df,
    user_col=args.user_col,
    timestamp_col=args.timestamp_col,
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/10000 [00:00<?, ?it/s]

ValueError: You are trying to merge on object and int64 columns for key 'user_id'. If you wish to proceed you should use pd.concat

In [20]:
# Repeat only the timestamp merge, reusing the negatives already held in `neg_df`.
neg_ts_df = add_timestamp_to_neg_df(
    pos_df=full_df,
    neg_df=neg_df,
    user_col=args.user_col,
    timestamp_col=args.timestamp_col,
)
neg_ts_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AE22QFIC5SDTXPDXBANVVZI6FX3Q,1951806034,0,1454944233000
1,AE22QFIC5SDTXPDXBANVVZI6FX3Q,0143133829,0,1454944287000
2,AE22QFIC5SDTXPDXBANVVZI6FX3Q,B0045OUSV8,0,1508347362448
3,AE22QFIC5SDTXPDXBANVVZI6FX3Q,1451698070,0,1508347439290
4,AE22QFIC5SDTXPDXBANVVZI6FX3Q,0486806553,0,1572701704383
...,...,...,...,...
161728,AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,B08B6CBQZK,0,1642920227166
161729,AHZZRNJYTJETXCG4D43GZB7XL5VQ,0312680694,0,1395112263000
161730,AHZZRNJYTJETXCG4D43GZB7XL5VQ,0316349259,0,1491674307000
161731,AHZZRNJYTJETXCG4D43GZB7XL5VQ,0061148857,0,1510787707469


In [21]:
# Combine positives and timestamped negatives, then shuffle the rows (seeded).
# The frame is rebuilt from train_df/val_df instead of from `full_df` itself so
# that re-running this cell does not append the negatives a second time; on a
# first run the result is identical to concatenating onto train+val.
full_df = (
    pd.concat([train_df, val_df, neg_ts_df], axis=0)
    .sample(frac=1, replace=False, random_state=args.random_seed)
)

In [22]:
# Display the shuffled table of positives and negatives.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
5276256,AFPJDMLP7OXQDTDGGOIZJAQ3UZEA,1579549675,5.0,1407863807000
36985,AEYYFUHPXZHZXW2NIDV723D5LNZQ,B07ZN32SLD,0.0,1601576819114
1438719,AF5ZXM5M4YFJ42TXDWX2QMACA7UQ,1984804022,3.0,1563071154625
5636351,AH6PDEOGCCF6M5GG4YPY7EZ3IG6A,1984856731,3.0,1614207824775
101319,AGMGDRKZHMDZY3F7AZBFWMJY77LA,0446571415,0.0,1486650643000
...,...,...,...,...
2468252,AGHSCVY255DHIOHGJSPHV5C4BFZA,1884550738,5.0,1421871653000
89227,AGBRSFUTITDVXT47M3DT77L3HR4A,0425270823,0.0,1557605001431
55325,AFHHMLPOSP2SP3UQVYIAJGKN35QQ,157324743X,0.0,1452564272000
6582160,AESEF2B65PNCFEICWBKRXVURTJPA,1982123966,4.0,1624424699618


In [23]:
# Persist the combined table; the shuffled index carries no meaning, so it is not written.
full_df.to_parquet(path='../data/full_df.parquet', index=False)

In [24]:
# Re-display the train/validation cutoff timestamp.
val_timestamp

np.int64(1628642557238)

In [28]:
# Temporal split: rows strictly before the cutoff are training data, rows at or
# after it are validation data. Rows whose timestamp is missing satisfy neither
# comparison and therefore land in neither split.
train_neg_df = full_df[full_df[args.timestamp_col] < val_timestamp]
val_neg_df = full_df[full_df[args.timestamp_col] >= val_timestamp]

In [30]:
# Write both splits to disk without their (shuffled) index.
train_neg_df.to_parquet(path="../data/train_neg_df.parquet", index=False)
val_neg_df.to_parquet(path="../data/val_neg_df.parquet", index=False)