In [1]:
import os
import pandas as pd
import numpy as np
from loguru import logger
from collections import defaultdict
from pydantic import BaseModel

# Controller

In [2]:
class Args(BaseModel):
    testing: bool = False
    experiment_name: str = "FSDS RecSys - L5 - Reco Algo"
    run_name: str = '055-30k-users'
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = 'user_id'
    item_col: str = 'parent_asin'
    rating_col: str = 'rating'
    timestamp_col: str = 'timestamp'

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self
    
args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "experiment_name": "FSDS RecSys - L5 - Reco Algo",
  "run_name": "055-30k-users",
  "notebook_persist_dp": "/Users/dvq/frostmourne/reco-algo/notebooks/data/055-30k-users",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Sample input: List of (user_id, item_id) interactions
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
    # Add more interactions as needed
]

# Convert the list to a DataFrame for easier manipulation
df = pd.DataFrame(interactions, columns=['user_id', 'item_id', 'rating', 'timestamp'])

In [4]:
def generate_negative_samples(
    df,
    user_col='user_id',
    item_col='item_id',
    label_col='rating',
    neg_label=0,
    seed=None,
    progress_bar_type='tqdm'  # Options: 'tqdm', 'tqdm_notebook', None
):
    """
    Optimized function to generate negative samples for a user-item interaction DataFrame.
    """
    
    # Handle random seed
    if seed is not None:
        np.random.seed(seed)
    
    # Import tqdm based on the progress_bar_type
    if progress_bar_type == 'tqdm':
        try:
            from tqdm import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm is not installed. Please install it using 'pip install tqdm'.")
    elif progress_bar_type == 'tqdm_notebook':
        try:
            from tqdm.notebook import tqdm
            tqdm_bar = tqdm
        except ImportError:
            raise ImportError("tqdm.notebook is not available. Please install it using 'pip install tqdm'.")
    elif progress_bar_type is None:
        # Define a dummy tqdm function that does nothing
        def tqdm_bar(iterable, **kwargs):
            return iterable
    else:
        raise ValueError("Invalid progress_bar_type. Choose 'tqdm', 'tqdm_notebook', or None.")
    
    # Calculate item popularity based on the number of interactions
    item_popularity = df[item_col].value_counts()
    
    # Define all unique items from the DataFrame
    items = item_popularity.index.values
    all_items_set = set(items)
    
    # Create a user-item interaction dictionary
    user_item_dict = df.groupby(user_col)[item_col].apply(set).to_dict()
    
    # Prepare items list and corresponding popularity array
    popularity = item_popularity.values.astype(np.float64)
    
    # Calculate sampling probabilities based on item popularity
    total_popularity = popularity.sum()
    if total_popularity == 0:
        sampling_probs = np.ones(len(items)) / len(items)
    else:
        sampling_probs = popularity / total_popularity
    
    # Create item to index mapping for quick access
    item_to_index = {item: idx for idx, item in enumerate(items)}
    
    # Initialize a list to store negative samples
    negative_samples = []
    
    # Initialize the progress bar
    total_users = len(user_item_dict)
    progress_bar = tqdm_bar(user_item_dict.items(), total=total_users, desc="Generating Negative Samples")
    
    for user, pos_items in progress_bar:
        num_pos = len(pos_items)
        
        # Identify items not interacted with by the user
        negative_candidates = all_items_set - pos_items
        num_neg_candidates = len(negative_candidates)
        
        if num_neg_candidates == 0:
            # User has interacted with all items, skip negative sampling
            continue
        
        # Determine the number of negative samples to generate
        num_neg = min(num_pos, num_neg_candidates)
        
        # Convert set to list for indexing
        negative_candidates_list = list(negative_candidates)
        
        # Get the indices and probabilities of negative candidates
        candidate_indices = [item_to_index[item] for item in negative_candidates_list]
        candidate_probs = sampling_probs[candidate_indices]
        candidate_probs /= candidate_probs.sum()
        
        # Sample negative items without replacement
        sampled_items = np.random.choice(
            negative_candidates_list, size=num_neg, replace=False, p=candidate_probs
        )
        
        # Append the sampled negative items to the list
        negative_samples.extend([(user, item) for item in sampled_items])
    
    # Convert negative samples to a DataFrame
    df_negative = pd.DataFrame(negative_samples, columns=[user_col, item_col])
    df_negative[label_col] = neg_label  # Assign label for negative samples
    
    return df_negative

def add_features_to_neg_df(pos_df, neg_df, user_col, timestamp_col, feature_cols=[]):
    neg_df = neg_df.assign(
        timestamp_pseudo=lambda df: df.groupby('user_id').cumcount() + 1
    )
    neg_df = (
        pd.merge(
            neg_df,
            pos_df.assign(
                timestamp_pseudo=lambda df: df.groupby([user_col])[timestamp_col].rank(method='first')   
            )[[user_col, timestamp_col, 'timestamp_pseudo', *feature_cols]],
            how='left',
            on=[user_col, 'timestamp_pseudo']
        )
        .drop(columns=['timestamp_pseudo'])
    )
    return neg_df

In [5]:
neg_df = generate_negative_samples(df, progress_bar_type='tqdm_notebook')
neg_df = add_features_to_neg_df(df, neg_df, 'user_id', 'timestamp')

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
neg_df.sort_values(['user_id', 'rating'])

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,105,0,1
4,2,106,0,2
5,3,102,0,1
6,3,103,0,5


# Load data

In [7]:
train_df = pd.read_parquet("../data/train_features.parquet")
val_df = pd.read_parquet("../data/val_features.parquet")

In [8]:
assert (val_df[args.timestamp_col].min() - train_df[args.timestamp_col].max()) > 0
val_timestamp = train_df[args.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2024-09-20 13:25:05.453[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=np.int64(1628643301628)[0m


In [9]:
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AH7OMXSRNKMM3GF6PQGHQEU4XYAQ,0449208281,5.0,854697682000,4627,31034,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,038097505X,5.0,871307181000,22546,193195,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,0345311809,5.0,873311379000,22546,1640,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AEVS5LDDBLZXXNAP66SBQFY5ZY3A,014014739X,5.0,876331947000,22546,20797,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,AE6FP5GZNTBK6QKAGXFEDIJYO6MA,0446343552,5.0,876423124000,25431,183951,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
...,...,...,...,...,...,...,...
36437,AGYS63VGRKADXYMC57GEP6MG6W7Q,164845044X,5.0,1657997738446,13970,59763,"[51819, 123868, 37386, 85668, 146722, 3008, 16..."
36438,AGNG7EAGWLOJPLH3AXKSJYFQIMJA,B002HMJZAA,5.0,1657997929759,23545,152753,"[88586, 79531, 190768, 92009, 78222, 52838, 13..."
36439,AFG6YQ3GOY7TVFKQ3SKDVS6Q6RDQ,B07R3QYGHY,4.0,1657998389024,20880,152734,"[-1, -1, -1, -1, 24006, 136264, 49218, 6292, 8..."
36440,AHNN7AG7AL5Z7ZTX3ES5A4ZOQWUA,B01D1LNYWK,5.0,1657999964843,14334,162235,"[-1, -1, -1, -1, -1, -1, 76258, 18569, 643, 17..."


In [10]:
neg_df = generate_negative_samples(full_df, args.user_col, args.item_col, args.rating_col, neg_label=0, seed=args.random_seed, progress_bar_type='tqdm_notebook')
features = ["user_indice", "item_indice", "item_sequence"]
neg_ts_df = add_features_to_neg_df(full_df, neg_df, args.user_col, args.timestamp_col, features)
neg_ts_df

Generating Negative Samples:   0%|          | 0/30000 [00:00<?, ?it/s]

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AE222H3FGXWLHRFUMGMS2RR57NDQ,006156284X,0,1381407521000,26431,194991,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE222H3FGXWLHRFUMGMS2RR57NDQ,0380698439,0,1383619827000,26431,133527,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE222H3FGXWLHRFUMGMS2RR57NDQ,1338108921,0,1402425068000,26431,19682,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE222H3FGXWLHRFUMGMS2RR57NDQ,1451635087,0,1407153130000,26431,79476,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 194..."
4,AE222H3FGXWLHRFUMGMS2RR57NDQ,B08PMJ754Z,0,1430941166000,26431,155259,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 194991.0,..."
...,...,...,...,...,...,...,...
493076,AHZZVLUJPBUZNQHJPA63YOVEUSTQ,1423146727,0,1638997299651,27525,192112,"[93340, 56303, 171848, 148889, 43257, 122795, ..."
493077,AHZZVLUJPBUZNQHJPA63YOVEUSTQ,0345319656,0,1642305182157,27525,95940,"[56303, 171848, 148889, 43257, 122795, 166400,..."
493078,AHZZVLUJPBUZNQHJPA63YOVEUSTQ,1607056534,0,1645736404588,27525,80846,"[171848, 148889, 43257, 122795, 166400, 132619..."
493079,AHZZVLUJPBUZNQHJPA63YOVEUSTQ,0806142669,0,1651782370727,27525,93132,"[148889, 43257, 122795, 166400, 132619, 56389,..."


In [11]:
full_df = pd.concat([full_df, neg_ts_df], axis=0).sample(frac=1, replace=False, random_state=args.random_seed)

In [12]:
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
176249,AEAOR7IVNZTLAXQUUA6LYUSQ24MQ,1682304302,5.0,1468841055000,14505,114822,"[-1.0, -1.0, 126052.0, 37548.0, 167590.0, 9967..."
64320,AGJP4HM7Z5AWUWZJSQRVGUZZ4S2A,0486243575,2.0,1366144099000,2558,67689,"[165589.0, 99507.0, 74725.0, 170827.0, 105490...."
150123,AGO6NNQQB2A5GZV2Q6HSNGI4Z2YQ,B0118KBFW2,5.0,1448913151000,20795,2825,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
283040,AGETC2KF5CHVA33FN5P33GH67EEQ,0800736478,0.0,1572579530965,14974,122858,"[159848.0, 173033.0, 30692.0, 171682.0, 86133...."
307366,AE3PJAYJGXWYMUJ427PP4IM44I5A,1948677164,4.0,1555177119487,21067,187670,"[39873.0, 162910.0, 37645.0, 107142.0, 150220...."
...,...,...,...,...,...,...,...
250960,AFAV3FFS33XTWQJAJHH4ELXTH7BQ,1101997230,5.0,1521117904352,27010,134345,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
248265,AG3XM7WQQXETKECJJKZ5TLSFAVEA,0545153530,0.0,1582292967012,26000,175179,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
354675,AGXVLUWYA4HBQRG6GPIAM3NGJNJA,B0721QKKVD,0.0,1543295501971,7433,192490,"[112661.0, 70523.0, 51377.0, 119828.0, 12492.0..."
163210,AFF4L7N55E3G4QYTTI2FRIRMYSYA,1940941717,0.0,1643391149968,15502,134157,"[153373, 163647, 7392, 11930, 84546, 1217, 680..."


In [13]:
full_df.to_parquet('../data/full_features_neg_sampling_df.parquet', index=False)

In [14]:
val_timestamp

np.int64(1628643301628)

In [15]:
train_neg_df = full_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
val_neg_df = full_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]

In [16]:
train_neg_df.to_parquet("../data/train_features_neg_df.parquet", index=False)
val_neg_df.to_parquet("../data/val_features_neg_df.parquet", index=False)

In [17]:
full_df.loc[lambda df: df['user_id'].eq('AEYYFUHPXZHZXW2NIDV723D5LNZQ')].sort_values('timestamp')

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
115362,AEYYFUHPXZHZXW2NIDV723D5LNZQ,0997431520,0.0,1571063783442,19208,72639,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
336734,AEYYFUHPXZHZXW2NIDV723D5LNZQ,0142405965,5.0,1571063783442,19208,72639,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
115363,AEYYFUHPXZHZXW2NIDV723D5LNZQ,1250777887,0.0,1598565454026,19208,194846,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
385916,AEYYFUHPXZHZXW2NIDV723D5LNZQ,0991243560,5.0,1598565454026,19208,194846,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
115364,AEYYFUHPXZHZXW2NIDV723D5LNZQ,B0786WP84L,0.0,1601576819114,19208,164149,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
392519,AEYYFUHPXZHZXW2NIDV723D5LNZQ,1524855154,5.0,1601576819114,19208,164149,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
392520,AEYYFUHPXZHZXW2NIDV723D5LNZQ,0062861867,5.0,1601576901716,19208,90068,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 726..."
115365,AEYYFUHPXZHZXW2NIDV723D5LNZQ,B002YEQZXQ,0.0,1601576901716,19208,90068,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 726..."
412342,AEYYFUHPXZHZXW2NIDV723D5LNZQ,1524744603,5.0,1611182168373,19208,184161,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 72639.0, ..."
115366,AEYYFUHPXZHZXW2NIDV723D5LNZQ,0425235521,0.0,1611182168373,19208,184161,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 72639.0, ..."
