# Data prep

In [3]:
import os
import sys

import pandas as pd
import numpy as np
from loguru import logger
from pydantic import BaseModel
from tqdm import tqdm
sys.path.insert(0, "..")

In [4]:
class Args(BaseModel):
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

args = Args()

print(args.model_dump_json(indent=2))

{
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


## Negative sampling

In [5]:
train_df = pd.read_parquet("../data_for_ai/interim/train_sample_interactions_16407u.parquet")
val_df = pd.read_parquet("../data_for_ai/interim/val_sample_interactions_16407u.parquet")
train_df.head(3)

Unnamed: 0,user_id,parent_asin,rating,timestamp
3194,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B06XKCPK5W,2.0,2012-06-11 16:41:10
3199,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B000CKVOOY,3.0,2012-08-02 02:04:13
3200,AEYGPUCRKH7G4VM22FM3VAKSQ23Q,B006GWO5WK,5.0,2012-09-15 16:34:46


In [22]:
# Test data
train_df = pd.DataFrame(
    {
        args.user_col: [1, 2, 3, 1],
        args.item_col: [4, 5, 6, 7],
        args.rating_col: [1, 1, 1, 5],
        args.timestamp_col: [1, 2, 3, 4],
        'point_int_time_feat': ["Goat M10", "hehe2", "hehe3", "new_value"],

    }
)

In [23]:
# Groupby user to get the set of item that each user has interacted with
interacted = train_df.groupby(args.user_col)[args.item_col].apply(set).to_dict()

In [24]:
# Groupby item to get the popularity of each item
item_popularity = train_df.groupby(args.item_col)[args.user_col].count()

# Alternative way
item_popularity = train_df[args.item_col].value_counts()

In [25]:
items = item_popularity.index.values
items_set = set(items)
popularity = item_popularity.values.astype(np.float64)

In [26]:
sampling_probs = popularity / np.sum(popularity)
sampling_probs

array([0.25, 0.25, 0.25, 0.25])

In [27]:
# Build up item2idx mapping
item_to_idx = {item: idx for idx, item in enumerate(items)}

In [28]:
print(interacted)

{1: {4, 7}, 2: {5}, 3: {6}}


In [57]:
# Get negative samples for each row in order to get the corresponding point in time features
tqdm.pandas(desc="Generating negative samples")
negative_samples = []
ratio = 2
features = []

def generate_negative_samples(row):
    user_id = row[args.user_col]

    pos_items = interacted[user_id]
    
    negative_candidates = items_set  - pos_items
    num_negative_candidates = len(negative_candidates)

    if num_negative_candidates ==0:
        print(f"User {user_id} has interacted with all items, skipping...")
        return []
    
    num_negative = min(ratio, num_negative_candidates)

    negative_candidates_list = list(negative_candidates)

    negative_candidates_list_idx = [item_to_idx[item] for item in negative_candidates_list]
    negative_candidates_sampling_probs = sampling_probs[negative_candidates_list_idx]
    negative_candidates_sampling_probs /= np.sum(negative_candidates_sampling_probs)

    negative_items = np.random.choice(
        negative_candidates_list,
        size=num_negative,
        replace=False,
        p=negative_candidates_sampling_probs
    )
    return negative_items


df_negative = (train_df.assign(
    negative_sample= lambda df: df.progress_apply(generate_negative_samples, axis=1))
    .explode("negative_sample")
    .drop([args.item_col], axis = 1)
    .rename(columns={"negative_sample": args.item_col})
    .assign(rating=0)
    [[args.timestamp_col, args.user_col, args.item_col, args.rating_col, *features]]
)

Generating negative samples: 100%|██████████| 4/4 [00:00<00:00, 4943.20it/s]


In [58]:
# negative_samples = []
# ratio = 1
# for user, pos_items in tqdm(interacted.items()):
#     num_pos = len(pos_items)

#     negative_candidates = items_set - pos_items
    
#     num_negative_candidates = len(negative_candidates)

#     if num_negative_candidates ==0:
#         print(f"User {user} has interacted with all items, skipping...")
#         continue

#     num_negative = min(num_pos * ratio, num_negative_candidates)

#     negative_candidates_list = list(negative_candidates)

#     negative_candidates_list_idx = [item_to_idx[item] for item in negative_candidates_list]
#     negative_candidates_sampling_probs = sampling_probs[negative_candidates_list_idx]
#     negative_candidates_sampling_probs /= np.sum(negative_candidates_sampling_probs)
    
#     negative_items = np.random.choice(
#         negative_candidates_list,
#         size=num_negative,
#         replace=False,
#         p=negative_candidates_sampling_probs,
#     )

#     negative_samples.extend([(user, item) for item in negative_items])

In [59]:
# df_negative = pd.DataFrame(negative_samples, columns=[args.user_col, args.item_col])
# df_negative["rating"] = 0

In [64]:
df_negative

Unnamed: 0,timestamp,user_id,parent_asin,rating
0,1,1,6,0
0,1,1,5,0
1,2,2,7,0
1,2,2,4,0
2,3,3,7,0
2,3,3,5,0
3,4,1,6,0
3,4,1,5,0


In [33]:
# # Join with positive df
# df_negative = df_negative.assign(
#     timestamp_pseudo = lambda df: df.groupby(args.user_col).cumcount() + 1,
# )
# df_negative

In [63]:
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,point_int_time_feat
0,1,4,1,1,Goat M10
1,2,5,1,2,hehe2
2,3,6,1,3,hehe3
3,1,7,5,4,new_value


In [61]:
pd.merge(
    df_negative,
    train_df,
    how="left",
    on=[args.user_col, args.item_col, args.timestamp_col],
    validate="m:m"
)

Unnamed: 0,timestamp,user_id,parent_asin,rating_x,rating_y,point_int_time_feat
0,1,1,6,0,,
1,1,1,5,0,,
2,2,2,7,0,,
3,2,2,4,0,,
4,3,3,7,0,,
5,3,3,5,0,,
6,4,1,6,0,,
7,4,1,5,0,,
