# Negative sampling


More labeled data can tremendously help a model learn. In this notebook we prepare negative samples for each user from the items that user has not interacted with. The unseen items are sampled based on how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns of user behavior rather than being biased by popularity.


# Set up


In [None]:
import sys

import pandas as pd
from loguru import logger

sys.path.insert(0, "..")
from src.cfg import ConfigLoader
from src.negative_sampling import add_features_to_neg_df, generate_negative_samples

# Controller


In [None]:
# Load the configuration from ../cfg/common.yaml; the trailing bare `cfg`
# renders the loaded values as the cell output.
cfg = ConfigLoader("../cfg/common.yaml")
cfg

{
  "random_seed": 41,
  "root_dir": "/home/dvq/frostmourne/recsys-blog/1-seq-model",
  "data": {
    "hf_datasets": {
      "name": "McAuley-Lab/Amazon-Reviews-2023",
      "mcauley_variant": "Books"
    },
    "train_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/train.parquet",
    "val_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/val.parquet",
    "idm_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/idm.json",
    "metadata_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/metadata.parquet",
    "train_features_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/train_features.parquet",
    "val_features_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/val_features.parquet",
    "full_features_neg_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/full_features_neg_sampling_df.parquet",
    "train_features_neg_fp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/data/train_features_neg_df.parquet",
    "val_features_neg_fp": "

# Test implementation


In [3]:
# Toy interaction log used to sanity-check the sampling helpers.
# Each tuple is (user_indice, item_indice, rating, timestamp).
interactions = [
    # user 1
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    # user 2
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    # user 3
    (3, 105, 1, 1),
    (3, 106, 2, 5),
]

# Tabular form; rating / timestamp column names come from the config.
df = pd.DataFrame(
    data=interactions,
    columns=[
        "user_indice",
        "item_indice",
        cfg.data.rating_col,
        cfg.data.timestamp_col,
    ],
)

In [4]:
# Inspect the toy interactions frame.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Smoke-test the two helpers on the toy frame.
# Column names are passed explicitly (same call shape as the full-data run further
# down) so the test does not rely on the helpers' default column names, and the
# timestamp column is taken from the config because that is how `df` was built.
neg_df = generate_negative_samples(
    df,
    "user_indice",
    "item_indice",
    cfg.data.rating_col,
)
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", cfg.data.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Order by user, then by timestamp. The negatives shown in the output all carry the
# same rating value, so the rating column does nothing as a secondary sort key.
neg_df.sort_values(["user_indice", cfg.data.timestamp_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,105,0,1
1,1,104,0,2
2,1,106,0,4
3,2,106,0,1
4,2,105,0,2
5,3,101,0,1
6,3,102,0,5


# Load data


In [7]:
# Read the feature frames from the paths declared in the config instead of
# hard-coded relative paths, matching how the outputs are persisted at the end
# of the notebook.
train_df = pd.read_parquet(cfg.data.train_features_fp)
val_df = pd.read_parquet(cfg.data.val_features_fp)

In [None]:
# The split is purely temporal: every validation interaction must come strictly
# after the last training interaction.
assert val_df[cfg.data.timestamp_col].min() > train_df[cfg.data.timestamp_col].max()

# Cutoff one tick after the last training timestamp; used later to split the
# combined data back into train and validation.
val_timestamp = train_df[cfg.data.timestamp_col].max() + 1
logger.info(f"{val_timestamp=}")

[32m2025-03-09 15:36:54.700[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=1628641686215[0m


In [9]:
# Stack train and validation row-wise into a single frame (each part keeps its
# own row index); negative sampling below runs on this combined frame.
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AE224PFXAEAT66IXX43GRJSWHXCA,0399159312,2.0,1373291889000,0,1251,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE224PFXAEAT66IXX43GRJSWHXCA,B000FA5TTW,1.0,1382077065000,0,3363,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE224PFXAEAT66IXX43GRJSWHXCA,030758836X,1.0,1424138603000,0,499,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE224PFXAEAT66IXX43GRJSWHXCA,B00MSRW6SM,4.0,1437924147000,0,5410,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 125..."
4,AE224PFXAEAT66IXX43GRJSWHXCA,B00A18VD7A,1.0,1464603674000,0,4639,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1251.0, 3..."
...,...,...,...,...,...,...,...
3565,AHZLM4RDKSICEFEAYEQQRZW45BPA,B08CV9SPDQ,5.0,1657516186943,19672,7335,"[-1, -1, -1, -1, 6121, 5345, 5735, 6479, 6758,..."
3566,AHZLQPSPG675BABC5R5NJW6KG3WQ,B07D6PZ6P1,5.0,1635813519329,19673,6837,"[-1, -1, -1, -1, -1, 4387, 4340, 4578, 6829, 7..."
3567,AHZNQ34GWKKLJN53IDXLAX22OBJQ,B07ZJ2VHBB,5.0,1652978096579,19682,7208,"[-1, -1, -1, -1, -1, 4195, 7126, 7175, 6817, 6..."
3568,AHZNQ34GWKKLJN53IDXLAX22OBJQ,B00PG8UCGS,5.0,1654707732874,19682,5521,"[-1, -1, -1, -1, 4195, 7126, 7175, 6817, 6807,..."


In [10]:
# Extra columns to carry over onto each negative row. The user id column name is
# read from the config, consistent with the rest of the notebook, rather than
# being hard-coded.
features = ["item_sequence", cfg.data.user_col]

# Sample negatives over train + validation together, labelled with rating 0 and
# seeded from the config for reproducibility.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    cfg.data.rating_col,
    neg_label=0,
    seed=cfg.run.random_seed,
)

# Attach a timestamp and the `features` columns to the negatives.
# NOTE(review): judging by the rendered output, each negative reuses the timestamp
# and item_sequence of one of the user's positive rows - confirm in
# src/negative_sampling.
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", cfg.data.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/19734 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,2679,0,1373291889000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AE224PFXAEAT66IXX43GRJSWHXCA
1,0,496,0,1382077065000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AE224PFXAEAT66IXX43GRJSWHXCA
2,0,5550,0,1424138603000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AE224PFXAEAT66IXX43GRJSWHXCA
3,0,476,0,1437924147000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 125...",AE224PFXAEAT66IXX43GRJSWHXCA
4,0,1231,0,1464603674000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1251.0, 3...",AE224PFXAEAT66IXX43GRJSWHXCA
...,...,...,...,...,...,...
197989,19733,1024,0,1508089337653,"[-1.0, -1.0, -1.0, -1.0, -1.0, 5149.0, 4095.0,...",AHZZZ6UASY7CGOTGP5BH5637FMPA
197990,19733,21,0,1521230143557,"[-1.0, -1.0, -1.0, -1.0, 5149.0, 4095.0, 6073....",AHZZZ6UASY7CGOTGP5BH5637FMPA
197991,19733,1278,0,1534867184329,"[-1.0, -1.0, -1.0, 5149.0, 4095.0, 6073.0, 582...",AHZZZ6UASY7CGOTGP5BH5637FMPA
197992,19733,4990,0,1534867223318,"[-1.0, -1.0, 5149.0, 4095.0, 6073.0, 5821.0, 6...",AHZZZ6UASY7CGOTGP5BH5637FMPA


# Join with features

Populate the item feature values for the newly created negative samples.


In [11]:
# Item features: every column of full_df that is not a user / interaction-level
# column (ids, indices, label, timestamp, or one of the carried-over `features`).

not_item_feature_cols = (
    cfg.data.user_col,
    "user_indice",
    "item_indice",
    cfg.data.rating_col,
    cfg.data.timestamp_col,
    *features,
)
# Boolean-mask the column index; the original column order is preserved.
item_features = full_df.columns[
    ~full_df.columns.isin(not_item_feature_cols)
].tolist()
item_features

['parent_asin']

In [12]:
# Look up the item-level features for every sampled negative item.
# Any item feature columns already present on neg_ts_df are dropped first, which
# makes the cell safe to re-run: without this, a second execution would merge
# again and produce `_x` / `_y` suffixed duplicates. On a first run the drop is
# a no-op.
neg_ts_df = pd.merge(
    neg_ts_df.drop(columns=item_features, errors="ignore"),
    # One row per item so the join stays many-to-one (enforced by `validate`).
    full_df[["item_indice", *item_features]].drop_duplicates(subset=["item_indice"]),
    how="left",
    on=["item_indice"],
    validate="m:1",
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin
0,0,2679,0,1373291889000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AE224PFXAEAT66IXX43GRJSWHXCA,1401324649
1,0,496,0,1382077065000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AE224PFXAEAT66IXX43GRJSWHXCA,0307476073
2,0,5550,0,1424138603000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AE224PFXAEAT66IXX43GRJSWHXCA,B00R04GCMY
3,0,476,0,1437924147000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 125...",AE224PFXAEAT66IXX43GRJSWHXCA,0307352145
4,0,1231,0,1464603674000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1251.0, 3...",AE224PFXAEAT66IXX43GRJSWHXCA,0399155988
...,...,...,...,...,...,...,...
197989,19733,1024,0,1508089337653,"[-1.0, -1.0, -1.0, -1.0, -1.0, 5149.0, 4095.0,...",AHZZZ6UASY7CGOTGP5BH5637FMPA,038533348X
197990,19733,21,0,1521230143557,"[-1.0, -1.0, -1.0, -1.0, 5149.0, 4095.0, 6073....",AHZZZ6UASY7CGOTGP5BH5637FMPA,0060817089
197991,19733,1278,0,1534867184329,"[-1.0, -1.0, -1.0, 5149.0, 4095.0, 6073.0, 582...",AHZZZ6UASY7CGOTGP5BH5637FMPA,0399171614
197992,19733,4990,0,1534867223318,"[-1.0, -1.0, 5149.0, 4095.0, 6073.0, 5821.0, 6...",AHZZZ6UASY7CGOTGP5BH5637FMPA,B00EXTQRN8


In [None]:
# Spot check on one randomly drawn negative row: its item id must resolve to
# exactly one item_indice, both in the merged negatives and in the source frame,
# and the two must agree.
item = neg_ts_df.sample(n=1)[cfg.data.item_col].iloc[0]
logger.info(f"Testing mapping item_indice and {cfg.data.item_col} for item {item}...")

neg_item_indices = neg_ts_df.loc[neg_ts_df[cfg.data.item_col] == item, "item_indice"]
assert len(set(neg_item_indices)) == 1, (
    f"Mismatch {cfg.data.item_col} and item_indice in new neg_ts_df"
)

original_item_indices = full_df.loc[full_df[cfg.data.item_col] == item, "item_indice"]
assert len(set(original_item_indices)) == 1, (
    f"Mismatch {cfg.data.item_col} and item_indice at original df"
)

assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

[32m2025-03-09 15:37:07.022[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B078M5HQBX...[0m


# Concatenating positive data with negative samples


In [14]:
# Combine positives (train + validation) with the sampled negatives and shuffle.
# The frame is rebuilt from its sources instead of appending to `full_df` itself,
# so re-running this cell does not stack the negatives a second time. On a first
# run this is the same as concatenating the earlier `full_df` with `neg_ts_df`.
full_df = pd.concat([train_df, val_df, neg_ts_df], axis=0).sample(
    frac=1, replace=False, random_state=cfg.run.random_seed
)

In [15]:
# Preview the shuffled frame that now mixes positive and negative rows.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
34842,AEQ2VN26PJWEJLPY3JDCUCAISBWA,B001NLKV0O,0.0,1429791287000,3422,3737,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
130704,AGPAMWVXLFFWCBO3J4JNOYBB5CJQ,B01CXE9Q8C,4.0,1520074315327,13278,6054,"[-1.0, -1.0, -1.0, -1.0, -1.0, 3466.0, 6104.0,..."
155807,AH67WNBDE2OJZWD7UYFEBVKPQVBA,B074LBHRHQ,0.0,1625237203135,15577,6577,"[6744.0, 6808.0, 4236.0, 4838.0, 6050.0, 6454...."
17264,AEEQ76GEEJNQGYLBWII7HMALA7UQ,0143038419,0.0,1490819684000,1684,392,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
131576,AGOEVMJWD5O6VFUZLIIUJDV7TG4A,B01LZZGJ2C,0.0,1541862880051,13127,6254,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4710.0, 5225.0,..."
...,...,...,...,...,...,...,...
52966,AF3Y5JZBLG56FLUVJDGOWGDDIF2Q,B00CGZXQDU,0.0,1405652794000,5277,4842,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
19064,AEFZDTMPTPRMIYLY3LRYMBODU55A,0394800915,0.0,1444418720000,1867,1220,"[-1.0, -1.0, -1.0, -1.0, -1.0, 350.0, 2324.0, ..."
125474,AGKCP4L7I3ZWONKYFAMP4L3BIKVQ,B004SHF18M,0.0,1531794865327,12534,4133,"[3411.0, 6378.0, 5233.0, 6725.0, 6769.0, 6750...."
132003,AGQ3G5TPEQV5AF4UPHKKCPK4C27Q,B00AEDDSZW,4.0,1381858732000,13407,4669,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [16]:
# Columns that must be fully populated on both positive and negative rows.
key_cols = [
    cfg.data.user_col,
    cfg.data.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    cfg.data.rating_col,
    cfg.data.timestamp_col,
]
# Fail if any cell in those columns is missing.
has_missing_keys = full_df[key_cols].isna().to_numpy().any()
assert not has_missing_keys, "Null values found at key colums"

In [17]:
# Recall the cutoff computed earlier; it drives the split below.
val_timestamp

1628641686215

# Split back into train and validation


In [18]:
# Temporal split at val_timestamp: strictly earlier rows go to train,
# rows at or after the cutoff go to validation.
train_neg_df = full_df[full_df[cfg.data.timestamp_col] < val_timestamp]
val_neg_df = full_df[full_df[cfg.data.timestamp_col] >= val_timestamp]

In [19]:
# Preview the training portion (rows with a timestamp before val_timestamp).
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
34842,AEQ2VN26PJWEJLPY3JDCUCAISBWA,B001NLKV0O,0.0,1429791287000,3422,3737,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
130704,AGPAMWVXLFFWCBO3J4JNOYBB5CJQ,B01CXE9Q8C,4.0,1520074315327,13278,6054,"[-1.0, -1.0, -1.0, -1.0, -1.0, 3466.0, 6104.0,..."
155807,AH67WNBDE2OJZWD7UYFEBVKPQVBA,B074LBHRHQ,0.0,1625237203135,15577,6577,"[6744.0, 6808.0, 4236.0, 4838.0, 6050.0, 6454...."
17264,AEEQ76GEEJNQGYLBWII7HMALA7UQ,0143038419,0.0,1490819684000,1684,392,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
131576,AGOEVMJWD5O6VFUZLIIUJDV7TG4A,B01LZZGJ2C,0.0,1541862880051,13127,6254,"[-1.0, -1.0, -1.0, -1.0, -1.0, 4710.0, 5225.0,..."
...,...,...,...,...,...,...,...
52966,AF3Y5JZBLG56FLUVJDGOWGDDIF2Q,B00CGZXQDU,0.0,1405652794000,5277,4842,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
19064,AEFZDTMPTPRMIYLY3LRYMBODU55A,0394800915,0.0,1444418720000,1867,1220,"[-1.0, -1.0, -1.0, -1.0, -1.0, 350.0, 2324.0, ..."
125474,AGKCP4L7I3ZWONKYFAMP4L3BIKVQ,B004SHF18M,0.0,1531794865327,12534,4133,"[3411.0, 6378.0, 5233.0, 6725.0, 6769.0, 6750...."
132003,AGQ3G5TPEQV5AF4UPHKKCPK4C27Q,B00AEDDSZW,4.0,1381858732000,13407,4669,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [20]:
# Preview the validation portion (rows with a timestamp at or after val_timestamp).
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AE23RLRV25THT7OZM4T4ZJ4BMYCA,0062409212,5.0,1646772001708,12,265,"[-1, -1, -1, -1, -1, 4136, 3643, 4566, 6057, 6..."
460,AEKQREM4SYKU6HK2CSMWOYXNMCWQ,B00EA8EO00,5.0,1650134360640,2600,4956,"[-1, -1, -1, -1, 4309, 7318, 7373, 7000, 7377,..."
71,AE4YFHNIXVBWI2V4DRU5UFSTY4RQ,B08ZM7BQ5J,4.0,1630002488344,469,7380,"[5058, 5229, 2515, 1277, 1370, 1371, 6978, 698..."
51616,AF32NMJLLXFWLS6VNKOJQU2YIZFA,B000FCKIFU,0.0,1647485428978,5127,3478,"[7348, 7360, 7376, 6787, 5710, 7339, 6657, 676..."
181832,AHPDMWKQJAGCVC22GSGIU3YANSJA,B0010SKUG0,0.0,1629083240553,18127,3659,"[-1, -1, -1, -1, -1, 6455, 7070, 4669, 6955, 7..."
...,...,...,...,...,...,...,...
2643,AH25V4CIGALTU2ARMKENS4HWD76A,B07DNDY87J,5.0,1631661207397,14899,6854,"[5260, 6108, 6172, 5967, 6069, 3849, 6201, 404..."
139333,AGT55MLCXTB5AUDMC7MGG2FVUEGQ,B00AFQC4QC,0.0,1636910759987,13878,4675,"[5133, 6862, 4328, 4147, 5715, 6659, 5966, 502..."
325,AEFQQOMB5AYGR2FVV6X5OP5Y7VTQ,B00KIZQG96,5.0,1657081631273,1831,5301,"[5155, 5810, 4711, 5130, 4425, 5718, 6407, 617..."
2214,AGKILHN37242OQLPSMAMMZJA6IAQ,B07CRC52VH,3.0,1653587733450,12555,6808,"[6445, 6255, 5255, 5632, 6501, 7192, 6838, 727..."


# Checks


In [None]:
# Pick one random validation user and verify that, in the training split, exactly
# half of that user's rows are positives (rating > 0).
user = val_neg_df.sample(n=1)[cfg.data.user_col].iloc[0]
logger.info(f"Checking user {user}...")

check_df = train_neg_df[train_neg_df[cfg.data.user_col] == user].sort_values(
    by=cfg.data.timestamp_col
)
assert 2 * (check_df[cfg.data.rating_col] > 0).sum() == len(check_df), (
    "Number of pos and neg samples are not equal"
)

[32m2025-03-09 15:37:07.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AH2M44UAXGS7LL6WF72NTETJDJTQ...[0m


In [None]:
# Same user, validation side: take the first positive item (by timestamp) and make
# sure it shows up in more than 5 rows of the training split.
val_check_df = val_neg_df[val_neg_df[cfg.data.user_col] == user].sort_values(
    by=cfg.data.timestamp_col
)
item = val_check_df.loc[
    val_check_df[cfg.data.rating_col] > 0, cfg.data.item_col
].iloc[0]
logger.info(f"Checking item {item}...")

assert (train_neg_df[cfg.data.item_col] == item).sum() > 5, (
    f"Item {item} does not appear much in training data"
)

[32m2025-03-09 15:37:07.206[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B07CZHYBBP...[0m


## Random eye-ball


In [23]:
# Eyeball the sampled user's training rows, ordered by timestamp.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
149727,AH2M44UAXGS7LL6WF72NTETJDJTQ,B01C2GFP5Y,0.0,1412694841000,14986,6033,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147040,AH2M44UAXGS7LL6WF72NTETJDJTQ,B0052RDHTM,5.0,1412694841000,14986,4188,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147041,AH2M44UAXGS7LL6WF72NTETJDJTQ,141971189X,5.0,1418762348000,14986,2743,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
149728,AH2M44UAXGS7LL6WF72NTETJDJTQ,B007ZFIOUG,0.0,1418762348000,14986,4456,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
149729,AH2M44UAXGS7LL6WF72NTETJDJTQ,B007SGLZP8,0.0,1418762369000,14986,4434,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147042,AH2M44UAXGS7LL6WF72NTETJDJTQ,1419711326,5.0,1418762369000,14986,2742,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
147043,AH2M44UAXGS7LL6WF72NTETJDJTQ,B007FEFLTO,2.0,1420733615000,14986,4405,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 418..."
149730,AH2M44UAXGS7LL6WF72NTETJDJTQ,B00ENMXSF0,0.0,1420733615000,14986,4977,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 418..."
147044,AH2M44UAXGS7LL6WF72NTETJDJTQ,0425255735,5.0,1438612456000,14986,1356,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4188.0, 2..."
149731,AH2M44UAXGS7LL6WF72NTETJDJTQ,B00QQQL8JY,0.0,1438612456000,14986,5541,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4188.0, 2..."


In [24]:
# Lift the column-width limit for this one display so the item_sequence lists
# are shown in full instead of being truncated.
with pd.option_context("display.max_colwidth", None):
    display(val_check_df)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
2687,AH2M44UAXGS7LL6WF72NTETJDJTQ,B07CZHYBBP,5.0,1640225033494,14986,6819,"[-1, -1, -1, -1, 4188, 2743, 2742, 4405, 1356, 483]"
149733,AH2M44UAXGS7LL6WF72NTETJDJTQ,1400096278,0.0,1640225033494,14986,2666,"[-1, -1, -1, -1, 4188, 2743, 2742, 4405, 1356, 483]"
149734,AH2M44UAXGS7LL6WF72NTETJDJTQ,B002PMVQBI,0.0,1640225090524,14986,3800,"[-1, -1, -1, 4188, 2743, 2742, 4405, 1356, 483, 6819]"
2688,AH2M44UAXGS7LL6WF72NTETJDJTQ,B078JMZHXF,5.0,1640225090524,14986,6688,"[-1, -1, -1, 4188, 2743, 2742, 4405, 1356, 483, 6819]"
149735,AH2M44UAXGS7LL6WF72NTETJDJTQ,B001QNVPAE,0.0,1640225165915,14986,3750,"[-1, -1, 4188, 2743, 2742, 4405, 1356, 483, 6819, 6688]"
2689,AH2M44UAXGS7LL6WF72NTETJDJTQ,B072QX6TG1,5.0,1640225165915,14986,6539,"[-1, -1, 4188, 2743, 2742, 4405, 1356, 483, 6819, 6688]"
149736,AH2M44UAXGS7LL6WF72NTETJDJTQ,B00GEEB52S,0.0,1640225217482,14986,5070,"[-1, 4188, 2743, 2742, 4405, 1356, 483, 6819, 6688, 6539]"
2690,AH2M44UAXGS7LL6WF72NTETJDJTQ,B06ZZW7G42,5.0,1640225217482,14986,6472,"[-1, 4188, 2743, 2742, 4405, 1356, 483, 6819, 6688, 6539]"
2691,AH2M44UAXGS7LL6WF72NTETJDJTQ,B07MK5CTJJ,1.0,1640225333640,14986,7000,"[4188, 2743, 2742, 4405, 1356, 483, 6819, 6688, 6539, 6472]"
149737,AH2M44UAXGS7LL6WF72NTETJDJTQ,B01M7XPGYE,0.0,1640225333640,14986,6281,"[4188, 2743, 2742, 4405, 1356, 483, 6819, 6688, 6539, 6472]"


# Persist


In [25]:
# Persist the combined positive + negative frame; index=False leaves the shuffled
# row index out of the file.
full_df.to_parquet(cfg.data.full_features_neg_fp, index=False)

In [26]:
# Persist the train and validation splits to their configured paths, without the
# row index.
train_neg_df.to_parquet(cfg.data.train_features_neg_fp, index=False)
val_neg_df.to_parquet(cfg.data.val_features_neg_fp, index=False)