# Negative sampling

More labeled data can tremendously help a model learn. In this notebook we prepare negative samples for each user from the items that user has not interacted with. The unseen items are sampled based on how frequently they appear in the training dataset. This is an intentional choice to make the negative samples harder, and hence potentially more useful. It also forces the model to learn relevant patterns in user behavior rather than being biased by popularity.

# Set up

In [None]:
import os
import sys
from typing import Optional

import pandas as pd
from loguru import logger
from pydantic import BaseModel

sys.path.insert(0, "..")
from src.negative_sampling import (add_features_to_neg_df,
                                   generate_negative_samples)

# Controller

In [2]:
class Args(BaseModel):
    """Run-level settings for this notebook.

    Holds the run name, the random seed, and the names of the user, item,
    rating and timestamp columns. Call `init()` once after construction to
    resolve and create the persist directory.
    """

    testing: bool = False
    run_name: str = "002-negative-sample"
    # Unset until `init()` runs, so the annotation must admit None.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Column names used to address the interaction frames.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    def init(self):
        """Resolve `data/<run_name>` to an absolute path, create it, and return self.

        The path is relative to the current working directory at call time.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration, create its persist directory, then echo the settings.
args = Args()
args.init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "002-negative-sample",
  "notebook_persist_dp": "/home/dvq/frostmourne/recsys-blog/1-seq-model/notebooks/data/002-negative-sample",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp"
}


# Test implementation

In [3]:
# Toy input: one (user index, item index, rating, timestamp) tuple per interaction,
# small enough that the sampler's output can be checked by eye.
interactions = [
    (1, 101, 1, 1),
    (1, 102, 2, 2),
    (1, 103, 3, 4),
    (2, 101, 4, 1),
    (2, 104, 5, 2),
    (3, 105, 1, 1),
    (3, 106, 2, 5),
]

# Name the four fields so they can be addressed by column.
toy_columns = ["user_indice", "item_indice", args.rating_col, args.timestamp_col]
df = pd.DataFrame(interactions, columns=toy_columns)

In [4]:
# Show the toy interactions frame built in the previous cell.
df

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,101,1,1
1,1,102,2,2
2,1,103,3,4
3,2,101,4,1
4,2,104,5,2
5,3,105,1,1
6,3,106,2,5


In [5]:
# Run the sampler on the toy frame with the same explicit arguments as the real
# run further down, so this smoke test exercises the same call path and is
# reproducible through the configured seed.
neg_df = generate_negative_samples(
    df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
# The toy frame was built with args.timestamp_col, so address it the same way
# instead of hard-coding the column name.
neg_df = add_features_to_neg_df(df, neg_df, "user_indice", args.timestamp_col)

Generating Negative Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Every negative row carries the same label, so the rating column cannot order
# rows within a user; order by timestamp instead.
neg_df.sort_values(["user_indice", args.timestamp_col])

Unnamed: 0,user_indice,item_indice,rating,timestamp
0,1,104,0,1
1,1,105,0,2
2,1,106,0,4
3,2,106,0,1
4,2,105,0,2
5,3,101,0,1
6,3,104,0,5


# Load data

In [7]:
# Read the train and validation feature frames written by an earlier step.
train_df, val_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../data/train_features.parquet",
        "../data/val_features.parquet",
    )
)

In [8]:
# Validation must begin strictly after the last training event; the split by
# timestamp later in the notebook relies on that gap.
first_val_ts = val_df[args.timestamp_col].min()
last_train_ts = train_df[args.timestamp_col].max()
assert (first_val_ts - last_train_ts) > 0

# Smallest timestamp that counts as validation.
val_timestamp = last_train_ts + 1
logger.info(f"{val_timestamp=}")

[32m2025-03-08 20:32:06.724[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mval_timestamp=1628641686215[0m


In [9]:
# Stack the training rows on top of the validation rows (row-wise concat).
# Each split keeps its own index labels.
full_df = pd.concat(objs=[train_df, val_df])
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AE224PFXAEAT66IXX43GRJSWHXCA,0399159312,2.0,1373291889000,6822,4732,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
1,AE224PFXAEAT66IXX43GRJSWHXCA,B000FA5TTW,1.0,1382077065000,6822,1581,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
2,AE224PFXAEAT66IXX43GRJSWHXCA,030758836X,1.0,1424138603000,6822,2712,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
3,AE224PFXAEAT66IXX43GRJSWHXCA,B00MSRW6SM,4.0,1437924147000,6822,4217,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 473..."
4,AE224PFXAEAT66IXX43GRJSWHXCA,B00A18VD7A,1.0,1464603674000,6822,6558,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4732.0, 1..."
...,...,...,...,...,...,...,...
3565,AHZLM4RDKSICEFEAYEQQRZW45BPA,B08CV9SPDQ,5.0,1657516186943,14008,1406,"[-1, -1, -1, -1, 6183, 7352, 3278, 4810, 2058,..."
3566,AHZLQPSPG675BABC5R5NJW6KG3WQ,B07D6PZ6P1,5.0,1635813519329,13829,6282,"[-1, -1, -1, -1, -1, 5323, 1630, 1844, 230, 5178]"
3567,AHZNQ34GWKKLJN53IDXLAX22OBJQ,B07ZJ2VHBB,5.0,1652978096579,17280,4442,"[-1, -1, -1, -1, -1, 249, 1297, 3552, 4954, 2316]"
3568,AHZNQ34GWKKLJN53IDXLAX22OBJQ,B00PG8UCGS,5.0,1654707732874,17280,5357,"[-1, -1, -1, -1, 249, 1297, 3552, 4954, 2316, ..."


In [10]:
# Extra columns handed to add_features_to_neg_df so they end up on the negative
# rows. The user id column is taken from the config rather than hard-coded.
features = ["item_sequence", args.user_col]

# NOTE(review): the sampling pool is train + validation combined (full_df),
# whereas the introduction describes sampling by frequency in the training
# data only -- confirm this is intended.
neg_df = generate_negative_samples(
    full_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    seed=args.random_seed,
)
neg_ts_df = add_features_to_neg_df(
    full_df, neg_df, "user_indice", args.timestamp_col, features
)
neg_ts_df

Generating Negative Samples:   0%|          | 0/19734 [00:00<?, ?it/s]

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id
0,0,1900,0,1420772333000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q
1,0,348,0,1458435086000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q
2,0,5001,0,1521759867017,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q
3,0,331,0,1526603642593,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 734...",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q
4,0,924,0,1528677119426,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 7344.0, 5...",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q
...,...,...,...,...,...,...
197989,19733,2881,0,1508781759311,"[1603.0, 3279.0, 6477.0, 4321.0, 6711.0, 39.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q
197990,19733,3268,0,1514090806196,"[3279.0, 6477.0, 4321.0, 6711.0, 39.0, 1790.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q
197991,19733,2524,0,1529342700605,"[6477.0, 4321.0, 6711.0, 39.0, 1790.0, 4012.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q
197992,19733,4487,0,1529686029035,"[4321.0, 6711.0, 39.0, 1790.0, 4012.0, 4529.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q


# Join with features

Populate the feature values for the newly created negative samples.

In [11]:
# Item features

# Ids, label, timestamp and the per-interaction features are not item
# attributes; every remaining column of full_df is treated as one.
id_and_label_cols = (
    args.user_col,
    "user_indice",
    "item_indice",
    args.rating_col,
    args.timestamp_col,
)
not_item_feature_cols = id_and_label_cols + tuple(features)
item_features = list(
    filter(lambda name: name not in not_item_feature_cols, full_df.columns)
)
item_features

['parent_asin']

In [12]:
# One row per item_indice carrying the item-level columns found above.
item_lookup = full_df[["item_indice", *item_features]].drop_duplicates(
    subset=["item_indice"]
)

# Drop item columns left behind by a previous run of this cell before merging.
# Without this, re-running the cell would merge onto a frame that already has
# them and produce suffixed duplicates (e.g. parent_asin_x / parent_asin_y).
neg_ts_df = pd.merge(
    neg_ts_df.drop(columns=item_features, errors="ignore"),
    item_lookup,
    how="left",
    on=["item_indice"],
    validate="m:1",  # each negative row must match at most one lookup row
)
neg_ts_df

Unnamed: 0,user_indice,item_indice,rating,timestamp,item_sequence,user_id,parent_asin
0,0,1900,0,1420772333000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q,B017IKQW56
1,0,348,0,1458435086000,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q,B000UZQHWU
2,0,5001,0,1521759867017,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q,B01208O00U
3,0,331,0,1526603642593,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 734...",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q,1501154656
4,0,924,0,1528677119426,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 7344.0, 5...",AGQ2HV7AUJUPQ4ONNIJE5A7LZG2Q,B083P1J5QX
...,...,...,...,...,...,...,...
197989,19733,2881,0,1508781759311,"[1603.0, 3279.0, 6477.0, 4321.0, 6711.0, 39.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q,B005OCYR1C
197990,19733,3268,0,1514090806196,"[3279.0, 6477.0, 4321.0, 6711.0, 39.0, 1790.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q,0812550307
197991,19733,2524,0,1529342700605,"[6477.0, 4321.0, 6711.0, 39.0, 1790.0, 4012.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q,1501115073
197992,19733,4487,0,1529686029035,"[4321.0, 6711.0, 39.0, 1790.0, 4012.0, 4529.0,...",AHAIHRJ3ICMXVFIBUWYQGVE2564Q,B013YQA2C2


In [13]:
# Spot check: pick one item (seeded, so a re-run inspects the same item) and
# verify it maps to a single item_indice in both the negatives and the
# original frame, and that the two agree.
item = neg_ts_df.sample(n=1, random_state=args.random_seed)[args.item_col].values[0]
logger.info(f"Testing mapping item_indice and {args.item_col} for item {item}...")
neg_item_indices = neg_ts_df.loc[neg_ts_df[args.item_col].eq(item), "item_indice"]
assert (
    len(set(neg_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice in new neg_ts_df"
original_item_indices = full_df.loc[full_df[args.item_col].eq(item), "item_indice"]
assert (
    len(set(original_item_indices)) == 1
), f"Mismatch {args.item_col} and item_indice at original df"
assert original_item_indices.iloc[0] == neg_item_indices.iloc[0]

# Exhaustive version of the same check: across both frames, the distinct
# (item id, item_indice) pairs must contain each item id exactly once.
item_index_pairs = pd.concat(
    [
        neg_ts_df[[args.item_col, "item_indice"]],
        full_df[[args.item_col, "item_indice"]],
    ],
    axis=0,
).drop_duplicates()
assert item_index_pairs[
    args.item_col
].is_unique, f"Some {args.item_col} values map to more than one item_indice"

[32m2025-03-08 20:32:18.985[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTesting mapping item_indice and parent_asin for item B07BDF7WKP...[0m


# Concatenating positive data with negative samples

In [14]:
# Rebuild the positives from the raw splits instead of reusing `full_df`:
# after this cell `full_df` already contains the negatives, so concatenating
# onto it again on a re-run would stack a second copy of them.
pos_df = pd.concat([train_df, val_df], axis=0)

# Append the negatives and shuffle all rows with the configured seed.
full_df = pd.concat([pos_df, neg_ts_df], axis=0).sample(
    frac=1, replace=False, random_state=args.random_seed
)

In [15]:
# Inspect the shuffled frame of positives plus negative samples.
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
34842,AH377TAQBGVIUD75XIOSNRRBUMSA,B000OIZSLE,0.0,1603811722410,3520,5441,"[-1.0, -1.0, -1.0, -1.0, 4238.0, 3266.0, 4316...."
130704,AGPAMWVXLFFWCBO3J4JNOYBB5CJQ,B01CXE9Q8C,4.0,1520074315327,6133,2684,"[-1.0, -1.0, -1.0, -1.0, -1.0, 2863.0, 5665.0,..."
155807,AGSXDUEY3XZJVJSRBQCTAPOSY2NA,1451681755,0.0,1446855213000,15471,3848,"[-1.0, -1.0, -1.0, 2423.0, 5224.0, 6745.0, 360..."
17264,AECGNMOCHNIEKROWI6NCZQE7QV3A,B001MSMULG,0.0,1412051625000,1717,4696,"[241.0, 7334.0, 3433.0, 7247.0, 1840.0, 5215.0..."
131576,AHJ4X46OBBFQFQEGKO6CYQSL7A6Q,0441016995,0.0,1540233594357,13004,4750,"[3132.0, 4886.0, 3905.0, 2187.0, 3951.0, 6240...."
...,...,...,...,...,...,...,...
52966,AEEAI7QJ6HFCN43V543MOTKNBQOA,B0043M6L22,0.0,1528288422975,5335,7093,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4528.0, 5..."
19064,AGMWE3EQOAKN467EMLZFXS5FD7FQ,0525577947,0.0,1364849772000,1895,5383,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 5802.0, 6..."
125474,AGBD2QKG2VULRDA4OKNJZUII44JA,B00UXX5BAS,0.0,1313415114000,12423,4075,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 287..."
132003,AGQ3G5TPEQV5AF4UPHKKCPK4C27Q,B00AEDDSZW,4.0,1381858732000,10563,1139,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [16]:
# Identifier, sequence, label and timestamp columns that must be fully
# populated on every row, positive or negative.
id_cols = [args.user_col, args.item_col, "user_indice", "item_indice"]
key_cols = id_cols + ["item_sequence", args.rating_col, args.timestamp_col]

has_missing_key = full_df[key_cols].isna().to_numpy().any()
assert not has_missing_key, "Null values found at key colums"

In [17]:
# Cut-off computed earlier as the max training timestamp + 1, shown again before the split.
val_timestamp

1628641686215

# Split back into train and validation

In [18]:
# Rows stamped before the cut-off return to train; rows at or after it go to validation.
event_ts = full_df[args.timestamp_col]
before_cutoff = event_ts.lt(val_timestamp)
from_cutoff = event_ts.ge(val_timestamp)

train_neg_df = full_df.loc[before_cutoff]
val_neg_df = full_df.loc[from_cutoff]

In [19]:
# Rows of full_df whose timestamp is below val_timestamp.
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
34842,AH377TAQBGVIUD75XIOSNRRBUMSA,B000OIZSLE,0.0,1603811722410,3520,5441,"[-1.0, -1.0, -1.0, -1.0, 4238.0, 3266.0, 4316...."
130704,AGPAMWVXLFFWCBO3J4JNOYBB5CJQ,B01CXE9Q8C,4.0,1520074315327,6133,2684,"[-1.0, -1.0, -1.0, -1.0, -1.0, 2863.0, 5665.0,..."
155807,AGSXDUEY3XZJVJSRBQCTAPOSY2NA,1451681755,0.0,1446855213000,15471,3848,"[-1.0, -1.0, -1.0, 2423.0, 5224.0, 6745.0, 360..."
17264,AECGNMOCHNIEKROWI6NCZQE7QV3A,B001MSMULG,0.0,1412051625000,1717,4696,"[241.0, 7334.0, 3433.0, 7247.0, 1840.0, 5215.0..."
131576,AHJ4X46OBBFQFQEGKO6CYQSL7A6Q,0441016995,0.0,1540233594357,13004,4750,"[3132.0, 4886.0, 3905.0, 2187.0, 3951.0, 6240...."
...,...,...,...,...,...,...,...
52966,AEEAI7QJ6HFCN43V543MOTKNBQOA,B0043M6L22,0.0,1528288422975,5335,7093,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 4528.0, 5..."
19064,AGMWE3EQOAKN467EMLZFXS5FD7FQ,0525577947,0.0,1364849772000,1895,5383,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 5802.0, 6..."
125474,AGBD2QKG2VULRDA4OKNJZUII44JA,B00UXX5BAS,0.0,1313415114000,12423,4075,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 287..."
132003,AGQ3G5TPEQV5AF4UPHKKCPK4C27Q,B00AEDDSZW,4.0,1381858732000,10563,1139,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."


In [20]:
# Rows of full_df whose timestamp is at or above val_timestamp.
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
0,AE23RLRV25THT7OZM4T4ZJ4BMYCA,0062409212,5.0,1646772001708,7581,6271,"[-1, -1, -1, -1, -1, 7353, 4162, 2974, 6055, 3..."
460,AEKQREM4SYKU6HK2CSMWOYXNMCWQ,B00EA8EO00,5.0,1650134360640,7563,1049,"[-1, -1, -1, -1, 5434, 4443, 6758, 3012, 638, ..."
14585,AFGTSYTWRYDCEA7RIDXTBXVOOFFQ,B0141B48W4,0.0,1635269638002,1449,3058,"[4666, 466, 1728, 4963, 4653, 3053, 4442, 3927..."
71,AE4YFHNIXVBWI2V4DRU5UFSTY4RQ,B08ZM7BQ5J,4.0,1630002488344,674,3068,"[7119, 6518, 1870, 346, 6276, 3151, 1624, 42, ..."
2079,AGILJFVEHJX5W3Q7QYJZ7MRVU6ZQ,B093GVNHQV,5.0,1641012367793,14381,1286,"[6042, 1683, 6986, 1244, 4861, 6332, 4637, 768..."
...,...,...,...,...,...,...,...
325,AEFQQOMB5AYGR2FVV6X5OP5Y7VTQ,B00KIZQG96,5.0,1657081631273,19644,1537,"[6287, 6371, 184, 465, 7223, 5882, 3145, 6059,..."
187725,AH4ARLONPDSCFEVCUQZK6Z5EF72Q,0763655988,0.0,1657136765901,18673,4934,"[3925, 6168, 2698, 4219, 2720, 7090, 2984, 659..."
2214,AGKILHN37242OQLPSMAMMZJA6IAQ,B07CRC52VH,3.0,1653587733450,3136,1407,"[733, 6782, 1923, 4932, 2369, 4831, 5227, 4573..."
165031,AHVI6MSMQ543OPJLYDQEHCUTGAEA,1594484465,0.0,1634219127106,16414,4627,"[4836, 3373, 536, 6701, 3047, 2679, 5048, 5620..."


# Checks

In [21]:
# Pick one validation user (seeded, so a re-run inspects the same user) and
# check that exactly half of their training rows have a rating above zero,
# i.e. positives and negative samples are balanced.
user = val_neg_df.sample(n=1, random_state=args.random_seed)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[train_neg_df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() == check_df.shape[0] / 2
), "Number of pos and neg samples are not equal"

[32m2025-03-08 20:32:19.171[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AGUSIGQKZO6IIDEQNM4U7GHFTBWA...[0m


In [22]:
# Validation rows of the user sampled in the previous cell, in time order.
is_checked_user = val_neg_df[args.user_col].eq(user)
val_check_df = val_neg_df.loc[is_checked_user].sort_values(args.timestamp_col)

# Take the first row with a rating above zero and count how often its item
# shows up in the training split (positives and negatives alike).
rated_rows = val_check_df.loc[val_check_df[args.rating_col].gt(0)]
item = rated_rows[args.item_col].values[0]
logger.info(f"Checking item {item}...")
train_rows_with_item = train_neg_df[args.item_col].eq(item).sum()
assert (
    train_rows_with_item > 5
), f"Item {item} does not appear much in training data"

[32m2025-03-08 20:32:19.185[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B07P32NGR6...[0m


## Random eye-ball

In [23]:
# Training rows for the sampled user, ordered by timestamp.
check_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
47923,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B07P1SSF9S,0.0,1441469202000,4810,167,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
139174,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B00L9B7IKE,5.0,1441469202000,4810,3468,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
139175,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B00IB5BSBG,5.0,1444340727000,4810,6801,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
47924,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B01L1CEZ6K,0.0,1444340727000,4810,4203,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
47925,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B01GYPY7VC,0.0,1459773506000,4810,432,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
139176,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B0141ZP33S,4.0,1459773506000,4810,2989,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
47926,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,0385349580,0.0,1504998515783,4810,6029,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 346..."
139177,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B01MQ2HAAJ,4.0,1504998515783,4810,4523,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 346..."
139178,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B077YQH18P,5.0,1537217539409,4810,1013,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3468.0, 6..."
47927,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B000FC1MM8,0.0,1537217539409,4810,5265,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 3468.0, 6..."


In [27]:
# Lift the column-width limit for this one display so item_sequence is shown in full.
untruncated_columns = pd.option_context("display.max_colwidth", None)
with untruncated_columns:
    display(val_check_df)

Unnamed: 0,user_id,parent_asin,rating,timestamp,user_indice,item_indice,item_sequence
47930,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B000FBFM3E,0.0,1630015006067,4810,1771,"[-1, -1, -1, 3468, 6801, 2989, 4523, 1013, 172, 806]"
2536,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B07P32NGR6,4.0,1630015006067,4810,590,"[-1, -1, -1, 3468, 6801, 2989, 4523, 1013, 172, 806]"
2537,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B08DFSR14S,5.0,1644731384019,4810,5841,"[-1, -1, 3468, 6801, 2989, 4523, 1013, 172, 806, 590]"
47931,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B007TJ55YG,0.0,1644731384019,4810,2622,"[-1, -1, 3468, 6801, 2989, 4523, 1013, 172, 806, 590]"
47932,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,0375706852,0.0,1651288792553,4810,3091,"[-1, 3468, 6801, 2989, 4523, 1013, 172, 806, 590, 5841]"
2538,AGUSIGQKZO6IIDEQNM4U7GHFTBWA,B07HQ5S1NW,5.0,1651288792553,4810,6752,"[-1, 3468, 6801, 2989, 4523, 1013, 172, 806, 590, 5841]"


# Persist

In [25]:
# Write the combined positives + negatives frame to parquet, leaving out the index.
full_df.to_parquet("../data/full_features_neg_sampling_df.parquet", index=False)

In [26]:
# Write each split to its own parquet file, leaving out the index.
for split_frame, split_path in (
    (train_neg_df, "../data/train_features_neg_df.parquet"),
    (val_neg_df, "../data/val_features_neg_df.parquet"),
):
    split_frame.to_parquet(split_path, index=False)