In [1]:
import os

import numpy as np
import pandas as pd
import re

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, ndcg_score
import lightgbm as lgbm

pd.options.mode.chained_assignment = None


# XXX: use smoothed labels?, use scaled scoring?
SCORE_MAP = {"Exact": 3.0,  # exact
           "Substitute": 2.0,  # substitute
           "Complement": 1.0,  # complementary
           "Irrelevant": 0.0}  # irrelevant


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
def preprocess_text(txt: str) -> str:
    if txt == "":
        return txt
    txt = txt.lower()
    txt = re.sub(r"[^a-z0-9\s]", "", txt)  # remove punc
    return txt.strip()

In [None]:
# work on test samples
cols = ["query", "product_title", "product_description", "product_brand", "product_color"]  # product_text is already a combined version
df = pd.read_parquet("formatted_esci/train-00000-of-00011.parquet")

In [16]:
for col in cols:
    df[col] = df[col].fillna("").apply(preprocess_text)

df["combined"] = df[cols].apply(lambda r: " ".join(r), axis=1)
df["labels"] = df["esci_label"].map(SCORE_MAP)


In [None]:
# feature ext.

# 1. either tfidf
vec = TfidfVectorizer(max_features=5000)
x = vec.fit_transform(df["combined"])

# 2. or transformer-based embedding
# transformer = SentenceTransformer("CHOOSE ONE HERE")
# x = transformer.encode(df["combined"].tolist(), convert_to_numpy=True)

In [18]:
group_kfold = GroupKFold(n_splits=5)
folds = list(group_kfold.split(x, df["labels"], df["query_id"]))

In [19]:
def get_group_sizes(df_subset):
    """ lgbm requires this """
    return df_subset.groupby('query_id').size().tolist()

In [22]:
test_idx = folds[0][1]
val_idx = folds[1][1]

train_idx = np.concatenate(folds[2:][0])

In [24]:
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

x_train = x[train_idx]
x_val = x[val_idx]
x_test = x[test_idx]


In [25]:
groups_train = get_group_sizes(train_df)
groups_val = get_group_sizes(val_df)

In [28]:
lgbm_train_set = lgbm.Dataset(x_train, train_df["labels"], group=groups_train)
lgbm_val_set = lgbm.Dataset(x_val, val_df["labels"], group=groups_val)

lgbm_model_params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "learning_rate": 0.1,
    "verbose": -1,
}
results_dict = {}

In [34]:
def formatted_ndcg(preds, eval_data):
    return "ndcg", ndcg_score(eval_data.get_label(), preds), True

In [37]:
model = lgbm.train(lgbm_model_params, lgbm_train_set, valid_sets=[lgbm_val_set], num_boost_round=1000)


In [39]:
model.best_iteration

0

In [41]:
def calculate_ndcg(df, preds, query_group_col="query_id"):
    scores = []
    for qid, group in df.groupby(query_group_col):
        t = group["labels"].values.reshape(1, -1)
        p = preds[group.index].reshape(1, -1)
        
        # single result?
        if t.shape[1] > 1:
            scores.append(ndcg_score(t, p))
    return np.mean(scores) if scores else 0.0
        

In [42]:
preds = model.predict(x_test, model.best_iteration)

In [43]:
test_ndcg = calculate_ndcg(test_df, preds)
print(test_ndcg)

0.5702137487843737


In [None]:
# can be dropped after process:
# 1. locale: us, 
# 2. small_version: 1,
# 3. large_version: 0,
# 4. split: train, val, test
# can be dropped instant cus not sure about its effect on the data
# product_bullet_point, product_color
# What is left:::
# query, product_description, product_title, product_brand, esci_label
# query, 3 features, label-relevancy


Index(['example_id', 'query', 'query_id', 'product_id', 'product_locale',
       'esci_label', 'small_version', 'large_version', 'split',
       'product_title', 'product_description', 'product_bullet_point',
       'product_brand', 'product_color'],
      dtype='object')
!awnmower tires without rims
!awnmower tires without rims
!awnmower tires without rims


NameError: name 'quit' is not defined