In [1]:
import os

import numpy as np
import pandas as pd
import re

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, ndcg_score
import lightgbm as lgbm

pd.options.mode.chained_assignment = None


# XXX: use smoothed labels?, use scaled scoring?
SCORE_MAP = {"Exact": 3,  # exact
           "Substitute": 2,  # substitute
           "Complement": 1,  # complementary
           "Irrelevant": 0}  # irrelevant


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_text(txt: str) -> str:
    if txt == "":
        return txt
    txt = txt.lower()
    txt = re.sub(r"[^a-z0-9\s]", "", txt)  # remove punc
    return txt.strip()

In [3]:
# work on test samples
cols = ["query", "product_title", "product_description", "product_brand", "product_color"]  # product_text is already a combined version
df_folder = "formatted_esci"
train_filenames = [f for f in os.listdir(df_folder) if f.startswith("train")]
test_filenames = [f for f in os.listdir(df_folder) if f.startswith("test")]


In [4]:
def load_df(filenames: str) -> pd.DataFrame:
    dfs = []
    for f in filenames:
        df = pd.read_parquet(f"{df_folder}/{f}")
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [5]:
df = load_df(train_filenames[:])
df_test = load_df(test_filenames[:])

len(df[df.columns[0]]), len(df_test[df_test.columns[0]])

(2027874, 652490)

In [6]:
for col in cols:
    df[col] = df[col].fillna("").apply(preprocess_text)

df["combined"] = df[cols].apply(lambda r: " ".join(r), axis=1)
df["labels"] = df["esci_label"].map(SCORE_MAP)


In [7]:
# feature ext.

# 1. either tfidf
vec = TfidfVectorizer(max_features=8000)
x = vec.fit_transform(df["combined"])

# 2. or transformer-based embedding
# transformer = SentenceTransformer("CHOOSE ONE HERE")
# x = transformer.encode(df["combined"].tolist(), convert_to_numpy=True)

In [8]:
group_kfold = GroupKFold(n_splits=5)
folds = list(group_kfold.split(x, df["labels"], df["query_id"]))

In [9]:
def get_group_sizes(df_subset):
    """ lgbm requires this """
    return df_subset.groupby('query_id').size().tolist()

In [10]:
len(df[df.columns[0]])

2027874

In [11]:
val_idx = folds[0][1]

train_idx = np.concatenate(folds[1:][0])

In [12]:
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

x_train = x[train_idx]
x_val = x[val_idx]


In [13]:
groups_train = get_group_sizes(train_df)
groups_val = get_group_sizes(val_df)

In [14]:
lgbm_train_set = lgbm.Dataset(x_train, train_df["labels"], group=groups_train)
lgbm_val_set = lgbm.Dataset(x_val, val_df["labels"], group=groups_val)

lgbm_model_params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "learning_rate": 0.1,
    "verbose": -1,
}
results_dict = {}

In [15]:
def formatted_ndcg(preds, eval_data):
    return "ndcg", ndcg_score(eval_data.get_label(), preds), True

In [41]:
model = lgbm.train(lgbm_model_params, lgbm_train_set, valid_sets=[lgbm_val_set], num_boost_round=1000)


In [42]:
model.best_iteration

0

In [43]:
def calculate_ndcg(df, preds, query_group_col="query_id"):
    scores = []
    for qid, group in df.groupby(query_group_col):
        t = group["labels"].values.reshape(1, -1)
        p = preds[group.index].reshape(1, -1)
        
        # single result?
        if t.shape[1] > 1:
            scores.append(ndcg_score(t, p, k=3))
    return np.mean(scores) if scores else 0.0
        

In [44]:
preds = model.predict(x_val, model.best_iteration)

In [45]:
test_f1 = f1_score(val_df["labels"], np.round(preds), average="macro")
test_acc = accuracy_score(val_df["labels"], np.round(preds))
test_precision = precision_score(val_df["labels"], np.round(preds), average="macro")
test_recall = recall_score(val_df["labels"], np.round(preds), average="macro")

test_f1, test_acc, test_precision, test_recall

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.023205346216754758,
 0.0722107022834135,
 0.013996284619595916,
 0.108182176983583)

In [46]:
test_ndcg = calculate_ndcg(val_df, preds)
print(test_ndcg)

0.8935062465710224
