In [None]:
import os
from copy import deepcopy
import joblib

import nltk
import numpy as np
import pandas as pd
import lightgbm as lgbm 
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score

from utils.load import load_df
from utils.preprocess import scoring_function, cosine_sim, preprocess_df

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None


# Numeric relevance grade for each ESCI label (higher means more relevant).
# XXX: open questions -- use smoothed labels? use scaled scoring?
SCORE_MAP = {
    "E": 3.0,  # exact
    "S": 2.0,  # substitute
    "C": 1.0,  # complementary
    "I": 0.0,  # irrelevant
}


In [2]:
def pipeline(df: pd.DataFrame
             )-> lgbm.LGBMRanker:
    """Build features from the joined query/product frame, train a ranker, and save it.

    Steps, in order:
      1. Run ``preprocess_df`` over the text columns and sort the rows by
         ``query_id`` so every query forms one contiguous run.
      2. Map ``esci_label`` to a numeric ``relevance`` column through
         ``scoring_function`` and ``SCORE_MAP``.
      3. Embed queries and titles with a TF-IDF vocabulary fitted on the
         queries, then derive ``query_title_sim`` with ``cosine_sim``.
      4. Add token-count features and a label-encoded brand feature.
      5. Split on the ``split`` column, fit an ``LGBMRanker``, print the mean
         per-query NDCG on the test rows, and dump the model to
         ``ranker_model.pkl``.

    Args:
        df: frame containing at least the ``query``, ``query_id``,
            ``product_title``, ``product_description``, ``product_brand``,
            ``esci_label`` and ``split`` columns.

    Returns:
        The fitted ``lgbm.LGBMRanker``.

    Raises:
        ValueError: if no row has ``split == 'train'``.
    """
    df = preprocess_df(df, columns=["query", "product_description", "product_title", "product_brand"]).copy()

    # LightGBM's `group` argument is a list of consecutive run lengths, so the
    # rows of one query must be adjacent and in the same order as the sizes.
    # A stable sort keeps the original within-query row order.
    df = df.sort_values("query_id", kind="stable").reset_index(drop=True)
    print(df.head())

    df["relevance"] = df["esci_label"].apply(scoring_function,
                                             args=(SCORE_MAP,))

    # The vocabulary is fitted on the queries only; titles are projected onto it.
    vectorizer = TfidfVectorizer(max_features=100)
    df["query_embed"] = list(vectorizer.fit_transform(df["query"]).toarray())
    df["title_embed"] = list(vectorizer.transform(df["product_title"]).toarray())

    print(df.head())

    df["query_title_sim"] = df.apply(cosine_sim, axis=1)

    df["query_len"] = df["query"].map(lambda x: len(x.split()))
    df["title_len"] = df["product_title"].map(lambda x: len(x.split()))
    df["brand_labelenc"] = LabelEncoder().fit_transform(df["product_brand"].fillna("unknown"))

    features = ["query_title_sim", "query_len", "title_len", "brand_labelenc"]
    x = df[features]
    y = df["relevance"]

    train_mask = df["split"] == "train"
    val_mask = df["split"] == "val"
    test_mask = df["split"] == "test"

    x_train, y_train = x[train_mask], y[train_mask]
    x_val, y_val = x[val_mask], y[val_mask]
    x_test, y_test = x[test_mask], y[test_mask]

    if len(x_train) == 0:
        raise ValueError("pipeline: no rows with split == 'train' to fit on")

    # sort=False keeps the sizes in row order (the frame is already sorted).
    train_groups = df[train_mask].groupby("query_id", sort=False).size().to_list()

    lgbm_ranker = lgbm.LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        boosting_type="gbdt",
        n_estimators=200,
        learning_rate=0.05,
    )

    fit_kwargs = {"group": train_groups}
    if len(x_val) > 0:
        # Only attach a validation set when the frame actually has one.
        val_groups = df[val_mask].groupby("query_id", sort=False).size().to_list()
        fit_kwargs.update(eval_metric="ndcg",
                          eval_set=[(x_val, y_val)],
                          eval_group=[val_groups])
    lgbm_ranker.fit(x_train, y_train, **fit_kwargs)

    if len(x_test) > 0:
        y_test_pred = lgbm_ranker.predict(x_test)
        ndcg_score_test = _mean_ndcg_per_query(df.loc[test_mask, "query_id"],
                                               y_test,
                                               y_test_pred)
    else:
        ndcg_score_test = float("nan")
    print("NDCG on test: ", ndcg_score_test)

    joblib.dump(lgbm_ranker, "ranker_model.pkl")
    return lgbm_ranker


def _mean_ndcg_per_query(query_ids, y_true, y_pred) -> float:
    """Average ``ndcg_score`` over queries; NaN when no query can be scored.

    ``ndcg_score`` expects 2-D ``(n_queries, n_documents)`` input, so each
    query is scored as its own single-row problem. Queries with fewer than two
    rows are skipped because a ranking of one document carries no information
    (recent scikit-learn releases raise on such input).
    """
    scored = pd.DataFrame({
        "query_id": np.asarray(query_ids),
        "y_true": np.asarray(y_true),
        "y_pred": np.asarray(y_pred),
    })
    per_query = [
        ndcg_score([rows["y_true"].to_numpy()], [rows["y_pred"].to_numpy()])
        for _, rows in scored.groupby("query_id", sort=False)
        if len(rows) > 1
    ]
    return float(np.mean(per_query)) if per_query else float("nan")

In [3]:

def main(**kwargs):
    """Load the shopping-queries parquet files, join them, and run ``pipeline``.

    The examples and products tables are left-joined on
    ``(product_locale, product_id)``, restricted to rows with
    ``small_version == 1`` and ``product_locale == "us"``, and the leading
    rows of the result are handed to ``pipeline``.

    Keyword Args:
        root: directory containing ``dataset_examples.parquet`` and
            ``dataset_products.parquet``. Defaults to
            ``<current working directory>/shopping_queries``.
        n_rows: number of leading rows passed to ``pipeline``. Defaults to
            100; ``None`` passes every filtered row.
    """
    root = kwargs.get("root", os.path.join(os.getcwd(), "shopping_queries"))
    n_rows = kwargs.get("n_rows", 100)

    examples = os.path.join(root, "dataset_examples.parquet")
    products = os.path.join(root, "dataset_products.parquet")
    df_examples = load_df(examples, "parquet")
    df_products = load_df(products, "parquet")

    # Join examples and products on locale + product id.
    df_joined = pd.merge(df_examples,
                         df_products,
                         how="left",
                         on=["product_locale", "product_id"])

    keep = (df_joined["small_version"] == 1) & (df_joined["product_locale"] == "us")
    df_joined_small = df_joined[keep]

    print(df_joined_small.columns)

    # Positional slice: the first `n_rows` rows (all rows when n_rows is None).
    pipeline(df_joined_small.iloc[:n_rows])


In [4]:
# Column notes for the joined frame.
# Can be dropped once processing has used them:
#   1. locale: us
#   2. small_version: 1
#   3. large_version: 0
#   4. split: train, val, test
# Can be dropped immediately, because their effect on the data is unclear:
#   product_bullet_point, product_color
# Columns that remain:
#   query, product_description, product_title, product_brand, esci_label
#   i.e. the query, 3 product features, and the relevance label

main()  

Index(['example_id', 'query', 'query_id', 'product_id', 'product_locale',
       'esci_label', 'small_version', 'large_version', 'split',
       'product_title', 'product_description', 'product_bullet_point',
       'product_brand', 'product_color'],
      dtype='object')
!awnmower tires without rims
!awnmower tires without rims
!awnmower tires without rims


NameError: name 'quit' is not defined