In [1]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
# List the dataset directory to check which files are present before loading.
! ls /Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset/

shopping_queries_dataset_examples.parquet
shopping_queries_dataset_products.parquet
shopping_queries_dataset_sources.csv


In [3]:
# Directory that holds the shopping-queries dataset files (absolute local path).
dataset_path = '/Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset/'

In [4]:
# !ls ../esci-data/shopping_queries_dataset

In [5]:
# ! pwd

In [6]:
# 0. Init variables: column names and constants used by the cells below.

# Query-side columns.
col_query, col_query_id = 'query', 'query_id'

# Product-side columns.
col_product_id, col_product_locale = 'product_id', 'product_locale'
col_product_title, col_product_color = 'product_title', 'product_color'
col_product_description, col_product_bullet_point = 'product_description', 'product_bullet_point'

# Label and bookkeeping columns.
col_esci_label, col_gain = 'esci_label', 'gain'
col_small_version, col_split = 'small_version', 'split'

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Numeric gain assigned to each ESCI label letter.
esci_label2gain = dict(E=1.0, S=0.1, C=0.01, I=0.0)

In [7]:
# Display the full path of the examples parquet file built from dataset_path.
os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet')

'/Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet'

In [8]:
# ! ls /Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset/


    shopping_queries_dataset_examples.parquet contains the following columns : example_id, query, query_id, product_id, product_locale, esci_label, small_version, large_version, split
    shopping_queries_dataset_products.parquet contains the following columns : product_id, product_title, product_description, product_bullet_point, product_brand, product_color, product_locale
    shopping_queries_dataset_sources.csv contains the following columns : query_id, source


In [None]:
# columns = 'example_id, query, query_id, product_id, product_locale, esci_label, small_version, large_version, split'
# columns.split(",")

In [9]:
# 1. Load data
dataset_path = "/Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset"
n_dev_queries = 200  # default from the script

# Read the examples table and the products table from their parquet files.
df_examples, df_products = (
    pd.read_parquet(os.path.join(dataset_path, parquet_name))
    for parquet_name in (
        'shopping_queries_dataset_examples.parquet',
        'shopping_queries_dataset_products.parquet',
    )
)

In [10]:
# Left-join the product attributes onto every example row; the join key is the
# (locale, product id) pair, which carries the same column names on both sides.
df_examples_products = df_examples.merge(
    df_products,
    how='left',
    on=[col_product_locale, col_product_id],
)

In [11]:
# Preview the first merged rows, transposed so each column is shown as a row.
df_examples_products.head().T

Unnamed: 0,0,1,2,3,4
example_id,0,1,2,3,4
query,revent 80 cfm,revent 80 cfm,revent 80 cfm,revent 80 cfm,revent 80 cfm
query_id,0,0,0,0,0
product_id,B000MOO21W,B07X3Y6B1V,B07WDM7MQQ,B07RH6Z8KW,B07QJ7WYFQ
product_locale,us,us,us,us,us
esci_label,I,E,E,E,E
small_version,0,0,0,0,0
large_version,1,1,1,1,1
split,train,train,train,train,train
product_title,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,Homewerks 7141-80 Bathroom Fan Integrated LED ...,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,Delta Electronics RAD80L BreezRadiance 80 CFM ...,Panasonic FV-08VRE2 Ventilation Fan with Reces...


In [12]:
locale = 'us'

# Keep only the small-version training rows of the chosen locale, using one
# combined mask instead of three successive re-assignments.
is_selected = (
    (df_examples_products[col_small_version] == 1)
    & (df_examples_products[col_split] == "train")
    & (df_examples_products[col_product_locale] == locale)
)
# The explicit .copy() makes the result an independent frame, so adding the
# gain column below does not write into a slice of the merged frame (the
# situation pandas reports as SettingWithCopyWarning).
df_examples_products = df_examples_products.loc[is_selected].copy()

# Translate every ESCI label into its numeric gain. A label that is absent
# from esci_label2gain would map to NaN, so fail loudly instead of training
# on NaN targets.
df_examples_products[col_gain] = df_examples_products[col_esci_label].map(esci_label2gain)
assert df_examples_products[col_gain].notna().all(), "esci_label value missing from esci_label2gain"

In [13]:
import gc

# Drop the references to the two source frames now that they have been merged,
# then run a garbage-collection pass (its return value is the cell's output).
del df_examples, df_products
gc.collect()

925

In [14]:
random_state = 42
list_query_id = df_examples_products[col_query_id].unique()

# Hold out exactly n_dev_queries queries for the dev split. An integer
# test_size is taken as an absolute count; the previous fraction
# n_dev_queries / len(list_query_id) is multiplied back and rounded up by
# scikit-learn, so floating-point error could yield one query too many.
list_query_id_train, list_query_id_dev = train_test_split(
    list_query_id,
    test_size=n_dev_queries,
    random_state=random_state,
)

In [15]:
# Narrow the frame to the columns used for training and evaluation.
df_examples_products = df_examples_products.loc[
    :,
    [
        col_query_id,
        col_query,
        col_product_title,
        col_product_description,
        col_product_bullet_point,
        col_gain,
    ],
]

# Route each row to the train or dev frame according to which list its
# query id belongs to.
df_train, df_dev = (
    df_examples_products.loc[df_examples_products[col_query_id].isin(split_query_ids)]
    for split_query_ids in (list_query_id_train, list_query_id_dev)
)

In [16]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,query_id,query,product_title,product_description,product_bullet_point,gain
16,1,!awnmower tires without rims,"RamPro 10"" All Purpose Utility Air Tires/Wheel...","<b>About The Ram-Pro All Purpose Utility 10"" A...",✓ The Ram-Pro Ten Inch ready to install Air Ti...,0.0
17,1,!awnmower tires without rims,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,Please check your existing tire Sidewall for t...,1.0
18,1,!awnmower tires without rims,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,,[QUALITY]: Hardened Steel-Iron construction wi...,0.0
19,1,!awnmower tires without rims,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,"Tire Size: 13 x 5.00 - 6 Axle: 3/4"" inside dia...",,0.1
20,1,!awnmower tires without rims,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,No fuss. Just take off your old assembly and r...,Tire size:15x6.00-6 Ply: 4 Tubeless\n6x4.5 Whe...,1.0


In [20]:
# Remove the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [21]:
# Inspect the bullet-point text of the training rows.
df_train['product_bullet_point']

16                                                                                                                                                                              ✓ The Ram-Pro Ten Inch ready to install Air Tire Wheels will be the best replacement for your hand truck (or anything similar) tires and wheels with easy installation and great performance.\n✓ SPECS: The 2 Ram-Pro Air Tires sizes are: High: 10" - Wide: 3" - Hole Diameter: 5/8" - Hub Depth: 1-3/4" with Double Sealed Bearings. Load Capacity: 300 lbs. (136 KG) - P.S.I. Rating: 30 per tire - Tube type: 2 ply 4.10/3.5-4.\n✓ QUALITY: The high quality heavy-duty rubber will last very long. The Air stem is on the outside so you can always inflate the tires easy if needed - The Double Sealed Bearings will evenly distribute your loud on your vehicle.\n✓ DESIGN: This air filled tires are designed with sporty look raised grips to drive the smoothest possible, and is the 4 bolt hole’s double rim tube type design for best perf

In [22]:
# Prefer Apple's MPS backend when this PyTorch build can actually use it.
# Otherwise fall back to CUDA or the CPU, so the notebook also runs on
# machines without MPS instead of failing when the model is moved to "mps".
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cuda" if torch.cuda.is_available() else "cpu"

In [23]:
# 2. Prepare data loaders
train_batch_size = 128
model_save_path = f"./models/task_1_ranking_model_product_texts/{locale}"


def _text_or_blank(value):
    """Return ``value`` when it is a non-empty string, otherwise a single space.

    A missing product field can show up as ``None`` or as a float NaN. NaN is
    truthy, so the former ``value or " "`` let it through and ``" ".join``
    then raised ``TypeError``. Checking the type covers both cases and also
    guards the title, which previously had no fallback at all.
    """
    return value if isinstance(value, str) and value else " "


# One InputExample per (query, product) row: the document text is the title,
# bullet points and description joined by spaces, and the label is the gain.
# Zipping the columns avoids building a Series per row as iterrows() does.
train_samples = [
    InputExample(
        texts=[
            query,
            " ".join(_text_or_blank(part) for part in (title, bullet_point, description)),
        ],
        label=float(gain),
    )
    for query, title, bullet_point, description, gain in zip(
        df_train[col_query],
        df_train[col_product_title],
        df_train[col_product_bullet_point],
        df_train[col_product_description],
        df_train[col_gain],
    )
]

# Shuffle every epoch; drop_last=True discards a final partial batch.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True)

In [25]:
# Drop the reference to the combined frame and run a garbage-collection pass
# (the returned count is the cell's output).
del df_examples_products
gc.collect()

502

In [26]:
# Build the dev set for the reranking evaluator: one entry per distinct query
# text, holding the set of positive titles (gain > 0) and the set of negative
# titles (everything else).
# NOTE(review): only the product title is used as candidate text here, whereas
# the training samples join title, bullet points and description. Confirm this
# difference is intended.
dev_samples = {}
query2id = {}
for query, title, gain in zip(df_dev[col_query], df_dev[col_product_title], df_dev[col_gain]):
    # Assign the next free integer id the first time a query text is seen.
    qid = query2id.setdefault(query, len(query2id))
    entry = dev_samples.setdefault(qid, {'query': query, 'positive': set(), 'negative': set()})
    bucket = 'positive' if gain > 0 else 'negative'
    entry[bucket].add(title)
evaluator = CERerankingEvaluator(dev_samples, name='train-eval')


In [None]:

# 3. Prepare Cross-encoder model:
#    https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_cross-encoder_kd.py
model_name = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
num_epochs = 1
num_labels = 1  # single output, trained against the gain with MSE below
max_length = 512
# Identity activation: the model output is used as the score without squashing.
default_activation_function = torch.nn.Identity()
model = CrossEncoder(
    model_name,
    num_labels=num_labels,
    max_length=max_length,
    default_activation_function=default_activation_function,
    device=device,
)
loss_fct = torch.nn.MSELoss()
# The evaluator is passed to fit() together with this step interval.
evaluation_steps = 5000
lr = 7e-6

# Derive the warm-up length from the real number of optimisation steps. The
# former fixed value of 5000 exceeded the whole run (the logged run shows 3247
# iterations for the single epoch), so the warm-up ramp never finished and the
# learning rate never reached `lr`. Warm up over a tenth of the run, capped at
# the previous 5000 for long runs.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = min(5000, total_train_steps // 10)

# 4. Train Cross-encoder model
model.fit(
    train_dataloader=train_dataloader,
    loss_fct=loss_fct,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=evaluation_steps,
    warmup_steps=warmup_steps,
    output_path=f"{model_save_path}_tmp",
    optimizer_params={'lr': lr},
)
model.save(model_save_path)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3247 [00:00<?, ?it/s]

In [None]:
# Completion marker: prints once execution reaches this cell.
print("done")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from tqdm import tqdm
import numpy as np

In [None]:
 """ 0. Init variables """
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id" 
col_product_title = "product_title"
col_product_locale = "product_locale"
col_small_version = "small_version"
col_split = "split"
col_scores = "scores"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

""" 1. Load data """    
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))

df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)
df_examples_products = df_examples_products[df_examples_products[col_small_version] == 1]
df_examples_products = df_examples_products[df_examples_products[col_split] == "test"]
df_examples_products = df_examples_products[df_examples_products[col_product_locale] == locale]

features_query = df_examples_products[col_query].to_list()
features_product = df_examples_products[col_product_title].to_list()
n_examples = len(features_query)
scores = np.zeros(n_examples)

In [None]:
model_path = model_save_path
batch_size = 256
if locale == "us":
    # 2. Prepare Cross-encoder model
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 3. Generate hypothesis
    model.eval()
    with torch.no_grad():
        for i in tqdm(range(0, n_examples, batch_size)):
            # Half-open slice [i, j) for this batch, clipped at the end of the data.
            j = min(i + batch_size, n_examples)
            features = tokenizer(
                features_query[i:j],
                features_product[i:j],
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            logits = model(**features).logits
            # Flatten explicitly so a final batch with a single row still gives
            # a 1-D array (np.squeeze collapsed that case to a 0-d scalar).
            # The former trailing `i = j` was dead code: range() rebinds `i`
            # on every iteration.
            scores[i:j] = logits.reshape(-1).detach().cpu().numpy()

In [None]:
""" 4. Prepare hypothesis file """
# Directory that receives the hypothesis output.
HYPOTHESIS_TASK1_PATH="./hypothesis"
# CSV file for the current locale's ranking inside that directory.
HYPOTHESIS_PATH_FILE=f"{HYPOTHESIS_TASK1_PATH}/task_1_ranking_model_baseline_{locale}.csv"

In [None]:
# Display the resolved output path.
HYPOTHESIS_PATH_FILE

In [None]:
# Create the output directory, including missing parents, and do nothing if it
# already exists. Pure Python replaces the `mkdir -p` shell call so the cell
# does not depend on a POSIX shell.
os.makedirs(HYPOTHESIS_TASK1_PATH, exist_ok=True)

In [None]:

# Pair every (query id, product id) row with its predicted score, on a fresh
# 0..n-1 index so the score array lines up by position.
df_hypothesis = (
    df_examples_products[[col_query_id, col_product_id]]
    .reset_index(drop=True)
    .assign(**{col_scores: scores})
    .sort_values(by=[col_query_id, col_scores], ascending=False)
)

# Write the ranking itself: only the id columns, in sorted order, no index.
df_hypothesis.loc[:, [col_query_id, col_product_id]].to_csv(
    HYPOTHESIS_PATH_FILE,
    sep=',',
    index=False,
)