In [1]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers import evaluation
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

In [2]:
!ls ../esci-data/shopping_queries_dataset

shopping_queries_dataset_examples.parquet
shopping_queries_dataset_products.parquet
shopping_queries_dataset_sources.csv


In [3]:
! pwd

/Users/rdubey/mysrc/random-stuff/nlp-transformers


In [4]:
dataset_path = "/Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset"

In [5]:
""" 0. Init variables """
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id" 
col_product_title = "product_title"
col_product_locale = "product_locale"
col_esci_label = "esci_label" 
col_product_description = "product_description"
col_small_version = "small_version"
col_split = "split"
col_gain = 'gain'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
esci_label2gain = {
    'E' : 1.0,
    'S' : 0.1,
    'C' : 0.01,
    'I' : 0.0,
}

In [6]:
os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet')

'/Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet'

In [7]:
! ls /Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset/

shopping_queries_dataset_examples.parquet
shopping_queries_dataset_products.parquet
shopping_queries_dataset_sources.csv



    shopping_queries_dataset_examples.parquet contains the following columns : example_id, query, query_id, product_id, product_locale, esci_label, small_version, large_version, split
    shopping_queries_dataset_products.parquet contains the following columns : product_id, product_title, product_description, product_bullet_point, product_brand, product_color, product_locale
    shopping_queries_dataset_sources.csv contains the following columns : query_id, source


In [8]:
""" 1. Load data """    
dataset_path = "/Users/rdubey/mysrc/random-stuff/esci-data/shopping_queries_dataset"
n_dev_queries = 200 # default from the script
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))


In [9]:
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)

In [10]:
df_examples_products.head().T

Unnamed: 0,0,1,2,3,4
example_id,0,1,2,3,4
query,revent 80 cfm,revent 80 cfm,revent 80 cfm,revent 80 cfm,revent 80 cfm
query_id,0,0,0,0,0
product_id,B000MOO21W,B07X3Y6B1V,B07WDM7MQQ,B07RH6Z8KW,B07QJ7WYFQ
product_locale,us,us,us,us,us
esci_label,I,E,E,E,E
small_version,0,0,0,0,0
large_version,1,1,1,1,1
split,train,train,train,train,train
product_title,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,Homewerks 7141-80 Bathroom Fan Integrated LED ...,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,Delta Electronics RAD80L BreezRadiance 80 CFM ...,Panasonic FV-08VRE2 Ventilation Fan with Reces...


In [11]:
locale = 'us'
df_examples_products = df_examples_products[df_examples_products[col_small_version] == 1]
df_examples_products = df_examples_products[df_examples_products[col_split] == "train"]
df_examples_products = df_examples_products[df_examples_products[col_product_locale] == locale]
df_examples_products[col_gain] = df_examples_products[col_esci_label].apply(lambda esci_label: esci_label2gain[esci_label])

In [12]:
import re
def clean_text_column(x):
    x = x.lower() # lowercase everything
    x = ' '.join(x.strip().splitlines())
    x = x.encode('ascii', 'ignore').decode()  # remove unicode characters
    x = re.sub(r'https*\S+', ' ', x) # remove links
    x = re.sub(r'http*\S+', ' ', x)

    # cleaning up text
    x = re.sub(r'\'\w+', '', x) 
    
    x = re.sub(r'\w*\d+\w*', '', x)
    
    x = re.sub(r'\s{2,}', ' ', x)
    
    x = re.sub(r'\s[^\w\s]\s', '', x)
#     print(x)
    x = remove_html_tags(x)
    return x

def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [13]:
df_examples_products_clean = df_examples_products.copy()
df_examples_products_clean['query'] = df_examples_products_clean['query'].apply(clean_text_column) 
df_examples_products_clean['product_title'] = df_examples_products_clean['product_title'].apply(clean_text_column) 

In [14]:
random_state = 42
list_query_id = df_examples_products_clean[col_query_id].unique()
dev_size = n_dev_queries / len(list_query_id)
list_query_id_train, list_query_id_dev = train_test_split(list_query_id, test_size=dev_size, random_state=random_state)

In [15]:
df_examples_products = df_examples_products_clean[[col_query_id, col_query, col_product_title, col_gain]]
df_train = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_train)]
df_dev = df_examples_products[df_examples_products[col_query_id].isin(list_query_id_dev)]

In [16]:
df_train.head()

Unnamed: 0,query_id,query,product_title,gain
16,1,!awnmower tires without rims,ramproall purpose utility air tires/wheels wit...,0.0
17,1,!awnmower tires without rims,maxauto -pack .- turf mower tractor tire with ...,1.0
18,1,!awnmower tires without rims,neikoinch steel tire spoon lever iron tool kit...,0.0
19,1,!awnmower tires without rims,.-- turf mower tractor tire with gray rim,0.1
20,1,!awnmower tires without rims,"(set of.- husqvarna/poulan tire wheel assy ."" ...",1.0


In [17]:
df_train.query_id.unique()

array([     1,      5,      6, ..., 128570, 130537, 130539])

In [18]:
df_dev.head()

Unnamed: 0,query_id,query,product_title,gain
8836,297,- withnot hole,stylemafiafuel line fuel pressure barbed push ...,1.0
8837,297,- withnot hole,measureman fuel line fuel pressure adapter bra...,0.0
8838,297,- withnot hole,/ npt to fittingdegree female adapter (steel)k...,0.0
8839,297,- withnot hole,"mwmnun /"" npt port -/"" steam tube adapter top ...",0.0
8840,297,- withnot hole,mwmnunfuel line push lock fuel pressure barbed...,0.0


In [59]:
df_sample = df_train[df_train.query_id.isin([1,5,6])]
print(df_sample.query_id.unique())
df_sample.head()

[1 5 6]


Unnamed: 0,query_id,query,product_title,gain
16,1,!awnmower tires without rims,ramproall purpose utility air tires/wheels wit...,0.0
17,1,!awnmower tires without rims,maxauto -pack .- turf mower tractor tire with ...,1.0
18,1,!awnmower tires without rims,neikoinch steel tire spoon lever iron tool kit...,0.0
19,1,!awnmower tires without rims,.-- turf mower tractor tire with gray rim,0.1
20,1,!awnmower tires without rims,"(set of.- husqvarna/poulan tire wheel assy ."" ...",1.0


In [60]:
# df_sample[df_sample.query_id==1][[col_product_title, col_gain]]

In [62]:
from itertools import product
import itertools
def my_func(grouping):
    query_id, grouped_df = grouping
    query_text = grouped_df['query'].tolist()[0]
    grouped_df.sort_values(col_gain, ascending=False, inplace=True)
    text_sorted_by_relevance = zip(
        grouped_df[col_gain].tolist(), grouped_df[col_product_title].tolist()
    )
    pos_neg_pair = list(
        itertools.filterfalse(
            lambda pair: pair[0][0]<=pair[1][0], product(text_sorted_by_relevance,repeat=2))
    )
    pos_neg_pair = [(t[0][1], t[1][1]) for t in pos_neg_pair]
#     rel = grouped_df[grouped_df.gain==1.0]['product_title'].tolist()
#     non_rel = grouped_df[grouped_df.gain<1.0]['product_title'].tolist()
#     pos_neg_pair = list(product(rel, non_rel))
#     print(pos_neg_pair)
    query = [query_text]*len(pos_neg_pair)
    pos, neg = map(list, zip(*pos_neg_pair))
    triples = []
    for q,p,n in zip(query, pos, neg):
#         triples.append((q,p,n))
        triples.append(InputExample(texts=[q, p, n]))
    return triples
#     return pd.DataFrame.from_dict({'query': query, 'pairs': pos_neg_pair})
groups = df_sample.groupby('query_id')
triplets = []
for g in groups:
    triplets.extend(my_func(g))
    break
# list(triplets)

In [65]:
# triplets

In [66]:
train_triplets = []
groups = df_train.groupby('query_id')
for g in groups:
    train_triplets.extend(my_func(g))

In [68]:
train_triplets[20].texts

['!awnmower tires without rims',
 'maxauto pcs .- lawn mower tire for garden tractors ridings,tubeless',
 'honda -- front wheels, (set of )']

In [76]:
device = "mps"
train_batch_size = 64
model_save_path = f"./models/task_1_ranking_model_triplet_v2/{locale}"

In [77]:
train_dataloader = DataLoader(train_triplets, shuffle=True, batch_size=train_batch_size)

In [80]:
from sentence_transformers import losses
from sentence_transformers import SentenceTransformer

model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id, device="mps")

# model_name = 'all-distilroberta-v1'
# model = SentenceTransformer(model_name, device=device)
train_loss = losses.TripletLoss(model=model)

In [81]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1) 

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/38611 [00:00<?, ?it/s]

In [83]:
model_save_path = './models/task_1_ranking_model_triplet_v3/us'

In [84]:
model.save(model_save_path)

In [22]:
groups = df_train.groupby('query_id')
train_samples = []
for query_id, group_df in groups:
    query_text = group_df[col_query].iloc[0]
#     print(group_df)
    pos_rows = group_df[group_df[col_gain]==1.0]
    pos_text = pos_rows[col_product_title].tolist()
    train_samples.append(
        InputExample(texts=[query_text], label=query_id)
    )
    for pt in pos_text:
        train_samples.append(
            InputExample(texts=[pt], label=query_id)
        )
#     break
len(train_samples)

200769

In [23]:
groups = df_dev.groupby('query_id')
dev_samples = []
print(len(groups))
for query_id, group_df in groups:
    query_text = group_df[col_query].iloc[0]
#     print(group_df)
    pos_rows = group_df[group_df[col_gain]==1.0]
    pos_text = pos_rows[col_product_title].tolist()
    dev_samples.append(
        InputExample(texts=[query_text], label=query_id)
    )
    for i,row in group_df.iterrows():
        if row[col_gain] == 1.0:
            dev_samples.append(
                InputExample(texts=[row[col_product_title], row[col_gain]], label=query_id)
            )

#     for pt in pos_text:
#         dev_samples.append(
#             InputExample(texts=[pt, ], label=query_id)
#         )
#     break
len(dev_samples)

200


1938

In [None]:
# dev_samples[1].texts

In [None]:
# old strategy of creating triplets
# for (_, row) in df_train.iterrows():
#     if row[col_gain]==1.0:
#         train_samples.append(InputExample(texts=[row[col_product_title]], label=float(row[col_query_id])))
# len(train_samples)

In [None]:
# old strategy of creating triplets
# dev_samples = []
# for (_, row) in df_dev.iterrows():
#     if row[col_gain]==1.0:
#         dev_samples.append(InputExample(texts=[row[col_product_title]], label=float(row[col_query_id])))
# len(dev_samples)

In [24]:
from sentence_transformers.datasets import SentenceLabelDataset
train_data_sampler = SentenceLabelDataset(train_samples)
train_dataloader = DataLoader(train_data_sampler, batch_size=train_batch_size, drop_last=True)

In [25]:
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/other/training_batch_hard_trec.py
from collections import defaultdict
import random
def triplets_from_labeled_dataset(input_examples):
    # Create triplets for a [(label, sentence), (label, sentence)...] dataset
    # by using each example as an anchor and selecting randomly a
    # positive instance with the same label and a negative instance with a different label
    triplets = []
    label2sentence = defaultdict(list)
    for inp_example in input_examples:
        label2sentence[inp_example.label].append(inp_example)

    for i,inp_example in enumerate(input_examples):
        
        anchor = inp_example

        if len(label2sentence[inp_example.label]) < 2: #We need at least 2 examples per label to create a triplet
            continue

        positive = None
        count=0
        while positive is None or positive.texts[0] == anchor.texts[0]:
            count += 1
            positive = random.choice(label2sentence[inp_example.label])
            if count > 100:
                print(i, inp_example)
                break
                
        negative = None
        count = 0
        while negative is None or negative.texts[0] == anchor.texts[0]:
            count += 1
            if count > 100:
                print(i, inp_example)
                break
            negative = random.choice(input_examples)

        triplets.append(InputExample(texts=[anchor.texts[0], positive.texts[0], negative.texts[0]]))

    return triplets
dev_triplets = triplets_from_labeled_dataset(dev_samples)

In [26]:
len(dev_triplets)

1938

In [27]:
dev_triplets[0].texts

['- withnot hole',
 'stylemafiafuel line fuel pressure barbed push lock t-fitting /- npt sensor port,/ hose clamp stainless steel fits hose withid inner diameter includes pse clamps',
 'underwater kinetics eled () dive light, safety yellow']

In [33]:
device='cpu'

In [42]:
model_name = 'all-distilroberta-v1'
model = SentenceTransformer(model_name, device=device)

In [43]:
from sentence_transformers.losses import BatchHardTripletLossDistanceFunction
train_loss = losses.BatchAllTripletLoss(
    model=model, distance_metric=BatchHardTripletLossDistanceFunction.cosine_distance
)

In [44]:
from sentence_transformers.evaluation import TripletEvaluator
dev_evaluator = TripletEvaluator.from_input_examples(dev_triplets, name='dev-set')

In [45]:
print("Performance before fine-tuning:")
dev_evaluator(model)

Performance before fine-tuning:


0.9607843137254902

In [49]:
warmup_steps = int(len(train_samples) * num_epochs  * 0.1)  # 10% of train data
warmup_steps, len(train_samples)

(20076, 200769)

In [50]:
num_epochs = 1
warmup_steps = 5000 # int(len(train_dataloader) * num_epochs  * 0.2)  # 10% of train data

# Train the model
lr = 1e-6
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=f"{model_save_path}_tmp",
    optimizer_params={'lr': lr},
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3137 [00:00<?, ?it/s]

In [51]:
model_save_path

'./models/task_1_ranking_model_triplet_v1/us'

In [52]:
model.save(model_save_path)

In [53]:
print("Performance after fine-tuning:")
dev_evaluator(model)

Performance after fine-tuning:


0.9607843137254902

In [85]:
 """ 0. Init variables """
import numpy as np
col_query = "query"
col_query_id = "query_id"
col_product_id = "product_id" 
col_product_title = "product_title"
col_product_locale = "product_locale"
col_small_version = "small_version"
col_split = "split"
col_scores = "scores"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

""" 1. Load data """    
df_examples = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_examples.parquet'))
df_products = pd.read_parquet(os.path.join(dataset_path, 'shopping_queries_dataset_products.parquet'))

df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=[col_product_locale, col_product_id],
    right_on=[col_product_locale, col_product_id]
)
df_examples_products = df_examples_products[df_examples_products[col_small_version] == 1]
df_examples_products = df_examples_products[df_examples_products[col_split] == "test"]
df_examples_products = df_examples_products[df_examples_products[col_product_locale] == locale]
df_examples_products_clean = df_examples_products.copy()
df_examples_products_clean['query'] = df_examples_products_clean['query'].apply(clean_text_column) 
df_examples_products_clean['product_title'] = df_examples_products_clean['product_title'].apply(clean_text_column) 
features_query = df_examples_products_clean[col_query].to_list()
features_product = df_examples_products_clean[col_product_title].to_list()
n_examples = len(features_query)
scores = np.zeros(n_examples)

In [86]:
from tqdm import tqdm

def embed_query_products(grouping):
    i, group_df = grouping
    features_query = group_df['query']
    features_product = group_df['product_title']
    q_ids = [i]*len(group_df)
    p_ids = group_df['product_id']
    query_emb = torch.tensor(model.encode(features_query.tolist()))
    prod_emb = torch.tensor(model.encode(features_product.tolist()))
    q_scores = torch.diagonal(torch.mm(query_emb, prod_emb.transpose(0, 1)).to('cpu'))
    return pd.DataFrame({col_query_id: q_ids, col_product_id: p_ids, col_scores: q_scores})
groups = df_examples_products_clean.groupby('query_id')
triplets = []
for g in tqdm(groups):
    triplets.append(embed_query_products(g))

result_df = pd.concat(triplets)

100%|████████████████████████████████████████████████████████████████████████| 8956/8956 [48:58<00:00,  3.05it/s]


In [87]:
df_hypothesis = result_df.sort_values(by=[col_query_id, col_scores], ascending=False)
df_hypothesis.head(50)

Unnamed: 0,query_id,product_id,scores
2614586,130378,B08181G6MP,0.996258
2614585,130378,B01E7KBXWC,0.995996
2614580,130378,B077XC7K7V,0.995936
2614578,130378,B083XVHBVR,0.995841
2614589,130378,B00NAGVL7W,0.995625
2614581,130378,B074DK734G,0.995611
2614593,130378,B081X6DRRT,0.995533
2614592,130378,B0010POWEE,0.995264
2614583,130378,B00KFKDOYO,0.9949
2614590,130378,B00HJZT0YQ,0.993277


In [92]:
""" 4. Prepare hypothesis file """
HYPOTHESIS_TASK1_PATH="./hypothesis"
HYPOTHESIS_PATH_FILE=f"{HYPOTHESIS_TASK1_PATH}/task_1_ranking_model_fine_tuned_triplet_v3_{locale}.csv"

In [93]:
df_hypothesis[[col_query_id, col_product_id]].to_csv(
    HYPOTHESIS_PATH_FILE,
    index=False,
    sep=',',
)