In [10]:
import pandas as pd 
import numpy as np
import warnings

from sentence_transformers import SentenceTransformer
from preprocessor import *
from pattern_search import * 
from ranker import BM25Ranker
import pickle
import joblib

# Notebook-wide settings.
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Remove pandas' limits on how many columns / rows are rendered.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [4]:
from pathlib import Path
# Locations of the input data and the saved models, both under
# <home directory>/data/Target.
PATH = Path("")
target_root = PATH.home() / 'data' / 'Target'
data_path = target_root / 'data'
model_path = target_root / 'models' / 'group10'

In [5]:
# Product hierarchy table (tab-separated file).
group10 = pd.read_csv(data_path / 'group10_header.csv', sep='\t', low_memory=False)

# Scraped information for the same products.
products = pd.read_csv(data_path / 'products.csv')

# Left-join the scraped columns onto the hierarchy rows by product id (tcin),
# so every hierarchy row is kept even when nothing was scraped for it.
group10 = group10.merge(products, how='left', on='tcin')

# Preprocess the merged table, then fill its missing values.
# NOTE(review): preprocess_df and fillNa arrive via the wildcard imports
# (presumably from `preprocessor`) -- confirm what columns they add or change.
group10 = fillNa(preprocess_df(group10))

### Generate embeddings for transformer model

In [6]:
# Name of the sentence-transformers checkpoint used to generate embeddings.
EMBEDDING_MODEL_NAME = 'multi-qa-MiniLM-L6-cos-v1'
lm = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [7]:
# Get the product ids (the `tcin` column) as an array, in the row order of group10
product_ids = group10['tcin'].values

In [8]:
# Concatenate product title text with hierarchy information.
# Missing hierarchy levels become empty strings so they add nothing to the
# sentence.
hierarchy_cols = ['department_name', 'class_name', 'subclass_name',
                  'style_name', 'item_type_name']
for col in hierarchy_cols:
    group10[col] = np.where(pd.isnull(group10[col]), '', group10[col])

# Parts appended after division_name, in sentence order.
sentence_parts = ['department_name', 'class_name', 'item_type_name',
                  'subclass_name', 'style_name', 'title_processed']

# na_rep='' matters: without it, str.cat returns NaN for the whole row when
# division_name or title_processed is missing, which would put a float NaN
# (not a string) into the sentence list handed to the rankers.
product_sentences = list(
    group10['division_name']
    .str.cat(group10[sentence_parts], sep=' ', na_rep='')
    .str.lower()
    .values
)

In [10]:
# Compute embeddings for L1 Ranker
# Number of CPU worker entries passed as target devices to the encoding pool.
N_ENCODER_WORKERS = 8
pool = lm.start_multi_process_pool(target_devices=['cpu'] * N_ENCODER_WORKERS)
try:
    emb = lm.encode_multi_process(product_sentences, pool)
finally:
    # Always shut the pool down, even if encoding fails; otherwise the
    # worker processes started above are left running after this cell.
    lm.stop_multi_process_pool(pool)

# Store ids, sentences & embeddings on disc (path is relative to the
# notebook's working directory).
with open('data/hier_embeddings.pkl', "wb") as fOut:
    pickle.dump({'ids': product_ids,
                 'sentences': product_sentences,
                 'embeddings': emb}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Read the ids, sentences & embeddings back from disc.
# NOTE: pickle.load can execute arbitrary code -- only use it on files this
# notebook (or another trusted source) produced.
with open('data/hier_embeddings.pkl', "rb") as fIn:
    stored_data = pickle.load(fIn)

# Unpack the stored dictionary into one variable per entry.
stored_ids, stored_sentences, stored_embeddings = (
    stored_data[key] for key in ('ids', 'sentences', 'embeddings')
)

### Build the corpus and fit the BM25 ranker

In [None]:
# File that holds the BM25 corpus built from the product sentences.
bm25_corpus_path = 'data/bm25_corpus_no_description'

# Ranker keyed by product id (tcin).
bm25 = BM25Ranker(product_ids=group10['tcin'], max_rank=10)

# Build the corpus from the product sentences, then read it back from the
# same path.
# NOTE(review): presumably fit_corpus persists the corpus at op_path, and
# max_rank caps the number of results returned -- confirm in
# ranker.BM25Ranker.
texts = bm25.fit_corpus(product_sentences, op_path=bm25_corpus_path)
texts = joblib.load(bm25_corpus_path)

# Fit the ranker on the corpus and save the fitted object.
bm25.fit(texts)
joblib.dump(bm25, "./models/bm25_no_description")