In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [91]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
import fasttext
from opensearchpy import OpenSearch
from getpass import getpass
from itertools import zip_longest

#from logger import logger

In [2]:
# Input/output locations used throughout the notebook.
categories_file_name = '/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'
queries_file_name = '/workspace/datasets/train.csv'
output_file_name = '/workspace/datasets/fasttext/labeled_queries.txt'

# Porter stemmer from NLTK; applied to every query token during normalization.
import nltk
stemmer = nltk.stem.PorterStemmer()

## Level 1, Task 1: Prune the category taxonomy

In [1]:
# Run the labeling script with its default arguments.
# Presumably it writes /workspace/datasets/fasttext/labeled_queries.txt, which the next cells inspect — confirm in the script.
!python create_labeled_queries.py

In [4]:
# Line, word and byte counts of the labeled queries file (wc prints them in that order)
!wc /workspace/datasets/fasttext/labeled_queries.txt

 1854998  5396970 67889572 /workspace/datasets/fasttext/labeled_queries.txt


In [5]:
# Number of distinct labels: the label is the first space-separated field of each line,
# so the first number printed by wc (line count of the de-duplicated labels) is the answer
!cut -d' ' -f1 /workspace/datasets/fasttext/labeled_queries.txt | sort | uniq | wc

   1486    1486   37488


### Read queries

In [328]:
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
# The code treats the last entry of a category's <path> as the category itself and the
# entry before it as its direct parent.
categories = []
parents = []
for child in root:
    cat_path_ids = [cat.find('id').text for cat in child.find('path')]
    # Skip the root (it has no parent) and any path too short to contain a parent entry,
    # which would otherwise raise IndexError on the [-2] lookup below.
    if len(cat_path_ids) < 2 or cat_path_ids[-1] == root_category_id:
        continue
    categories.append(cat_path_ids[-1])
    parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame({'category': categories, 'parent': parents})

# Read the training data into pandas, only keeping queries with non-root categories in our category tree.
# Only the two needed columns are loaded; the trailing selection pins the column order.
queries_df = pd.read_csv(queries_file_name, usecols=['category', 'query'])[['category', 'query']]
# .copy() makes the filtered frame independent, so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
queries_df = queries_df[queries_df['category'].isin(categories)].copy()

In [329]:
# Find the row of one known raw query; it serves as a before/after example for normalization.
example_query = 'Beats By Dr. Dre- Monster Pro Over-the-Ear Headphones -'
queries_df.loc[queries_df['query'] == example_query]

Unnamed: 0,category,query
1855395,pcmcat144700050004,Beats By Dr. Dre- Monster Pro Over-the-Ear Hea...


In [330]:
# Full text of the example query (row label 1855395) before normalization
queries_df.loc[1855395, 'query']

'Beats By Dr. Dre- Monster Pro Over-the-Ear Headphones -'

In [331]:
# Normalize the query text: lower-case it, turn every run of non-alphanumeric
# characters into a single space, then squeeze any repeated spaces.
queries_df['query'] = (
    queries_df['query']
    .str.lower()
    .str.replace('[^a-z0-9]+', ' ', regex=True)
    .str.replace(' +', ' ', regex=True)
)


def _stem_tokens(text):
    """Stem each whitespace-separated token of `text` and re-join the stems with single spaces."""
    return ' '.join(stemmer.stem(token) for token in text.split())


# Use the nltk stemmer to stem all query tokens
queries_df['query'] = queries_df['query'].apply(_stem_tokens)

In [332]:
# Same example row (label 1855395) after normalization and stemming
queries_df.loc[1855395, 'query']

'beat by dr dre monster pro over the ear headphon'

In [91]:
# Minimum number of queries a category needs to keep its own label.
# NOTE(review): the roll-up cell further down reassigns this to 10 before using it — confirm which value is intended.
min_queries = 5

In [335]:
# Number of queries per category, as a frame with columns `category` and `query_count`
query_count = queries_df.groupby('category')['query'].count().reset_index(name='query_count')

# Sanity check: one known category must have exactly the expected number of queries
is_checked_category = query_count['category'] == 'abcat0701001'
assert query_count.loc[is_checked_category, 'query_count'].sum() == 13830

In [340]:
min_queries = 10

# Repeatedly roll categories with too few queries up into their parent category.
# Each pass moves under-populated categories one level up, so several passes may be needed.
while True:
    # Count the number of queries per category
    query_count = queries_df.groupby('category').agg(query_count=('query', 'count')).reset_index()

    # Merge in query counts and parent category. Substitute parent category with root category id if missing
    queries_df = queries_df.merge(query_count, on='category', how='left').merge(parents_df, on='category', how='left')
    queries_df.loc[queries_df['parent'].isnull(), 'parent'] = root_category_id

    # If category count less than min_queries, update category with parent
    queries_df.loc[queries_df['query_count'] < min_queries, 'category'] = queries_df['parent']

    # Keep only necessary columns
    queries_df = queries_df[['category', 'query']]

    # Stop once no category that can still be rolled up is below the minimum.
    # The root category is excluded from the check: it has no parent, so if it ended up
    # with fewer than min_queries queries another pass could never fix it and the loop
    # would otherwise run forever.
    below_min = query_count['query_count'] < min_queries
    has_parent = query_count['category'] != root_category_id
    if not (below_min & has_parent).any():
        break

### Create labeled queries again with new code

In [401]:
# Re-run the labeling script, passing a minimum of 1000 queries per category via --min_queries
!python create_labeled_queries.py --min_queries 1000

2023-03-18 19:54:04,402 - Number of queries: 1,854,998
2023-03-18 19:55:22,289 - Queries normalized
2023-03-18 19:55:29,619 - Categories rolled up


In [402]:
# Line, word and byte counts of the regenerated labeled queries file (wc prints them in that order)
!wc /workspace/datasets/fasttext/labeled_queries.txt

 1850373  5497256 66082328 /workspace/datasets/fasttext/labeled_queries.txt


In [403]:
# Number of distinct labels (387): unique values of the first space-separated field, counted by wc
!cut -d' ' -f1 /workspace/datasets/fasttext/labeled_queries.txt | sort | uniq | wc

    387     387    9422


## Level 1, Task 2: Train a query classifier

In [404]:
# Peek at the first lines: each line is `__label__<category id>` followed by the normalized query text
!head /workspace/datasets/fasttext/labeled_queries.txt

__label__abcat0101001 television panason 50 pulgada
__label__abcat0101001 sharp
__label__pcmcat193100050014 nook
__label__abcat0101001 rca
__label__abcat0101005 rca
__label__pcmcat143200050016 flat screen tv
__label__pcmcat247400050001 macbook
__label__pcmcat171900050028 blue tooth headphon
__label__abcat0107004 tv antenna
__label__pcmcat186100050006 memori card


In [387]:
# Shuffle data (works fine in terminal)
# !shuf /workspace/datasets/fasttext/labeled_queries.txt --random-source=<(seq 1999999) > /workspace/datasets/fasttext/shuffled_queries.txt

In [405]:
# Peek at the first lines of the shuffled file to confirm the order differs from the labeled file
!head /workspace/datasets/fasttext/shuffled_queries.txt

__label__cat02724 battlestar galactica
__label__abcat0715001 xbox 360 power cord
__label__pcmcat223000050008 pioneer
__label__cat09000 kodak
__label__pcmcat144700050004 skull candi earbud
__label__pcmcat218000050001 ipad 2 keyboard portfolio
__label__abcat0106004 sanu
__label__abcat0107004 antenna
__label__abcat0503002 digit frame
__label__abcat0911006 electrolux vacuum


In [406]:
# Create training and testing splits from the shuffled file:
# the first 50,000 lines become the training set, the last 10,000 lines the test set
!head -50000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_train.txt
!tail -10000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_test.txt

In [407]:
# First five training lines; they should match the start of the shuffled file
!head -5 /workspace/datasets/fasttext/queries_train.txt

__label__cat02724 battlestar galactica
__label__abcat0715001 xbox 360 power cord
__label__pcmcat223000050008 pioneer
__label__cat09000 kodak
__label__pcmcat144700050004 skull candi earbud


In [3]:
# Baseline classifier: supervised fastText training with no hyper-parameters overridden
train_file = '/workspace/datasets/fasttext/queries_train.txt'
query_classifier = fasttext.train_supervised(input=train_file)

Read 0M words
Number of words:  7542
Number of labels: 387
Progress: 100.0% words/sec/thread:    7264 lr:  0.000000 avg.loss:  4.181074 ETA:   0h 0m 0s


In [4]:
# Log precision and recall at several cut-offs k, measured on the held-out test file.
# NOTE(review): `logger` is only imported further down (`from query import search, logger`);
# on a fresh top-to-bottom run this cell would presumably raise NameError — confirm and move that import up.
test_file = '/workspace/datasets/fasttext/queries_test.txt'
for k in (1, 3, 5):
    n, prec, rec = query_classifier.test(path=test_file, k=k)
    logger.info(f'Evaluation on {n:,} samples - precision@{k}: {prec:.4f}, recall@{k}: {rec:.4f}')

2023-03-18 22:16:34,503 - Evaluation on 10,000 samples - precision@1: 0.4806, recall@1: 0.4806
2023-03-18 22:16:34,807 - Evaluation on 10,000 samples - precision@3: 0.2163, recall@3: 0.6489
2023-03-18 22:16:35,142 - Evaluation on 10,000 samples - precision@5: 0.1414, recall@5: 0.7072


In [5]:
# Retrain with explicit hyper-parameters: learning rate 0.5, word bigrams and 25 epochs
train_params = {'lr': 0.5, 'wordNgrams': 2, 'epoch': 25}
query_classifier = fasttext.train_supervised(
    input='/workspace/datasets/fasttext/queries_train.txt',
    **train_params,
)

Read 0M words
Number of words:  7542
Number of labels: 387
Progress: 100.0% words/sec/thread:   16086 lr:  0.000000 avg.loss:  2.231277 ETA:   0h 0m 0s 94.4% words/sec/thread:   16127 lr:  0.028110 avg.loss:  2.301684 ETA:   0h 0m 1s


In [6]:
# Log precision and recall at k = 1, 3 and 5 for the retrained classifier on the test file
test_file = '/workspace/datasets/fasttext/queries_test.txt'
for k in (1, 3, 5):
    n, prec, rec = query_classifier.test(path=test_file, k=k)
    logger.info(f'Evaluation on {n:,} samples - precision@{k}: {prec:.4f}, recall@{k}: {rec:.4f}')

2023-03-18 22:16:56,703 - Evaluation on 10,000 samples - precision@1: 0.5274, recall@1: 0.5274
2023-03-18 22:16:57,009 - Evaluation on 10,000 samples - precision@3: 0.2380, recall@3: 0.7139
2023-03-18 22:16:57,326 - Evaluation on 10,000 samples - precision@5: 0.1557, recall@5: 0.7786


In [8]:
# Persist the trained classifier to disk; it is loaded again later with fasttext.load_model
query_classifier.save_model('/workspace/models/query_classifier.bin')

### Create labeled queries again with 10k min queries

In [395]:
# Re-run the labeling script, passing a minimum of 10000 queries per category via --min_queries
!python create_labeled_queries.py --min_queries 10000

2023-03-18 19:50:59,359 - Number of queries: 1,854,998
2023-03-18 19:52:17,020 - Queries normalized
2023-03-18 19:52:24,417 - Categories rolled up


In [396]:
# Line, word and byte counts of the regenerated labeled queries file (wc prints them in that order)
!wc /workspace/datasets/fasttext/labeled_queries.txt

 1781690  5298199 61947635 /workspace/datasets/fasttext/labeled_queries.txt


In [397]:
# Number of distinct labels (69): unique values of the first space-separated field, counted by wc
!cut -d' ' -f1 /workspace/datasets/fasttext/labeled_queries.txt | sort | uniq | wc

     69      69    1614


In [None]:
# !shuf /workspace/datasets/fasttext/labeled_queries.txt --random-source=<(seq 1999999) > /workspace/datasets/fasttext/shuffled_queries.txt

In [398]:
# Create training and testing splits from the shuffled file:
# the first 50,000 lines become the training set, the last 10,000 lines the test set
!head -50000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_train.txt
!tail -10000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_test.txt

In [399]:
# Train on the coarser label set with explicit hyper-parameters: learning rate 0.5, word bigrams and 25 epochs
train_params = {'lr': 0.5, 'wordNgrams': 2, 'epoch': 25}
query_classifier = fasttext.train_supervised(
    input='/workspace/datasets/fasttext/queries_train.txt',
    **train_params,
)

Read 0M words
Number of words:  7497
Number of labels: 69
Progress: 100.0% words/sec/thread:   62552 lr:  0.000000 avg.loss:  1.616237 ETA:   0h 0m 0s


In [400]:
# Log precision and recall at k = 1, 3 and 5 for this classifier on the test file
test_file = '/workspace/datasets/fasttext/queries_test.txt'
for k in (1, 3, 5):
    n, prec, rec = query_classifier.test(path=test_file, k=k)
    logger.info(f'Evaluation on {n:,} samples - precision@{k}: {prec:.4f}, recall@{k}: {rec:.4f}')

2023-03-18 19:53:49,807 - Evaluation on 10,000 samples - precision@1: 0.5893, recall@1: 0.5893
2023-03-18 19:53:49,873 - Evaluation on 10,000 samples - precision@3: 0.2624, recall@3: 0.7873
2023-03-18 19:53:49,941 - Evaluation on 10,000 samples - precision@5: 0.1686, recall@5: 0.8432


## Level 2, Task 1: Add the query classifier to query processing

In [41]:
# Cumulative probability the accepted categories must exceed before no further categories are added
prob_threshold = 0.5

In [9]:
# Load the classifier from the model file written by the earlier save_model call
query_classifier = fasttext.load_model('/workspace/models/query_classifier.bin')



In [39]:
# Ask the classifier for its five most likely labels (and their scores) for a sample query.
# NOTE(review): this rebinds `categories`, which earlier held the list of category ids parsed
# from the XML; re-running the query-filtering cell after this one would use the wrong value.
categories, scores = query_classifier.predict('lollipop', k=5)

In [60]:
# Walk the predictions in the order returned, collecting category ids (label prefix removed)
# until their summed score exceeds prob_threshold.
category_list = []
cum_prob = 0

for category, score in zip(categories, scores):
    logger.info(f'Category: {category}, Score: {score}')

    category_list += [category.replace('__label__', '')]
    cum_prob = cum_prob + score

    if cum_prob > prob_threshold:
        break

2023-03-18 23:38:04,272 - Category: __label__cat02015, Score: 0.3939267098903656
2023-03-18 23:38:04,273 - Category: __label__cat02009, Score: 0.09578841924667358
2023-03-18 23:38:04,273 - Category: __label__cat09000, Score: 0.04979415610432625


In [61]:
# Category ids collected before the cumulative score exceeded the threshold
category_list

['cat02015', 'cat02009', 'cat09000']

In [85]:
# Run the query script for 'xbox' with the --synonyms flag (the output logs 'Querying via synonyms')
!python ../utilities/query.py --query xbox --synonyms

INFO:Querying via synonyms
{
  "took": 170,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 10000,
      "relation": "gte"
    },
    "max_score": 881.81964,
    "hits": [
      {
        "_index": "bbuy_products",
        "_id": "2613542",
        "_score": 881.81964,
        "_source": {
          "categoryPathIds": [
            "cat00000",
            "abcat0700000",
            "abcat0701000",
            "abcat0701002"
          ],
          "name": [
            "Assassin's Creed: Revelations - Xbox 360"
          ],
          "shortDescription": [
            "Prepare for the final chapter of this award-winning trilogy"
          ]
        }
      },
      {
        "_index": "bbuy_products",
        "_id": "9854804",
        "_score": 675.6224,
        "_source": {
          "categoryPathIds": [
            "cat00000",
            "abcat0700000",
            "abcat0701000

In [84]:
# Run the query script for 'xbox' with the --query_filter flag; the output logs the
# classification result and the category list derived from it
!python ../utilities/query.py --query xbox --query_filter



INFO:Classification result: ('__label__abcat0701001',), [0.54385275]
INFO:Category: __label__abcat0701001, Score: 0.5438527464866638
INFO:Category list: ['abcat0701001'], Cumulative probability: 0.5438527464866638
{
  "took": 10,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 69,
      "relation": "eq"
    },
    "max_score": 568.3101,
    "hits": [
      {
        "_index": "bbuy_products",
        "_id": "1162184",
        "_score": 568.3101,
        "_source": {
          "categoryPathIds": [
            "cat00000",
            "abcat0700000",
            "abcat0701000",
            "abcat0701001"
          ],
          "name": [
            "Microsoft - Xbox 360 4GB Console - Black"
          ],
          "shortDescription": [
            "Expand your gaming universe 360\u00b0"
          ]
        }
      },
      {
        "_index": "bbuy_products",
        "_id": "1162439",

### Check filter impact

In [3]:
# Put the utilities directory on the import path so that `query` (../utilities/query.py) can be imported
import sys
sys.path.append('../utilities/')

In [15]:
from IPython.display import HTML

In [4]:
from query import search, logger



In [5]:
# Connection settings for the local OpenSearch instance
host = 'localhost'
port = 9200
# Prompt for the password and actually use it, rather than discarding the prompted
# value and hardcoding the credential in the notebook
password = getpass()
auth = ('admin', password)

opensearch = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl=True,
    verify_certs=False,  # set to true if you have certs
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Name of the product index queried by the comparison cells
index = 'bbuy_products'

········


In [72]:
def html_hit(name, filtered_name, image_url, filtered_image_url):
    """Render one comparison row as an HTML snippet with two side-by-side columns.

    The left column shows `name` with the image at `image_url`; the right column shows
    `filtered_name` with the image at `filtered_image_url`.

    All four values are HTML-escaped before interpolation, so quotes, `<`, `>` and `&`
    in product names or URLs cannot break the attributes or inject markup.

    Returns:
        str: the HTML fragment for this row.
    """
    # Imported locally and by name because the notebook rebinds the global name `html` to a string.
    from html import escape

    name, filtered_name = escape(str(name)), escape(str(filtered_name))
    image_url, filtered_image_url = escape(str(image_url)), escape(str(filtered_image_url))

    return f'''
        <div style="display: flex;">
          <!-- Left column -->
          <div style="flex: 1; margin-right: 20px;">
            <div>
              <img src="{image_url}" alt="{name}" style="max-width: 50%; max-height: 200px">
            </div>
            <p>{name}</p>
          </div>
          <!-- Right column -->
          <div style="flex: 1; margin-left: 20px;">
            <div>
              <img src="{filtered_image_url}" alt="{filtered_name}" style="max-width: 50%; max-height: 200px">
            </div>
            <p>{filtered_name}</p>
          </div>
        </div>
    '''

In [73]:
query = 'xbox'

# Run the same query without and with the query-classification filter
result = search(client=opensearch, index=index, user_query=query, query_filter=False)
result_with_filter = search(client=opensearch, index=index, user_query=query, query_filter=True)

html = ''
# zip_longest (rather than zip) keeps the rows where only one of the two result lists
# still has a hit, instead of silently truncating to the shorter list.
for hit, filtered_hit in zip_longest(result['hits']['hits'], result_with_filter['hits']['hits']):
    # Start every row from the placeholder so a missing hit is shown as 'No Result'
    name, filtered_name, image_url, filtered_image_url = ['No Result'] * 4
    if hit is not None:
        name, image_url = hit['_source']['name'][0], hit['_source']['image'][0]
    if filtered_hit is not None:
        filtered_name, filtered_image_url = filtered_hit['_source']['name'][0], filtered_hit['_source']['image'][0]
    logger.debug(f'Name: {name}, Filtered name: {filtered_name}')
    logger.debug(f'Image URL: {image_url}, Filter image URL: {filtered_image_url}')
    html += html_hit(name, filtered_name, image_url, filtered_image_url)

logger.info('Left: Results without filter, Right: Results with query classification filter')
HTML(html)

2023-03-19 00:31:31,096 - Classification result: ('__label__abcat0701001', '__label__abcat0701003', '__label__abcat0715007', '__label__pcmcat232900050029', '__label__abcat0507000'), [0.54385275 0.0694699  0.04747236 0.04197487 0.04113299]
2023-03-19 00:31:31,097 - Category: __label__abcat0701001, Score: 0.5438527464866638
2023-03-19 00:31:31,098 - Category list: ['abcat0701001'], Cumulative probability: 0.5438527464866638
2023-03-19 00:31:31,113 - Left: Results without filter, Right: Results with query classification filter


In [75]:
query = 'phone'

# Run the same query without and with the query-classification filter
result = search(client=opensearch, index=index, user_query=query, query_filter=False)
result_with_filter = search(client=opensearch, index=index, user_query=query, query_filter=True)

html = ''
# zip_longest (rather than zip) keeps the rows where only one of the two result lists
# still has a hit, instead of silently truncating to the shorter list.
for hit, filtered_hit in zip_longest(result['hits']['hits'], result_with_filter['hits']['hits']):
    # Start every row from the placeholder so a missing hit is shown as 'No Result'
    name, filtered_name, image_url, filtered_image_url = ['No Result'] * 4
    if hit is not None:
        name, image_url = hit['_source']['name'][0], hit['_source']['image'][0]
    if filtered_hit is not None:
        filtered_name, filtered_image_url = filtered_hit['_source']['name'][0], filtered_hit['_source']['image'][0]
    logger.debug(f'Name: {name}, Filtered name: {filtered_name}')
    logger.debug(f'Image URL: {image_url}, Filter image URL: {filtered_image_url}')
    html += html_hit(name, filtered_name, image_url, filtered_image_url)

logger.info('Left: Results without filter, Right: Results with query classification filter')
HTML(html)

2023-03-19 00:31:32,535 - Classification result: ('__label__pcmcat209400050001', '__label__abcat0404004', '__label__pcmcat162100050040', '__label__pcmcat159300050002', '__label__abcat0208011'), [0.24659158 0.13511917 0.07489123 0.05283668 0.05140621]
2023-03-19 00:31:32,536 - Category: __label__pcmcat209400050001, Score: 0.24659158289432526
2023-03-19 00:31:32,536 - Category: __label__abcat0404004, Score: 0.13511916995048523
2023-03-19 00:31:32,537 - Category: __label__pcmcat162100050040, Score: 0.07489123195409775
2023-03-19 00:31:32,538 - Category: __label__pcmcat159300050002, Score: 0.05283668264746666
2023-03-19 00:31:32,539 - Category list: ['pcmcat209400050001', 'abcat0404004', 'pcmcat162100050040', 'pcmcat159300050002'], Cumulative probability: 0.5094386674463749
2023-03-19 00:31:32,553 - Left: Results without filter, Right: Results with query classification filter


In [101]:
query = 'dress'

# Run the same query without and with the query-classification filter
result = search(client=opensearch, index=index, user_query=query, query_filter=False)
result_with_filter = search(client=opensearch, index=index, user_query=query, query_filter=True)

html = ''
for hit, filtered_hit in zip_longest(result['hits']['hits'], result_with_filter['hits']['hits']):
    # Reset the placeholders on every row. Setting them only once before the loop let a
    # side that had run out of hits keep repeating the name and image of its last hit.
    name, filtered_name, image_url, filtered_image_url = ['No Result'] * 4
    if hit is not None:
        name, image_url = hit['_source']['name'][0], hit['_source']['image'][0]
    if filtered_hit is not None:
        filtered_name, filtered_image_url = filtered_hit['_source']['name'][0], filtered_hit['_source']['image'][0]
    logger.debug(f'Name: {name}, Filtered name: {filtered_name}')
    logger.debug(f'Image URL: {image_url}, Filter image URL: {filtered_image_url}')
    html += html_hit(name, filtered_name, image_url, filtered_image_url)

logger.info('Left: Results without filter, Right: Results with query classification filter')
HTML(html)

2023-03-19 00:46:48,727 - Classification result: ('__label__cat02009', '__label__cat02015', '__label__cat09000', '__label__cat02661', '__label__cat02685'), [0.64068973 0.06253754 0.02229895 0.01158105 0.0099335 ]
2023-03-19 00:46:48,728 - Category: __label__cat02009, Score: 0.6406897306442261
2023-03-19 00:46:48,729 - Category list: ['cat02009'], Cumulative probability: 0.6406897306442261
2023-03-19 00:46:48,737 - Left: Results without filter, Right: Results with query classification filter


In [103]:
query = 'candy'

# Run the same query without and with the query-classification filter
result = search(client=opensearch, index=index, user_query=query, query_filter=False)
result_with_filter = search(client=opensearch, index=index, user_query=query, query_filter=True)

html = ''
for hit, filtered_hit in zip_longest(result['hits']['hits'], result_with_filter['hits']['hits']):
    # Reset the placeholders on every row. Setting them only once before the loop let a
    # side that had run out of hits keep repeating the name and image of its last hit.
    name, filtered_name, image_url, filtered_image_url = ['No Result'] * 4
    if hit is not None:
        name, image_url = hit['_source']['name'][0], hit['_source']['image'][0]
    if filtered_hit is not None:
        filtered_name, filtered_image_url = filtered_hit['_source']['name'][0], filtered_hit['_source']['image'][0]
    logger.debug(f'Name: {name}, Filtered name: {filtered_name}')
    logger.debug(f'Image URL: {image_url}, Filter image URL: {filtered_image_url}')
    html += html_hit(name, filtered_name, image_url, filtered_image_url)

logger.info('Left: Results without filter, Right: Results with query classification filter')
HTML(html)

2023-03-19 00:47:08,038 - Classification result: ('__label__cat02015', '__label__cat02009', '__label__cat09000', '__label__pcmcat247400050000', '__label__abcat0101001'), [0.38669026 0.09733694 0.05076459 0.03645941 0.02815702]
2023-03-19 00:47:08,039 - Category: __label__cat02015, Score: 0.38669025897979736
2023-03-19 00:47:08,040 - Category: __label__cat02009, Score: 0.09733694046735764
2023-03-19 00:47:08,040 - Category: __label__cat09000, Score: 0.05076458677649498
2023-03-19 00:47:08,041 - Category list: ['cat02015', 'cat02009', 'cat09000'], Cumulative probability: 0.53479178622365
2023-03-19 00:47:08,049 - Left: Results without filter, Right: Results with query classification filter
