In [369]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
import fasttext

from logger import logger

In [9]:
# Useful if you want to perform stemming.
import nltk
# Porter stemmer instance; the normalization cell below applies it to every query token.
stemmer = nltk.stem.PorterStemmer()

# Input: category taxonomy (XML), parsed below to map each category id to its parent id.
categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'
# Input: training queries (CSV); the `category` and `query` columns are read below.
queries_file_name = r'/workspace/datasets/train.csv'
# fastText-formatted labeled queries file that the shell cells below inspect.
# NOTE(review): no Python cell in this notebook reads this variable; presumably it mirrors
# the output path used by create_labeled_queries.py — confirm.
output_file_name = r'/workspace/datasets/fasttext/labeled_queries.txt'

## Level 1, Task 1: Prune the category taxonomy

In [1]:
# Run the labeling script with its default arguments.
# NOTE(review): presumably this (re)writes /workspace/datasets/fasttext/labeled_queries.txt,
# which the following cells inspect — confirm against the script.
!python create_labeled_queries.py

In [4]:
# Line, word and byte counts of the labeled queries file (wc prints all three, in that order).
!wc /workspace/datasets/fasttext/labeled_queries.txt

 1854998  5396970 67889572 /workspace/datasets/fasttext/labeled_queries.txt


In [5]:
# Number of distinct labels: take the first space-separated field of every line (the label),
# de-duplicate it, and count. The first number wc prints (lines) is the distinct-label count.
!cut -d' ' -f1 /workspace/datasets/fasttext/labeled_queries.txt | sort | uniq | wc

   1486    1486   37488


### Read queries

In [328]:
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
# For every category element, the last id in its <path> is treated as the category itself and
# the id just before it as its parent. The root category is skipped because it has no parent.
categories = []
parents = []
for child in root:
    cat_path = child.find('path')
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame({'category': categories, 'parent': parents})

# Read the training data into pandas, only keeping queries with non-root categories in our category tree.
# The explicit .copy() makes the filtered frame independent of the full CSV frame, so the
# column assignments in later cells modify a real frame instead of a slice (which would
# raise pandas' SettingWithCopyWarning).
queries_df = pd.read_csv(queries_file_name)[['category', 'query']]
queries_df = queries_df[queries_df['category'].isin(categories)].copy()

In [329]:
# Show the raw (not yet normalized) row for one known query, to compare before/after normalization.
is_sample_query = queries_df['query'].eq('Beats By Dr. Dre- Monster Pro Over-the-Ear Headphones -')
queries_df[is_sample_query]

Unnamed: 0,category,query
1855395,pcmcat144700050004,Beats By Dr. Dre- Monster Pro Over-the-Ear Hea...


In [330]:
# Full text of the sample query (row label 1855395), without the display truncation above.
queries_df.loc[1855395, 'query']

'Beats By Dr. Dre- Monster Pro Over-the-Ear Headphones -'

In [331]:
# Convert queries to lower case and replace every run of non-alphanumeric characters with a
# single space. No separate "collapse consecutive spaces" pass is needed: a space is itself
# non-alphanumeric, so a run of spaces is already matched by the `+` and becomes one space,
# and the split()/join() below discards any leading or trailing space.
normalized = queries_df['query'].str.lower().str.replace('[^a-z0-9]+', ' ', regex=True)

# Use the nltk stemmer to stem all query tokens. The stemmer is applied once per distinct
# token and the result is looked up per query, instead of re-stemming every occurrence.
token_lists = normalized.str.split()
stem_of = {token: stemmer.stem(token) for token in {t for tokens in token_lists for t in tokens}}
queries_df['query'] = token_lists.apply(lambda tokens: ' '.join(stem_of[token] for token in tokens))

In [332]:
# Same sample row (label 1855395) after normalization and stemming.
queries_df.loc[1855395, 'query']

'beat by dr dre monster pro over the ear headphon'

In [91]:
# Threshold used by the roll-up: categories with fewer queries than this are replaced by their parent.
# NOTE(review): this value is reassigned to 10 at the top of the roll-up cell before any
# visible cell reads it, so the 5 set here has no effect on a top-to-bottom run.
min_queries = 5

In [335]:
# Per-category query totals, followed by a spot check against one known category.
query_count = (
    queries_df
    .groupby('category')['query']
    .count()
    .reset_index(name='query_count')
)
spot_check_total = query_count.loc[query_count['category'].eq('abcat0701001'), 'query_count'].sum()
assert spot_check_total == 13830

In [340]:
min_queries = 10

# Lookup table: category id -> parent id. Duplicate ids are dropped so each category has
# exactly one parent (duplicates would otherwise make the lookup ambiguous).
parent_of = parents_df.drop_duplicates('category').set_index('category')['parent']

# Work on a fresh frame holding only the needed columns, with a clean RangeIndex.
queries_df = queries_df[['category', 'query']].reset_index(drop=True)

while True:
    # Count the number of queries per category
    query_count = queries_df.groupby('category').agg(query_count=('query', 'count')).reset_index()

    # Categories that are still too small. The root category is excluded: it has no parent
    # to roll up into, so treating it as "too small" would keep the loop running forever.
    is_small = (query_count['query_count'] < min_queries) & (query_count['category'] != root_category_id)
    small_categories = query_count.loc[is_small, 'category']

    # If no category is below min_queries, end loop (checked before doing any rewriting work)
    if small_categories.empty:
        break

    # Replace each too-small category with its parent. Substitute the root category id
    # when the category has no known parent.
    needs_rollup = queries_df['category'].isin(small_categories)
    queries_df.loc[needs_rollup, 'category'] = (
        queries_df.loc[needs_rollup, 'category'].map(parent_of).fillna(root_category_id)
    )

### Create labeled queries again with new code

In [401]:
# Re-run the labeling script, passing --min_queries 1000.
# NOTE(review): presumably this is the per-category roll-up threshold prototyped above — confirm against the script.
!python create_labeled_queries.py --min_queries 1000

2023-03-18 19:54:04,402 - Number of queries: 1,854,998
2023-03-18 19:55:22,289 - Queries normalized
2023-03-18 19:55:29,619 - Categories rolled up


In [402]:
# Line, word and byte counts of the regenerated labeled queries file (wc prints all three, in that order).
!wc /workspace/datasets/fasttext/labeled_queries.txt

 1850373  5497256 66082328 /workspace/datasets/fasttext/labeled_queries.txt


In [403]:
# Number of distinct labels (387): unique values of the first space-separated field.
# The first number wc prints (lines) is the distinct-label count.
!cut -d' ' -f1 /workspace/datasets/fasttext/labeled_queries.txt | sort | uniq | wc

    387     387    9422


## Level 1, Task 2: Train a query classifier

In [404]:
# Peek at the first lines of the labeled queries file; as the output below shows, each line is
# a `__label__<category id>` prefix followed by the normalized query text.
!head /workspace/datasets/fasttext/labeled_queries.txt

__label__abcat0101001 television panason 50 pulgada
__label__abcat0101001 sharp
__label__pcmcat193100050014 nook
__label__abcat0101001 rca
__label__abcat0101005 rca
__label__pcmcat143200050016 flat screen tv
__label__pcmcat247400050001 macbook
__label__pcmcat171900050028 blue tooth headphon
__label__abcat0107004 tv antenna
__label__pcmcat186100050006 memori card


In [387]:
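# Shuffle data. The command is kept commented out and run manually in a terminal, where it works fine.
# NOTE(review): presumably the `<(...)` process substitution is what fails from a notebook `!` cell — confirm.
# The fixed --random-source makes the shuffle repeatable.
# !shuf /workspace/datasets/fasttext/labeled_queries.txt --random-source=<(seq 1999999) > /workspace/datasets/fasttext/shuffled_queries.txt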

In [405]:
# Peek at the first lines of the shuffled file to confirm the labels are now mixed.
!head /workspace/datasets/fasttext/shuffled_queries.txt

__label__cat02724 battlestar galactica
__label__abcat0715001 xbox 360 power cord
__label__pcmcat223000050008 pioneer
__label__cat09000 kodak
__label__pcmcat144700050004 skull candi earbud
__label__pcmcat218000050001 ipad 2 keyboard portfolio
__label__abcat0106004 sanu
__label__abcat0107004 antenna
__label__abcat0503002 digit frame
__label__abcat0911006 electrolux vacuum


In [406]:
# Create training and testing splits from the shuffled file:
# the first 50,000 lines become the training set and the last 10,000 lines the test set.
!head -50000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_train.txt
!tail -10000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_test.txt

In [407]:
# Peek at the first 5 training examples.
!head -5 /workspace/datasets/fasttext/queries_train.txt

__label__cat02724 battlestar galactica
__label__abcat0715001 xbox 360 power cord
__label__pcmcat223000050008 pioneer
__label__cat09000 kodak
__label__pcmcat144700050004 skull candi earbud


In [408]:
# Baseline: supervised fastText classifier trained with no hyperparameters overridden.
query_classifier = fasttext.train_supervised(
    input='/workspace/datasets/fasttext/queries_train.txt',
)

Read 0M words
Number of words:  7542
Number of labels: 387
Progress: 100.0% words/sec/thread:    7417 lr:  0.000000 avg.loss:  4.112877 ETA:   0h 0m 0s avg.loss:  4.112877 ETA:   0h 0m 0s


In [409]:
# Score the current classifier on the held-out test file at several cut-offs k.
test_path = '/workspace/datasets/fasttext/queries_test.txt'
for k in (1, 3, 5):
    # The result is unpacked as (number of samples, precision@k, recall@k).
    n, prec, rec = query_classifier.test(path=test_path, k=k)
    logger.info(f'Evaluation on {n:,} samples - precision@{k}: {prec:.4f}, recall@{k}: {rec:.4f}')

2023-03-18 19:56:02,131 - Evaluation on 10,000 samples - precision@1: 0.4797, recall@1: 0.4797
2023-03-18 19:56:02,426 - Evaluation on 10,000 samples - precision@3: 0.2155, recall@3: 0.6466
2023-03-18 19:56:02,725 - Evaluation on 10,000 samples - precision@5: 0.1416, recall@5: 0.7080


In [410]:
# Retrain with a higher learning rate, word bigrams and more epochs than the baseline run.
train_options = {
    'input': '/workspace/datasets/fasttext/queries_train.txt',
    'lr': 0.5,
    'wordNgrams': 2,
    'epoch': 25,
}
query_classifier = fasttext.train_supervised(**train_options)

Read 0M words
Number of words:  7542
Number of labels: 387
Progress: 100.0% words/sec/thread:   17506 lr:  0.000000 avg.loss:  2.249974 ETA:   0h 0m 0s% words/sec/thread:   17620 lr:  0.130218 avg.loss:  2.611910 ETA:   0h 0m 4s


In [411]:
# Score the retrained classifier on the held-out test file at several cut-offs k.
test_path = '/workspace/datasets/fasttext/queries_test.txt'
for k in (1, 3, 5):
    # The result is unpacked as (number of samples, precision@k, recall@k).
    n, prec, rec = query_classifier.test(path=test_path, k=k)
    logger.info(f'Evaluation on {n:,} samples - precision@{k}: {prec:.4f}, recall@{k}: {rec:.4f}')

2023-03-18 19:56:22,490 - Evaluation on 10,000 samples - precision@1: 0.5253, recall@1: 0.5253
2023-03-18 19:56:22,782 - Evaluation on 10,000 samples - precision@3: 0.2382, recall@3: 0.7146
2023-03-18 19:56:23,079 - Evaluation on 10,000 samples - precision@5: 0.1559, recall@5: 0.7794


### Create labeled queries again with 10k min queries

In [395]:
# Re-run the labeling script, passing --min_queries 10000.
# NOTE(review): presumably this is the per-category roll-up threshold — confirm against the script.
!python create_labeled_queries.py --min_queries 10000

2023-03-18 19:50:59,359 - Number of queries: 1,854,998
2023-03-18 19:52:17,020 - Queries normalized
2023-03-18 19:52:24,417 - Categories rolled up


In [396]:
# Line, word and byte counts of the regenerated labeled queries file (wc prints all three, in that order).
!wc /workspace/datasets/fasttext/labeled_queries.txt

 1781690  5298199 61947635 /workspace/datasets/fasttext/labeled_queries.txt


In [397]:
# Number of distinct labels (69): unique values of the first space-separated field.
# The first number wc prints (lines) is the distinct-label count.
!cut -d' ' -f1 /workspace/datasets/fasttext/labeled_queries.txt | sort | uniq | wc

     69      69    1614


In [None]:
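# Re-shuffle after regenerating labeled_queries.txt. This cell was not executed in the notebook, so the
# command has to be run manually; otherwise the split below reuses the previous shuffled_queries.txt.
# !shuf /workspace/datasets/fasttext/labeled_queries.txt --random-source=<(seq 1999999) > /workspace/datasets/fasttext/shuffled_queries.txt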

In [398]:
# Create training and testing splits
!head -50000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_train.txt
!tail -10000 /workspace/datasets/fasttext/shuffled_queries.txt > /workspace/datasets/fasttext/queries_test.txt

In [399]:
# Train on the newly split data with a learning rate of 0.5, word bigrams and 25 epochs.
train_options = {
    'input': '/workspace/datasets/fasttext/queries_train.txt',
    'lr': 0.5,
    'wordNgrams': 2,
    'epoch': 25,
}
query_classifier = fasttext.train_supervised(**train_options)

Read 0M words
Number of words:  7497
Number of labels: 69
Progress: 100.0% words/sec/thread:   62552 lr:  0.000000 avg.loss:  1.616237 ETA:   0h 0m 0s


In [400]:
# Score the classifier trained on the coarser label set at several cut-offs k.
test_path = '/workspace/datasets/fasttext/queries_test.txt'
for k in (1, 3, 5):
    # The result is unpacked as (number of samples, precision@k, recall@k).
    n, prec, rec = query_classifier.test(path=test_path, k=k)
    logger.info(f'Evaluation on {n:,} samples - precision@{k}: {prec:.4f}, recall@{k}: {rec:.4f}')

2023-03-18 19:53:49,807 - Evaluation on 10,000 samples - precision@1: 0.5893, recall@1: 0.5893
2023-03-18 19:53:49,873 - Evaluation on 10,000 samples - precision@3: 0.2624, recall@3: 0.7873
2023-03-18 19:53:49,941 - Evaluation on 10,000 samples - precision@5: 0.1686, recall@5: 0.8432


## Level 2, Task 1: Add the query classifier to query processing