# Level 1: Query Classification

In [1]:
from fastcore.foundation import L
from fastcore.basics import *
from fastcore.test import *
from nbdev.showdoc import show_doc

In [2]:
import fasttext as ft

In [3]:
import re
import string
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language='english')
_re_spec = re.compile(r'([/#\\-\\.:\'\"])')

def spec_add_spaces(t):
    "Add spaces around \" ' . : - / \ and #"
    return _re_spec.sub(r' \1 ', t)

# Causes the resulting RE to match from m to n repetitions of the preceding RE, attempting to match as many repetitions as possible.
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return _re_space.sub(' ', t)

In [4]:
test_eq(spec_add_spaces(".nltk"), ' . nltk')
test_eq(spec_add_spaces("nltk:"), 'nltk : ')
test_eq(spec_add_spaces("nltk'"), "nltk ' ")
test_eq(spec_add_spaces("nltk\""), 'nltk " ')

# Prune the Category Taxonomy

## Transform Queries

Convert the queries to lowercase, strip quotation marks (and perhaps other punctuation), and optionally implement other normalization, like using the nltk stemmer.

In [5]:
def rm_punct(t):
    for p in string.punctuation:
        t = t.replace(p, ' ')
    return t

def transform(query):
    "Transform query by replacing punctuations, removing multiple spaces, stemming, lower casing"
    
    # replace_punct
    query = spec_add_spaces(query)
    
    # remove punct
    query = rm_punct(query)
    
    # replace multiple spaces
    query = rm_useless_spaces(query)
    
    # fix registered, trademark, copyright symbol
    # remove non-ascii characters from query
    query = query.encode(encoding='ascii', errors='ignore').decode()
    
    query = ' '.join([o for o in query.split(' ') if not o.isnumeric()])
    
    # add stemmer
    query = stemmer.stem(query)
    return query.lower()

In [6]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv

# Useful if you want to perform stemming.
import nltk
stemmer = nltk.stem.PorterStemmer()

categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

queries_file_name = r'/workspace/datasets/train.csv'
output_file_name = r'/workspace/datasets/labeled_query_data.txt'

# parser = argparse.ArgumentParser(description='Process arguments.')
# general = parser.add_argument_group("general")
# general.add_argument("--min_queries", default=1,  help="The minimum number of queries per category label (default is 1)")
# general.add_argument("--output", default=output_file_name, help="the file to output to")

# args = parser.parse_args()
# output_file_name = args.output

# if args.min_queries:
#     min_queries = int(args.min_queries)
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

Read the category tree from the categories XML file.

In [7]:
tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
categories = []
parents = []
for child in root:
    id = child.find('id').text
    cat_path = child.find('path')
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame(list(zip(categories, parents)), columns =['category', 'parent'])

In [8]:
# Read the training data into pandas, only keeping queries with non-root categories in our category tree.
df = pd.read_csv(queries_file_name)[['category', 'query']]
df = df[df['category'].isin(categories)]

In [9]:
# IMPLEMENTED: Convert queries to lowercase, and optionally implement other normalization, like stemming.
df['query'] = df['query'].apply(transform)

## Query Count of Leaf Categories

Compute the query count of all leaf categories.

What is leaf? 
A category that is not a parent. So it cannot exist in the parent column. For each category check if exists in parent column, then update is_leaf value for that row.

In [10]:
test_eq(len(parents_df), 4639)

In [11]:
test_eq(root_category_id in parents_df.values, True)
test_eq(root_category_id in categories, False)

In [12]:
parents_df['is_leaf'] = ~parents_df.category.isin(parents_df.parent)
parents_df['roll_up'] = False

In [13]:
def parent(cat):
    if cat in parents_df.category.values:
        return parents_df[parents_df.category == cat ].parent.to_list()[0]
    else:
        return root_category_id

In [14]:
aleaf = parents_df[parents_df.is_leaf].category.values[0] # 'abcat0011001'
test_eq(len(parents_df[parents_df.parent == aleaf]), 0)

expected = 'abcat0011000'
test_eq(parent('abcat0011003'), expected)

In [15]:
# Accepts a seq of tuple
data = L(df.category.value_counts().to_dict().items())
qcount = pd.DataFrame.from_records(data, columns=['cat', 'count'])

cat_counts = df.category.value_counts().to_dict()

# Compute the query counts of all categories from train.csv and 0 if category not present
parent_counts = [cat_counts.get(o['category'], 0) for _, o in parents_df.iterrows()]
parents_df['counts'] = parent_counts

## Pruning: Rollup Categories to Min. # of Queries per Category

For example, if the minimum number of queries is 100 and there are only 99 queries mapped to “Best Buy > Musical Instruments > Guitars > Bass Guitars”, then those queries would be mapped to parent category “Best Buy > Musical Instruments > Guitars”. If the minimum number of queries were higher (e.g., 1,000), then those queries might have to be rolled up to an even broader category, like ““Best Buy > Musical Instruments”.

In [16]:
source = df.copy()

In [17]:
def rollup(df, min_queries=1000):
    print(f"Min # of Queries per Category : {min_queries}")
    cat_count = df.groupby('category').size().to_frame('count')

    print(f"# of Categories before pruning : {len(df.category.value_counts())}")

    # categories to be pruned
    prune_df = cat_count[cat_count['count'] < min_queries];

    while len(prune_df) > 0:
        for cat, count in prune_df.iterrows():
            df.replace(to_replace=cat, value=parent(cat), inplace=True)
        cat_count = df.groupby('category').size().to_frame('count')
        prune_df = cat_count[cat_count['count'] < min_queries]
        print(f"Categories to be pruned : {len(prune_df)}")
    
    print()
    print(f"# of Categories after pruning : {len(df.category.value_counts())}")

In [18]:
df = source
rollup(df, 100)

Min # of Queries per Category : 100
# of Categories before pruning : 1486
Categories to be pruned : 140
Categories to be pruned : 33
Categories to be pruned : 5
Categories to be pruned : 2
Categories to be pruned : 1
Categories to be pruned : 0

# of Categories after pruning : 878


In [26]:
min_queries=1000
df = source

In [27]:
# IMPLEMENTED: Roll up categories to ancestors to satisfy the minimum number of queries per category.
cat_count = df.groupby('category').size().to_frame('count')

print(f"# of Categories before pruning : {len(df.category.value_counts())}")

# categories to be pruned
prune_df = cat_count[cat_count['count'] < min_queries];

while len(prune_df) > 0:
    for cat, count in prune_df.iterrows():
        df.replace(to_replace=cat, value=parent(cat), inplace=True)
    cat_count = df.groupby('category').size().to_frame('count')
    prune_df = cat_count[cat_count['count'] < min_queries]
    print(f"Categories to be pruned : {len(prune_df)}")
    
print()
print(f"# of Categories after pruning : {len(df.category.value_counts())}")

# of Categories before pruning : 1486
Categories to be pruned : 232
Categories to be pruned : 42
Categories to be pruned : 5
Categories to be pruned : 1
Categories to be pruned : 0

# of Categories after pruning : 386


In [29]:
# Create labels in fastText format.
df['label'] = '__label__' + df['category']

# Output labeled query data as a space-separated file, making sure that every category is in the taxonomy.
df = df[df['category'].isin(categories)]
df['output'] = df['label'] + ' ' + df['query']

In [59]:
df[['output']].to_csv(output_file_name, header=False, sep='|', escapechar='\\', quoting=csv.QUOTE_NONE, index=False)

# Train a Classifier (with min Q set to 1000)

In [8]:
def print_res(res, k=1): 
    print(f"N\t: {res[0]} \nP@{k}\t: {res[1]:.3f} \nR@{k}\t: {res[2]:.3f}\n")

In [13]:
# https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
#df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
shuf_output_fname = '/workspace/datasets/shuf_labeled_query_data.txt'

In [2]:
train_file = '/workspace/datasets/queries.train'
test_file = '/workspace/datasets/queries.test'

In [16]:
output_file_name, shuf_output_fname

('/workspace/datasets/labeled_query_data.txt',
 '/workspace/datasets/shuf_labeled_query_data.txt')

In [17]:
# Shuffle the input before train/test split
!shuf {output_file_name} > {shuf_output_fname}

Set aside first 50000 rows as train and last 10000 rows as test

In [19]:
!head -n 50000 {shuf_output_fname} > {train_file}
!tail -n 50000 {shuf_output_fname} > {test_file}

There may be a problem with the way we are splitting here. Some of the categories in train may not be in test and vice versa. Ideally we should have used stratified train test split to ensure the category distribution is similar in train & test dataset.

In [20]:
model=ft.train_supervised(train_file)

Read 0M words
Number of words:  7793
Number of labels: 385
Progress: 100.0% words/sec/thread:   13485 lr:  0.000000 avg.loss:  4.263705 ETA:   0h 0m 0s


In [21]:
L(model.labels)

(#385) ['__label__cat02015','__label__abcat0101001','__label__pcmcat247400050000','__label__pcmcat209000050008','__label__pcmcat144700050004','__label__pcmcat209400050001','__label__abcat0703002','__label__pcmcat247400050001','__label__abcat0201011','__label__pcmcat209000050007'...]

## Fasttext with default parameters

In [22]:
model.epoch, model.lr, model.wordNgrams

(5, 0.1, 1)

In [23]:
res = model.test(test_file, k=1);print_res(res, k=1); print()
res = model.test(test_file, k=3);print_res(res, k=3); print()
res = model.test(test_file, k=5);print_res(res, k=5); print()

N	: 50000 
P@1	: 0.470 
R@1	: 0.470


N	: 50000 
P@3	: 0.212 
R@3	: 0.636


N	: 50000 
P@5	: 0.139 
R@5	: 0.694




## Fasttext with optimized parameters

In [50]:
model = ft.train_supervised(train_file, epoch=25, lr=0.2, wordNgrams=2)

Read 0M words
Number of words:  7793
Number of labels: 385
Progress: 100.0% words/sec/thread:   13571 lr:  0.000000 avg.loss:  2.055373 ETA:   0h 0m 0s 78.3% words/sec/thread:   13697 lr:  0.043406 avg.loss:  2.285883 ETA:   0h 0m 5sm 2s100.0% words/sec/thread:   13571 lr: -0.000005 avg.loss:  2.055373 ETA:   0h 0m 0s


In [51]:
res = model.test(test_file, k=1);print_res(res, k=1); print()
res = model.test(test_file, k=3);print_res(res, k=3); print()
res = model.test(test_file, k=5);print_res(res, k=5); print()

N	: 50000 
P@1	: 0.519 
R@1	: 0.519


N	: 50000 
P@3	: 0.235 
R@3	: 0.705


N	: 50000 
P@5	: 0.154 
R@5	: 0.770




In [61]:
model_path='/workspace/datasets/fasttext'
model.save_model(f'{model_path}/query_model.bin')

In [13]:
model_path='/workspace/datasets/fasttext'

In [14]:
query_model = ft.load_model(f'{model_path}/query_model.bin')



In [15]:
query_model.predict('satelite radio', k=3)

(('__label__pcmcat139900050002', '__label__abcat0202003', '__label__cat09000'),
 array([0.2879619 , 0.16255598, 0.09963983]))

In [20]:
cats, conf = query_model.predict('blue ray dvr', k=3)

In [21]:
cats, conf

(('__label__abcat0102003',
  '__label__abcat0515004',
  '__label__pcmcat205900050012'),
 array([0.46350497, 0.03990539, 0.03567554]))

In [17]:
query_model.predict('bose headphon', k=3)

(('__label__pcmcat144700050004', '__label__abcat0208011', '__label__cat09000'),
 array([0.76492441, 0.14439672, 0.03154249]))

## Train a classifier with min # of queries set to 100 (default)

In [9]:
import fasttext as ft

shuf_output_fname = '/workspace/datasets/shuf_labeled_query_data_100.txt'
output_file_name = '/workspace/datasets/labeled_query_data_100.txt'

train_file = '/workspace/datasets/queries.train'
test_file = '/workspace/datasets/queries.test'

# Shuffle the input before train/test split
!shuf {output_file_name} > {shuf_output_fname}

# Set aside first 50000 rows as train and last 10000 rows as test

!head -n 50000 {shuf_output_fname} > {train_file}
!tail -n 50000 {shuf_output_fname} > {test_file}

# There may be a problem with the way we are splitting here. Some of the categories in train may not be in test and vice versa. Ideally we should have used stratified train test split to ensure the category distribution is similar in train & test dataset.

model=ft.train_supervised(train_file)

res = model.test(test_file, k=1);print_res(res, k=1); print()
res = model.test(test_file, k=3);print_res(res, k=3); print()
res = model.test(test_file, k=5);print_res(res, k=5); print()

Read 0M words
Number of words:  7738
Number of labels: 874
Progress: 100.0% words/sec/thread:    6539 lr:  0.000000 avg.loss:  5.382691 ETA:   0h 0m 0s ETA:   0h 0m 0s


N	: 49987 
P@1	: 0.461 
R@1	: 0.461


N	: 49987 
P@3	: 0.205 
R@3	: 0.614


N	: 49987 
P@5	: 0.135 
R@5	: 0.676




In [11]:
## Train a classifier with min # of queries set to 100 (optimized)

In [10]:
import fasttext as ft

shuf_output_fname = '/workspace/datasets/shuf_labeled_query_data_100.txt'
output_file_name = '/workspace/datasets/labeled_query_data_100.txt'

train_file = '/workspace/datasets/queries.train'
test_file = '/workspace/datasets/queries.test'

# Shuffle the input before train/test split
!shuf {output_file_name} > {shuf_output_fname}

# Set aside first 50000 rows as train and last 10000 rows as test

!head -n 50000 {shuf_output_fname} > {train_file}
!tail -n 50000 {shuf_output_fname} > {test_file}

# There may be a problem with the way we are splitting here. Some of the categories in train may not be in test and vice versa. Ideally we should have used stratified train test split to ensure the category distribution is similar in train & test dataset.

model=ft.train_supervised(train_file, epoch=25, lr=0.2, wordNgrams=2)

res = model.test(test_file, k=1);print_res(res, k=1); print()
res = model.test(test_file, k=3);print_res(res, k=3); print()
res = model.test(test_file, k=5);print_res(res, k=5); print()

Read 0M words
Number of words:  7727
Number of labels: 871
Progress:  99.9% words/sec/thread:    6488 lr:  0.000107 avg.loss:  2.313887 ETA:   0h 0m 0s 59.8% words/sec/thread:    6491 lr:  0.080374 avg.loss:  2.917681 ETA:   0h 0m19s

N	: 49978 
P@1	: 0.511 
R@1	: 0.511


N	: 49978 
P@3	: 0.232 
R@3	: 0.695


N	: 49978 
P@5	: 0.151 
R@5	: 0.755




Progress: 100.0% words/sec/thread:    6478 lr:  0.000000 avg.loss:  2.313320 ETA:   0h 0m 0s


# Manual Query Classificaition

In [52]:
! head {test_file}

__label__cat09000 satelite radio
__label__abcat0901000 wine cool
__label__pcmcat144700050004 drake beat
__label__cat02607 music
__label__abcat0208024 chauvet
__label__abcat0106016 elit
__label__abcat0102007 blue ray dvr
__label__abcat0410010 canon camera flash
__label__cat02015 lion k
__label__pcmcat144700050004 bose headphon


In [3]:
!tail {train_file}

__label__abcat0201010 waterproof
__label__cat02015 lion k
__label__pcmcat248700050021 radio
__label__pcmcat183800050007 labtop pow
__label__pcmcat158900050018 projector
__label__pcmcat128500050004 speaker stand
__label__pcmcat209000050008 toshiba thr
__label__cat02015 thor
__label__abcat0101001 vizio
__label__cat02004 turntabl


In [1]:
#!python ../opensearch/categoryViewer.py

In [4]:
' __label__cat09000 '.strip()[len('__label__'):]

'cat09000'