# Level 1: Query Classification

In [1]:
from fastcore.foundation import L
from fastcore.basics import *
from fastcore.test import *
from nbdev.showdoc import show_doc

In [2]:
import re
import string
from nltk.stem import SnowballStemmer

# NOTE(review): `stemmer` is rebound to nltk.stem.PorterStemmer() in a later cell
# (In [6]). transform() looks the name up at call time, so this Snowball instance is
# only the one in effect if transform() is called before that cell runs.
stemmer = SnowballStemmer(language='english')
# Characters that get padded with spaces: / # \ - . : ' "
# The hyphen is escaped so it is matched literally; unescaped between two other
# characters it would define a range inside the character class instead.
_re_spec = re.compile(r"([/#\\\-.:'\"])")

def spec_add_spaces(t):
    r"""Add a space on each side of every " ' . : - / \ and # found in `t`.

    Every occurrence is padded independently, so adjacent special characters
    produce runs of spaces; nothing is collapsed or stripped here.
    Returns the padded string.
    """
    return _re_spec.sub(r' \1 ', t)

# `{2,}` matches two or more of the preceding space, greedily, so each run of
# spaces is consumed as a single match.
_re_space = re.compile(r' {2,}')

def rm_useless_spaces(t):
    "Collapse every run of two or more spaces in `t` into a single space"
    collapsed = _re_space.sub(' ', t)
    return collapsed

In [3]:
# (input, expected) pairs: the special character ends up with a space on both sides.
for raw, spaced in [
    (".nltk", ' . nltk'),
    ("nltk:", 'nltk : '),
    ("nltk'", "nltk ' "),
    ("nltk\"", 'nltk " '),
]:
    test_eq(spec_add_spaces(raw), spaced)

# Prune the Category Taxonomy

## Transform Queries

Convert the queries to lowercase, strip quotation marks (and perhaps other punctuation), and optionally implement other normalization, like using the nltk stemmer.

In [4]:
def rm_punct(t):
    "Replace each character of `t` that is in `string.punctuation` with a space"
    return ''.join(' ' if ch in string.punctuation else ch for ch in t)

def transform(query):
    """Normalize a raw query string.

    Steps, in order:
      1. pad the special characters with spaces and blank out all punctuation;
      2. drop every non-ASCII character (registered, trademark, copyright symbols, ...);
      3. lowercase and split on whitespace;
      4. discard purely numeric tokens;
      5. stem each remaining token with the module-level `stemmer`.

    Returns the tokens joined by single spaces, with no leading or trailing space.
    The result is an empty string when no token survives.
    """
    # replace_punct / remove punct
    query = spec_add_spaces(query)
    query = rm_punct(query)

    # fix registered, trademark, copyright symbol
    # remove non-ascii characters from query
    query = query.encode(encoding='ascii', errors='ignore').decode()

    # Tokenize only after all removals above: str.split() with no argument never
    # yields empty tokens, so gaps left by dropped characters cannot turn into
    # leading, trailing or doubled spaces in the result.
    tokens = [tok for tok in query.lower().split() if not tok.isnumeric()]

    # The stemmer works on one word at a time; passing it the whole query would
    # treat the query as a single word.
    return ' '.join(stemmer.stem(tok) for tok in tokens)

In [6]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv

# Useful if you want to perform stemming.
# NOTE(review): this rebinds the module-level `stemmer` that In [2] created as a
# SnowballStemmer. transform() reads the name at call time, so every call made after
# this cell stems with Porter. Confirm which stemmer is intended and keep only one.
import nltk
stemmer = nltk.stem.PorterStemmer()

# Category taxonomy (XML); parsed into a category -> parent table in a later cell.
categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

# Training queries (CSV; the `category` and `query` columns are read) and the
# destination file for the labeled query output.
queries_file_name = r'/workspace/datasets/train.csv'
output_file_name = r'/workspace/datasets/labeled_query_data.txt'

# NOTE(review): the command-line handling below is disabled, presumably because this
# runs as a notebook rather than a script; `min_queries` is assigned directly in a
# later cell instead. Confirm before re-enabling.
# parser = argparse.ArgumentParser(description='Process arguments.')
# general = parser.add_argument_group("general")
# general.add_argument("--min_queries", default=1,  help="The minimum number of queries per category label (default is 1)")
# general.add_argument("--output", default=output_file_name, help="the file to output to")

# args = parser.parse_args()
# output_file_name = args.output

# if args.min_queries:
#     min_queries = int(args.min_queries)
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'

Read the category tree from the categories XML file.

In [7]:
tree = ET.parse(categories_file_name)
root = tree.getroot()

# Parse the category XML file to map each category id to its parent category id in a dataframe.
# The last entry of a category's <path> is taken as the category itself and the entry
# just before it as its parent.
categories = []
parents = []
for child in root:
    cat_path = child.find('path')
    if cat_path is None:
        # No <path> element: nothing to derive a parent from.
        continue
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    if len(cat_path_ids) < 2:
        # A path with fewer than two entries has no parent entry to read, so
        # indexing [-2] would raise IndexError.
        continue
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame({'category': categories, 'parent': parents})

In [8]:
# Load the training queries, keep only the two columns used downstream, and drop
# every row whose category is not one of the non-root categories parsed from the XML.
df = (
    pd.read_csv(queries_file_name)
    .loc[:, ['category', 'query']]
    .loc[lambda frame: frame['category'].isin(categories)]
)

In [9]:
# IMPLEMENTED: normalize every query (punctuation, casing, stemming, ...) via transform().
df['query'] = df['query'].map(transform)

## Query Count of Leaf Categories

Compute the query count of all leaf categories.

What is a leaf?
A leaf is a category that is not the parent of any other category, so its id never appears in the `parent` column. For each category, check whether its id appears in the `parent` column and set that row's `is_leaf` value accordingly.

In [10]:
# Sanity check: one row per non-root category parsed from the XML.
test_eq(parents_df.shape[0], 4639)

In [11]:
# The root id occurs somewhere in the table ...
test_eq(root_category_id in parents_df.to_numpy(), True)
# ... but it is never one of the collected categories.
test_eq(root_category_id in categories, False)

In [16]:
# A leaf is a category whose id never occurs in the parent column.
# `roll_up` starts out False for every category.
parents_df = parents_df.assign(
    is_leaf=~parents_df['category'].isin(parents_df['parent']),
    roll_up=False,
)

In [22]:
def parent(cat):
    "Return the parent id of `cat` from `parents_df`, or the root category id when `cat` has no row there"
    matches = parents_df.loc[parents_df['category'] == cat, 'parent']
    if matches.empty:
        return root_category_id
    # Only the first matching row is used.
    return matches.iloc[0]

In [23]:
# Take the first leaf and confirm that no row names it as a parent.
aleaf = parents_df.loc[parents_df['is_leaf'], 'category'].iloc[0] # 'abcat0011001'
test_eq(len(parents_df[parents_df['parent'] == aleaf]), 0)

# Spot-check parent() on a known category/parent pair.
expected = 'abcat0011000'
test_eq(parent('abcat0011003'), expected)

In [24]:
# Number of training queries per category, as a {category: count} dict.
cat_counts = df['category'].value_counts().to_dict()

# The same counts as a two-column frame; from_records accepts a sequence of tuples.
data = L(cat_counts.items())
qcount = pd.DataFrame.from_records(data, columns=['cat', 'count'])

# Attach a query count to every category in parents_df, using 0 for categories
# that do not occur in the training data.
parent_counts = [cat_counts.get(cat_id, 0) for cat_id in parents_df['category']]
parents_df['counts'] = parent_counts

## Pruning: Rollup Categories to Min. # of Queries per Category

For example, if the minimum number of queries is 100 and there are only 99 queries mapped to “Best Buy > Musical Instruments > Guitars > Bass Guitars”, then those queries would be mapped to the parent category “Best Buy > Musical Instruments > Guitars”. If the minimum number of queries were higher (e.g., 1,000), then those queries might have to be rolled up to an even broader category, like “Best Buy > Musical Instruments”.

In [25]:
# Keep an independent copy of the normalized queries before the roll-up step
# rewrites df's category labels.
source = df.copy(deep=True)

In [26]:
# Categories with fewer queries than this are rolled up into their parent.
min_queries = 1_000

In [27]:
# IMPLEMENTED: Roll up categories to ancestors to satisfy the minimum number of queries per category.
def _categories_below_min(frame):
    "Return (per-category counts, the subset of categories that still has to be rolled up)"
    counts = frame.groupby('category').size().to_frame('count')
    # The root is excluded: parent() maps the root to itself, so an under-threshold
    # root can never be rolled up and would keep the loop below running forever.
    too_small = (counts['count'] < min_queries) & (counts.index != root_category_id)
    return counts, counts[too_small]

print(f"# of Categories before pruning : {len(df.category.value_counts())}")

# categories to be pruned
cat_count, prune_df = _categories_below_min(df)

while len(prune_df) > 0:
    # NOTE(review): categories are rolled up one at a time, in prune_df order. A child
    # handled before its own under-threshold parent therefore moves up two levels in
    # the same pass, one handled after it only one level. Kept as-is so existing
    # results do not change.
    for cat in prune_df.index:
        # Only the category column is rewritten; a query whose text happens to equal
        # a category id is left untouched.
        df.loc[df['category'] == cat, 'category'] = parent(cat)
    cat_count, prune_df = _categories_below_min(df)
    print(f"Categories to be pruned : {len(prune_df)}")
    
print()
print(f"# of Categories after pruning : {len(df.category.value_counts())}")

# of Categories before pruning : 1486
Categories to be pruned : 232
Categories to be pruned : 42
Categories to be pruned : 5
Categories to be pruned : 1
Categories to be pruned : 0

# of Categories after pruning : 386


In [29]:
# Build the fastText-style output in three steps:
#   1. prefix each category with '__label__';
#   2. keep only rows whose category is in the taxonomy;
#   3. join label and query with a single space.
df = (
    df.assign(label='__label__' + df['category'])
      .loc[lambda frame: frame['category'].isin(categories)]
      .assign(output=lambda frame: frame['label'] + ' ' + frame['query'])
)

In [31]:
# Write the `output` column only, one row per line, with no header, no index and no quoting.
df[['output']].to_csv(
    output_file_name,
    sep='|',
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

# fin