In [2]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv
import fasttext

In [11]:
# Useful if you want to perform stemming.
import nltk
stemmer = nltk.stem.PorterStemmer()

categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

queries_file_name = r'/workspace/datasets/train.csv'
output_file_name = r'/workspace/datasets/labeled_query_data.txt'


In [12]:
min_queries = 1

In [13]:
# The root category, named Best Buy with id cat00000, doesn't have a parent.
root_category_id = 'cat00000'


In [14]:
tree = ET.parse(categories_file_name)
root = tree.getroot()


In [15]:
# Parse the category XML file to map each category id to its parent category id in a dataframe.
categories = []
parents = []
for child in root:
    id = child.find('id').text
    print(id)
    cat_path = child.find('path')
    print(cat_path)
    cat_path_ids = [cat.find('id').text for cat in cat_path]
    print(cat_path_ids)
    leaf_id = cat_path_ids[-1]
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])
parents_df = pd.DataFrame(list(zip(categories, parents)), columns =['category', 'parent'])


abcat0010000
<Element 'path' at 0x7f4a632b46d0>
['cat00000', 'abcat0010000']
abcat0011000
<Element 'path' at 0x7f4a3798da40>
['cat00000', 'abcat0010000', 'abcat0011000']
abcat0011001
<Element 'path' at 0x7f4a379144a0>
['cat00000', 'abcat0010000', 'abcat0011000', 'abcat0011001']
abcat0011002
<Element 'path' at 0x7f4a37914f40>
['cat00000', 'abcat0010000', 'abcat0011000', 'abcat0011002']
abcat0011003
<Element 'path' at 0x7f4a37919180>
['cat00000', 'abcat0010000', 'abcat0011000', 'abcat0011003']
abcat0011004
<Element 'path' at 0x7f4a37919720>
['cat00000', 'abcat0010000', 'abcat0011000', 'abcat0011004']
abcat0012000
<Element 'path' at 0x7f4a37919f40>
['cat00000', 'abcat0010000', 'abcat0012000']
abcat0012001
<Element 'path' at 0x7f4a379173b0>
['cat00000', 'abcat0010000', 'abcat0012000', 'abcat0012001']
abcat0012002
<Element 'path' at 0x7f4a37917900>
['cat00000', 'abcat0010000', 'abcat0012000', 'abcat0012002']
abcat0012003
<Element 'path' at 0x7f4a37917e50>
['cat00000', 'abcat0010000', 'abcat

In [17]:
parent_mapper = parents_df.set_index('category').parent.to_dict()

In [34]:
# Read the training data into pandas, only keeping queries with non-root categories in our category tree.
df = pd.read_csv(queries_file_name)[['category', 'query']]
df = df[df['category'].isin(categories)]

In [36]:
df.category.value_counts()

cat02015              177638
abcat0101001           80213
pcmcat247400050000     79245
pcmcat209000050008     74258
pcmcat144700050004     43991
                       ...  
pcmcat230600050054         1
pcmcat230600050036         1
pcmcat221400050012         1
pcmcat254000050002         1
pcmcat221400050013         1
Name: category, Length: 1486, dtype: int64

In [19]:
def normalize_text(text):
    # convert to lower case
    text = text.lower()

    #get the stemm part
    text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
    
    return text

normalize_text('Televisiones Panasonic 50 pulgadas')

'television panason 50 pulgada'

In [20]:

    from collections import Counter
    categories_list=[]
    for category in df.category:
        categories_list.append(category)
        while category != 'cat00000':
            parent = parent_mapper[category]
            categories_list.append(parent)
            category = parent
    
    category_counter = dict(Counter(categories_list))

In [21]:
assert category_counter['cat00000'] == len(df)

In [22]:
def replace_category(category):
    cat_count = category_counter[category]
    while cat_count < min_queries and category != 'cat00000':
        category = parent_mapper[category]
        cat_count = category_counter[category]
    return category

def replace_with_parent(category, target_category):
    if category in target_category:
        return parent_mapper[category]
    else:
        return category

In [23]:
min_queries=2

In [24]:
from functools import partial

category_ranking = df[df.category!='cat00000'].category.value_counts()

categories_to_change = category_ranking[category_ranking < min_queries].index.to_list()
while len(categories_to_change) > 0:
    print('replace'+min_category_name)
    df.loc[:, 'category'] = df.category.apply(  partial(replace_with_parent, 
                                                target_category=categories_to_change)
                                            )

    category_ranking = df[df.category!='cat00000'].category.value_counts()
    categories_to_change = category_ranking[category_ranking < min_queries].index.to_list()
    

NameError: name 'min_category_name' is not defined

In [33]:
parent_mapper['abcat0100000']

'cat00000'

In [27]:
df.category.value_counts()


cat02015              177638
abcat0101001           80213
pcmcat247400050000     79245
pcmcat209000050008     74258
pcmcat144700050004     43991
                       ...  
pcmcat230600050054         1
pcmcat230600050036         1
pcmcat221400050012         1
pcmcat254000050002         1
pcmcat221400050013         1
Name: category, Length: 1486, dtype: int64

In [None]:
df.category.apply(replace_category).value_counts().index[-1]

'pcmcat153600050007'

In [None]:
import fasttext

In [3]:
model = fasttext.load_model('/workspace/datasets/week3/modelfastext_1m_proc25.bin')



In [15]:
model.predict('ball')

(('__label__cat02015',), array([0.93052971]))