In [6]:
import argparse
import os
import random
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np


In [7]:
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products/'

In [8]:
def transform_name(product_name):
    product_name = product_name.lower()
    re.sub(r'[^\w]', ' ', product_name)
    return product_name

In [9]:
pairs = []
for filename in os.listdir(directory):
        if filename.endswith(".xml"):
            print("Processing %s" % filename)
            f = os.path.join(directory, filename)
            tree = ET.parse(f)
            root = tree.getroot()
            for child in root:
                if random.random() > 1.0:
                    continue
                # Check to make sure category name is valid
                if (child.find('name') is not None and child.find('name').text is not None and
                    child.find('categoryPath') is not None and len(child.find('categoryPath')) > 0 and
                    child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text is not None):
                      # Choose last element in categoryPath as the leaf categoryId
                      cat = child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text
                      # Replace newline chars with spaces so fastText doesn't complain
                      name = child.find('name').text.replace('\n', ' ')
                      #output.write("__label__%s %s\n" % (cat, transform_name(name)))
                      pair = {"category": cat, "name": "__label__%s %s\n" % (cat, transform_name(name))}
                      pairs.append(pair)

Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml


In [10]:
df = pd.DataFrame(pairs)

In [11]:
df['category'].unique().size

1952

In [12]:
# 1952 unique categories. yuck

sample = df.sample(frac = 0.05)


In [16]:
train = sample.head(5000)
test = sample.tail(5000)

In [20]:
train.head()

Unnamed: 0,category,name
89691,abcat0912025,__label__abcat0912025 simplehuman - size m sur...
37042,abcat0910004,__label__abcat0910004 ge - 7.0 cu. ft. super c...
97731,pcmcat226200050005,__label__pcmcat226200050005 tribeca - boston c...
107128,pcmcat250300050009,__label__pcmcat250300050009 huffy - granite 24...
110620,pcmcat161500050000,__label__pcmcat161500050000 samsung - refurbis...


In [21]:
 import fasttext

In [22]:
np.savetxt(r"/workspace/datasets/fasttext/products.train", train.values, fmt='%s')
np.savetxt(r"/workspace/datasets/fasttext/products.test", test.values, fmt='%s')

In [23]:
model = fasttext.train_supervised(input="/workspace/datasets/fasttext/products.train")

Read 0M words
Number of words:  8625
Number of labels: 1109
Progress: 100.0% words/sec/thread:   16344 lr:  0.000000 avg.loss: 15.361784 ETA:   0h 0m 0s


In [24]:
model.predict("apple ipad 2 16 GB")

(('__label__abcat0101001',), array([0.00784811]))

In [25]:
model.test("/workspace/datasets/fasttext/products.test")

(4940, 0.015587044534412956, 0.015587044534412956)

In [26]:
model = fasttext.train_supervised(input="/workspace/datasets/fasttext/products.train", lr=1.0, epoch=25, wordNgrams=2)

Read 0M words
Number of words:  8625
Number of labels: 1109
Progress: 100.0% words/sec/thread:   17265 lr:  0.000000 avg.loss:  2.049963 ETA:   0h 0m 0s


In [27]:
model.test("/workspace/datasets/fasttext/products.test")

(4940, 0.952834008097166, 0.952834008097166)

In [30]:
model.save_model("/workspace/search_with_machine_learning_course/model_products.bin")