In [29]:
import argparse
import os
import random
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np


In [2]:
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products/'

In [3]:
def transform_name(product_name):
    product_name = product_name.lower()
    re.sub(r'[^\w]', ' ', product_name)
    return product_name

In [30]:
pairs = []
for filename in os.listdir(directory):
        if filename.endswith(".xml"):
            print("Processing %s" % filename)
            f = os.path.join(directory, filename)
            tree = ET.parse(f)
            root = tree.getroot()
            for child in root:
                if random.random() > 1.0:
                    continue
                # Check to make sure category name is valid
                if (child.find('name') is not None and child.find('name').text is not None and
                    child.find('categoryPath') is not None and len(child.find('categoryPath')) > 0 and
                    child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text is not None):
                      # Choose last element in categoryPath as the leaf categoryId
                      cat = child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text
                      # Replace newline chars with spaces so fastText doesn't complain
                      name = child.find('name').text.replace('\n', ' ')
                      #output.write("__label__%s %s\n" % (cat, transform_name(name)))
                      pair = {"category": cat, "name": "__label__%s %s\n" % (cat, transform_name(name))}
                      pairs.append(pair)

Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml


In [31]:
df = pd.DataFrame(pairs)

In [32]:
df['category'].unique().size

1952

In [46]:
# 1952 unique categories. yuck

train = df.sample(frac = 0.05)
test = df.sample(frac = 0.05)

In [34]:
train.size

11536

In [47]:
train.head()

Unnamed: 0,category,name
14665,pcmcat226200050022,__label__pcmcat226200050022 wild sales - los a...
77430,pcmcat180400050006,__label__pcmcat180400050006 canon eos 60d 18.0...
109698,abcat0705005,__label__abcat0705005 payout poker & casino - ...
71719,abcat0905001,"__label__abcat0905001 lg - kitchen series 24"" ..."
29874,abcat0807004,__label__abcat0807004 epson - claria high-capa...


In [36]:
 import fasttext

In [48]:
np.savetxt(r"/workspace/datasets/fasttext/products.train", train.values, fmt='%s')
np.savetxt(r"/workspace/datasets/fasttext/products.test", test.values, fmt='%s')

In [49]:
model = fasttext.train_supervised(input="/workspace/datasets/fasttext/products.train")

Read 0M words
Number of words:  9241
Number of labels: 1154
Progress: 100.0% words/sec/thread:   16984 lr:  0.000000 avg.loss: 14.700533 ETA:   0h 0m 0s


In [50]:
model.predict("apple ipad 2 16 GB")

(('__label__pcmcat180400050006',), array([0.00898728]))

In [51]:
model.test("/workspace/datasets/fasttext/products.test")

(5368, 0.05290611028315946, 0.05290611028315946)

In [53]:
model = fasttext.train_supervised(input="/workspace/datasets/fasttext/products.train", lr=1.0, epoch=25, wordNgrams=2)

Read 0M words
Number of words:  9241
Number of labels: 1154
Progress: 100.0% words/sec/thread:   16872 lr:  0.000000 avg.loss:  1.851670 ETA:   0h 0m 0s100.0% words/sec/thread:   16872 lr: -0.000219 avg.loss:  1.851670 ETA:   0h 0m 0s


In [54]:
model.test("/workspace/datasets/fasttext/products.test")

(5368, 0.7326751117734724, 0.7326751117734724)

In [55]:
model.save_model("/workspace/search_with_machine_learning_course/model_products.bin")