In [3]:
import argparse
import os
import random
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np


In [4]:
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products/'

In [5]:
def transform_name(product_name):
    product_name = product_name.lower()
    product_name = re.sub(r'[^\w]', ' ', product_name)
    return product_name

In [6]:
pairs = []
for filename in os.listdir(directory):
        if filename.endswith(".xml"):
            print("Processing %s" % filename)
            f = os.path.join(directory, filename)
            tree = ET.parse(f)
            root = tree.getroot()
            for child in root:
                if random.random() > 1.0:
                    continue
                # Check to make sure category name is valid
                if (child.find('name') is not None and child.find('name').text is not None and
                    child.find('categoryPath') is not None and len(child.find('categoryPath')) > 0 and
                    child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text is not None):
                      # Choose last element in categoryPath as the leaf categoryId
                      cat = child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text
                      # Replace newline chars with spaces so fastText doesn't complain
                      name = child.find('name').text.replace('\n', ' ')
                      #output.write("__label__%s %s\n" % (cat, transform_name(name)))
                      pair = {"category": cat, "name": "__label__%s %s\n" % (cat, transform_name(name))}
                      pairs.append(pair)

Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml


In [7]:
df = pd.DataFrame(pairs)

In [8]:
df['category'].unique().size

1952

In [9]:
# 1952 unique categories. yuck

sample = df.sample(frac = 0.05)


In [10]:
train = sample.head(5000)
test = sample.tail(5000)

In [11]:
train.head()

Unnamed: 0,category,name
64909,pcmcat152100050032,__label__pcmcat152100050032 american dj chas...
51916,pcmcat247400050000,__label__pcmcat247400050000 dell 14 inspiro...
103171,abcat0916002,__label__abcat0916002 smart choice 30 amp 3 ...
36460,pcmcat147400050016,__label__pcmcat147400050016 dr daisy mac wi...
30101,abcat0701003,__label__abcat0701003 halo wars platinum hits ...


In [14]:
 import fasttext

In [15]:
np.savetxt(r"/workspace/datasets/fasttext/products.train", train.values, fmt='%s')
np.savetxt(r"/workspace/datasets/fasttext/products.test", test.values, fmt='%s')

In [16]:
model = fasttext.train_supervised(input="/workspace/datasets/fasttext/products.train")

Read 0M words
Number of words:  7248
Number of labels: 1103
Progress: 100.0% words/sec/thread:   16080 lr:  0.000000 avg.loss: 16.653025 ETA:   0h 0m 0s100.0% words/sec/thread:   16082 lr: -0.000030 avg.loss: 16.653025 ETA:   0h 0m 0s


In [17]:
model.predict("apple ipad 2 16 GB")

(('__label__abcat0101001',), array([0.10049994]))

In [18]:
model.test("/workspace/datasets/fasttext/products.test")

(4956, 0.01916868442292171, 0.01916868442292171)

In [19]:
model = fasttext.train_supervised(input="/workspace/datasets/fasttext/products.train", lr=1.0, epoch=25, wordNgrams=2)

Read 0M words
Number of words:  7248
Number of labels: 1103
Progress: 100.0% words/sec/thread:   16696 lr:  0.000000 avg.loss:  1.910166 ETA:   0h 0m 0s 46.6% words/sec/thread:   17052 lr:  0.533974 avg.loss:  4.013265 ETA:   0h 0m 3s


In [20]:
model.test("/workspace/datasets/fasttext/products.test")

(4956, 0.9533898305084746, 0.9533898305084746)

In [21]:
model.save_model("/workspace/search_with_machine_learning_course/model_products.bin")