# Classifying Product Names into Cateogories

## 0. Setup

In [1]:
from fastcore.foundation import L
from fastcore.test import *

In [2]:
import fasttext as ft

Data Prerequisite:
    `gunzip /workspace/search_with_machine_learning_course/data/*/*.xml.gz`

In [3]:
def print_res(res, k=1): 
    print(f"N\t: {res[0]} \nP@{k}\t: {res[1]:.3f} \nR@{k}\t: {res[2]:.3f}\n")

In [38]:
import re
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language='english')
_re_spec = re.compile(r'([/#\\-\\.:])')

def spec_add_spaces(t):
    "Add spaces around . : - / \ and #"
    return _re_spec.sub(r' \1 ', t)

# Causes the resulting RE to match from m to n repetitions of the preceding RE, attempting to match as many repetitions as possible.
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return _re_space.sub(' ', t)

def transform_name(product_name):
    "Transform product name by replacing punctuations, removing multiple spaces, stemming, lower casing"
    name = product_name

    # replace_punct
    name = spec_add_spaces(name)
    # replace multiple spaces
    name = rm_useless_spaces(name)

    # add stemmer
    name = stemmer.stem(name)
    return name.lower()

## 1. Sample Rate 0.1

In [1]:
# Process the pruned_products (10% of data) to generate training data
!python createContentTrainingData.py --sample_rate 0.1

Output is as follows
```
Writing results to /workspace/datasets/fasttext/output.fasttext
Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml
```

In [4]:
!head /workspace/datasets/fasttext/output.fasttext

__label__pcmcat237000050016 DeLorme - inReach 2-Way Satellite Communicator for DeLorme Earthmate PN-60w GPS
__label__pcmcat193100050014 Sungale - Beam E-Reader - White
__label__pcmcat186400050002 Olympus - X-560WP 10.0-Megapixel Digital Camera - Red
__label__pcmcat258900050010 Jura - ENA 9 One Touch Cappuccino and Latte Macchiato Maker - Silver
__label__pcmcat258900050010 Jura - Impressa J9 Cappuccino, Latte Macchiato and Café Latte Maker - Silver
__label__pcmcat258900050007 Capresso - 10-Cup Coffeemaker - Black/Silver
__label__pcmcat174700050005 Mystery Masterpiece: The Moonstone - Windows
__label__pcmcat174700050005 Kitchen Brigade - Windows
__label__pcmcat158900050018 3M - Mobile SVGA LCOS Projector
__label__cat09000 Best Buy GC - $200 Techno Twinkle Gift Card


We can confirm the generated output is in the format suitable for fasttext`__label__<label> <product_name>`.

In [None]:
# Shuffle the input before train/test split
!shuf /workspace/datasets/fasttext/output.fasttext > /workspace/datasets/fasttext/shuf_output.fasttext

In [3]:
!wc -l /workspace/datasets/fasttext/shuf_output.fasttext

11764 /workspace/datasets/fasttext/shuf_output.fasttext


In [4]:
!head -n 10000 /workspace/datasets/fasttext/shuf_output.fasttext > /workspace/datasets/fasttext/products.train
!tail -n 1764 /workspace/datasets/fasttext/shuf_output.fasttext > /workspace/datasets/fasttext/products.test

In [4]:
train_file = '/workspace/datasets/fasttext/products.train'
test_file = '/workspace/datasets/fasttext/products.test'

In [41]:
ft.train_supervised?

[0;31mSignature:[0m [0mft[0m[0;34m.[0m[0mtrain_supervised[0m[0;34m([0m[0;34m*[0m[0mkargs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Train a supervised model and return a model object.

input must be a filepath. The input text does not need to be tokenized
as per the tokenize function, but it must be preprocessed and encoded
as UTF-8. You might want to consult standard preprocessing scripts such
as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html

The input file must must contain at least one label per line. For an
example consult the example datasets which are part of the fastText
repository such as the dataset pulled by classification-example.sh.
[0;31mFile:[0m      ~/.pyenv/versions/search_with_ml_week3/lib/python3.9/site-packages/fasttext/FastText.py
[0;31mType:[0m      function


In [6]:
model = ft.train_supervised(train_file)

Read 0M words
Number of words:  11080
Number of labels: 1356
Progress: 100.0% words/sec/thread:   12354 lr:  0.000000 avg.loss: 13.697918 ETA:   0h 0m 0s


In [11]:
#print(model.words)
#print(model.labels)
L(model.words), L(model.labels)

((#11080) ['-','</s>','for','Black','with','and','Digital','Memory','Case','/'...],
 (#1356) ['__label__abcat0101001','__label__pcmcat180400050006','__label__abcat0401004','__label__pcmcat247400050000','__label__cat09000','__label__abcat0901005','__label__abcat0905001','__label__pcmcat171900050029','__label__abcat0515028','__label__pcmcat151600050006'...])

There are 11080 words present from this 10% of data and around 1356 categories present.

In [12]:
model.predict('GE - 11.9 Cu. Ft. Top-Mount Refrigerator - Bisque-on-Bisque')

(('__label__abcat0901005',), array([0.35519338]))

In [13]:
model.test?

[0;31mSignature:[0m [0mmodel[0m[0;34m.[0m[0mtest[0m[0;34m([0m[0mpath[0m[0;34m,[0m [0mk[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0mthreshold[0m[0;34m=[0m[0;36m0.0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Evaluate supervised model using file given by path
[0;31mFile:[0m      ~/.pyenv/versions/search_with_ml_week3/lib/python3.9/site-packages/fasttext/FastText.py
[0;31mType:[0m      method


In [25]:
res = model.test(test_file, k=1);print_res(res)

N	: 1719 
P@1	: 0.147 
R@1	: 0.147


In [26]:
res = model.test(test_file, k=5);print_res(res)

N	: 1719 
P@1	: 0.047 
R@1	: 0.237


In [27]:
res = model.test(test_file, k=10);print_res(res)

N	: 1719 
P@1	: 0.028 
R@1	: 0.276


As we can see the recall is increasing as we increased but precision is dropping.

### Increase epochs

In [44]:
# Reference: https://fasttext.cc/docs/en/options.html
model=ft.train_supervised(train_file, epoch=25)

Read 0M words
Number of words:  11080
Number of labels: 1356
Progress: 100.0% words/sec/thread:   12534 lr:  0.000000 avg.loss:  6.282897 ETA:   0h 0m 0s


In [45]:
test_eq(model.epoch, 25) #uses the epoch specified

In [48]:
[print_res(model.test(test_file, k)) for k in [1, 5, 10]]

N	: 1719 
P@1	: 0.450 
R@1	: 0.450

N	: 1719 
P@1	: 0.131 
R@1	: 0.657

N	: 1719 
P@1	: 0.072 
R@1	: 0.721



[None, None, None]

### Increase lr 

In [50]:
# Default lr
test_eq(model.lr, 0.1)

In [52]:
model=ft.train_supervised(train_file, lr=0.3)

Read 0M words
Number of words:  11080
Number of labels: 1356
Progress: 100.0% words/sec/thread:   12473 lr:  0.000000 avg.loss:  8.922214 ETA:   0h 0m 0s


In [54]:
test_eq(model.epoch, 5) #Default epoch
test_eq(model.lr, 0.3) # specified lr

In [55]:
[print_res(model.test(test_file, k)) for k in [1, 5, 10]]

N	: 1719 
P@1	: 0.350 
R@1	: 0.350

N	: 1719 
P@1	: 0.106 
R@1	: 0.532

N	: 1719 
P@1	: 0.059 
R@1	: 0.585



[None, None, None]

In [56]:
model=ft.train_supervised(train_file, lr=1.0)

Read 0M words
Number of words:  11080
Number of labels: 1356
Progress: 100.0% words/sec/thread:   12777 lr:  0.000000 avg.loss:  4.663769 ETA:   0h 0m 0s


In [57]:
test_eq(model.lr, 1.0)

In [58]:
[print_res(model.test(test_file, k)) for k in [1, 5, 10]]

N	: 1719 
P@1	: 0.581 
R@1	: 0.581

N	: 1719 
P@1	: 0.155 
R@1	: 0.776

N	: 1719 
P@1	: 0.082 
R@1	: 0.821



[None, None, None]

### Introducing bigrams with wordNgrams

In [59]:
model = ft.train_supervised(train_file, wordNgrams=2)

Read 0M words
Number of words:  11080
Number of labels: 1356
Progress: 100.0% words/sec/thread:   12354 lr:  0.000000 avg.loss: 14.612494 ETA:   0h 0m 0s100.1% words/sec/thread:   12355 lr: -0.000056 avg.loss: 14.612494 ETA:   0h 0m 0s


In [60]:
test_eq(model.wordNgrams, 2)

In [61]:
[print_res(model.test(test_file, k)) for k in [1, 5, 10]]

N	: 1719 
P@1	: 0.056 
R@1	: 0.056

N	: 1719 
P@1	: 0.022 
R@1	: 0.108

N	: 1719 
P@1	: 0.014 
R@1	: 0.144



[None, None, None]

### Process name_strings

In [5]:
with open(train_file) as f: train_data = f.readlines()

In [106]:
with open(test_file) as f: test_data = f.readlines()

In [88]:
import re
#_re_spec = re.compile(r'([.\\!?,\'/\(\)])')
_re_spec = re.compile(r'([/#\\-\\.:])')

def spec_add_spaces(t):
    #"Add spaces around \!?,'/()"
    "Add spaces around : - / \ and #"
    return _re_spec.sub(r' \1 ', t)

# Causes the resulting RE to match from m to n repetitions of the preceding RE, attempting to match as many repetitions as possible.
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return _re_space.sub(' ', t)

In [89]:
test_eq(spec_add_spaces(".nltk"), ' . nltk')
test_eq(spec_add_spaces("nltk:"), 'nltk : ')

In [90]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language='english')

In [91]:
stemmer.stem('star wars: the new droid army - game boy advance\n')

'star wars: the new droid army - game boy advance\n'

In [96]:
def process_name(x:str):
    tmp = x.split(' ')
    cat, name = tmp[0], ' '.join(tmp[1:])
    
    # replace_punct
    name = spec_add_spaces(name)
    # replace multiple spaces
    name = rm_useless_spaces(name)
    
    ## add stemmer
    name = stemmer.stem(name)
    return f"{cat} {name.lower()}"

In [97]:
process_name('__label__pcmcat196500050012 Sigma - 120-300mm f/22-2.8 Zoom Lens for Nikon DSLR Cameras')

'__label__pcmcat196500050012 sigma - 120-300mm f / 22-2 . 8 zoom lens for nikon dslr camera'

In [98]:
processed_train_data = L(train_data).map(process_name)

In [110]:
processed_test_data = L(test_data).map(process_name)

In [100]:
train_file, test_file

('/workspace/datasets/fasttext/products.train',
 '/workspace/datasets/fasttext/products.test')

In [104]:
with open('/workspace/datasets/fasttext/products_proc.train', 'w') as f: f.writelines(list(processed_train_data))

In [111]:
with open('/workspace/datasets/fasttext/products_proc.test', 'w') as f: f.writelines(list(processed_test_data))

#### Train with default fasttext params after tranform_name

In [114]:
train_file = '/workspace/datasets/fasttext/products_proc.train'
test_file = '/workspace/datasets/fasttext/products_proc.test'

In [115]:
model = ft.train_supervised(train_file)

Read 0M words
Number of words:  9989
Number of labels: 1356
Progress: 100.0% words/sec/thread:   13529 lr:  0.000000 avg.loss: 13.109398 ETA:   0h 0m 0s


In [118]:
[print_res(model.test(test_file, k)) for k in [1, 5, 10]]

N	: 1719 
P@1	: 0.138 
R@1	: 0.138

N	: 1719 
P@1	: 0.051 
R@1	: 0.255

N	: 1719 
P@1	: 0.029 
R@1	: 0.289



[None, None, None]

#### Train with optimized fasttext params after tranform_name

In [119]:
model = ft.train_supervised(train_file, epoch=25, lr=1.0, wordNgrams=2)

Read 0M words
Number of words:  9989
Number of labels: 1356
Progress: 100.0% words/sec/thread:   12647 lr:  0.000000 avg.loss:  1.256690 ETA:   0h 0m 0s


In [120]:
[print_res(model.test(test_file, k)) for k in [1, 5, 10]]

N	: 1719 
P@1	: 0.618 
R@1	: 0.618

N	: 1719 
P@1	: 0.161 
R@1	: 0.805

N	: 1719 
P@1	: 0.085 
R@1	: 0.852



[None, None, None]

Precision jumped from 0.138 to 0.618

## 2. Enforce min N of products per category (Sample Rate 1.0)

In [4]:
import os
import random
import xml.etree.ElementTree as ET
from pathlib import Path

In [38]:
import re
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language='english')
_re_spec = re.compile(r'([/#\\-\\.:])')

def spec_add_spaces(t):
    "Add spaces around . : - / \ and #"
    return _re_spec.sub(r' \1 ', t)

# Causes the resulting RE to match from m to n repetitions of the preceding RE, attempting to match as many repetitions as possible.
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return _re_space.sub(' ', t)

def transform_name(product_name):
    "Transform product name by replacing punctuations, removing multiple spaces, stemming, lower casing"
    name = product_name

    # replace_punct
    name = spec_add_spaces(name)
    # replace multiple spaces
    name = rm_useless_spaces(name)

    # add stemmer
    name = stemmer.stem(name)
    return name.lower()

In [38]:
# Directory for product data
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products/'
output_file = r'/workspace/datasets/fasttext/output.fasttext'

In [7]:
sample_rate=1.0

items=[]
for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        print("Processing %s" % filename)
        f = os.path.join(directory, filename)
        tree = ET.parse(f)
        root = tree.getroot()
        for child in root:
            if random.random() > sample_rate:
                continue
            # Check to make sure category name is valid
            if (child.find('name') is not None and child.find('name').text is not None and
                child.find('categoryPath') is not None and len(child.find('categoryPath')) > 0 and
                child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text is not None):
                # Choose last element in categoryPath as the leaf categoryId
                cat = child.find('categoryPath')[len(child.find('categoryPath')) - 1][0].text
                # Replace newline chars with spaces so fastText doesn't complain
                name = child.find('name').text.replace('\n', ' ')
                items.append((cat, transform_name(name)))

In [8]:
import pandas as pd

df = pd.DataFrame(items, columns=['cat', 'name'])

In [9]:
test_eq(len(df), 115358)

In [69]:
def filter_by_min_products(df, minimum=50):
    tmp = pd.DataFrame(df.cat.value_counts())
    min_filter = tmp[tmp['cat'] > minimum]
    return df[df.cat.isin(min_filter.index)]

In [11]:
filtered_df = filter_by_min_products(df, minimum=50)

In [12]:
test_eq(len(filtered_df), 92726)

In [70]:
def write_output(df, output_file = r'/workspace/datasets/fasttext/output.fasttext', minimum=50):
    filtered_df = filter_by_min_products(df, minimum)
    with open(output_file, 'w') as output:
        for _, item in filtered_df.iterrows():
            output.write("__label__%s %s\n" % (item['cat'], item['name']))

### Threshold as 50

In [69]:
write_output(df, r'/workspace/datasets/fasttext/output_50.fasttext', 50)

In [85]:
path='/workspace/datasets/fasttext/output_50.fasttext'
with open(path) as f: data = f.readlines()

In [86]:
random.shuffle(data)
train = data[0:int(len(data) * 0.9)]
test = data[int(len(data) * 0.9):]


In [88]:
len(train), len(test)

(83453, 9273)

In [87]:
train_file = '/workspace/datasets/fasttext/products.train'
test_file = '/workspace/datasets/fasttext/products.test'
with open(train_file, 'w') as f: f.writelines(train)
with open(test_file, 'w') as f: f.writelines(test)

In [89]:
#using optimized params with threshold 50
model = ft.train_supervised(train_file, epoch=25, lr=1.0, wordNgrams=2)

Read 1M words
Number of words:  25722
Number of labels: 512
Progress: 100.0% words/sec/thread:   16468 lr:  0.000000 avg.loss:  0.257775 ETA:   0h 0m 0s 0m42s100.0% words/sec/thread:   16468 lr: -0.000005 avg.loss:  0.257775 ETA:   0h 0m 0s


In [91]:
[print_res(model.test(test_file, k), k) for k in [1, 5, 10]]

N	: 9273 
P@1	: 0.843 
R@1	: 0.843

N	: 9273 
P@5	: 0.195 
R@5	: 0.977

N	: 9273 
P@10	: 0.099 
R@10	: 0.986



[None, None, None]

### Threshold as 100

In [14]:
write_output(df, r'/workspace/datasets/fasttext/output_100.fasttext', 100)

In [15]:
path='/workspace/datasets/fasttext/output_100.fasttext'
with open(path) as f: data = f.readlines()

In [16]:
len(df), len(data)

(115358, 74801)

In [17]:
random.shuffle(data)
train = data[0:int(len(data) * 0.9)]
test = data[int(len(data) * 0.9):]

In [18]:
train_file = '/workspace/datasets/fasttext/products.train'
test_file = '/workspace/datasets/fasttext/products.test'
with open(train_file, 'w') as f: f.writelines(train)
with open(test_file, 'w') as f: f.writelines(test)

In [19]:
#using optimized params with threshold 100
model = ft.train_supervised(train_file, epoch=25, lr=1.0, wordNgrams=2)

Read 0M words
Number of words:  21404
Number of labels: 262
Progress: 100.0% words/sec/thread:   53646 lr:  0.000000 avg.loss:  0.184636 ETA:   0h 0m 0s


In [20]:
[print_res(model.test(test_file, k), k) for k in [1, 5, 10]]

N	: 7481 
P@1	: 0.884 
R@1	: 0.884

N	: 7481 
P@5	: 0.197 
R@5	: 0.986

N	: 7481 
P@10	: 0.099 
R@10	: 0.991



[None, None, None]

### Threshold as 200

In [21]:
write_output(df, r'/workspace/datasets/fasttext/output_200.fasttext', 200)

In [22]:
path='/workspace/datasets/fasttext/output_200.fasttext'
with open(path) as f: data = f.readlines()

In [23]:
len(df), len(data)

(115358, 53654)

In [24]:
random.shuffle(data)
train = data[0:int(len(data) * 0.9)]
test = data[int(len(data) * 0.9):]

In [25]:
train_file = '/workspace/datasets/fasttext/products.train'
test_file = '/workspace/datasets/fasttext/products.test'
with open(train_file, 'w') as f: f.writelines(train)
with open(test_file, 'w') as f: f.writelines(test)

In [26]:
#using optimized params with threshold 100
model = ft.train_supervised(train_file, epoch=25, lr=1.0, wordNgrams=2)

Read 0M words
Number of words:  16045
Number of labels: 113
Progress: 100.0% words/sec/thread:   82599 lr:  0.000000 avg.loss:  0.112276 ETA:   0h 0m 0s


In [27]:
[print_res(model.test(test_file, k), k) for k in [1, 5, 10]]

N	: 5366 
P@1	: 0.926 
R@1	: 0.926

N	: 5366 
P@5	: 0.199 
R@5	: 0.994

N	: 5366 
P@10	: 0.100 
R@10	: 0.998



[None, None, None]

## 3. Replace leaf category with the ancestors (Sample Rate 1.0)



In [41]:
import os
import random
import xml.etree.ElementTree as ET
from pathlib import Path

# Directory for product data
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products/'
output_file = r'/workspace/datasets/fasttext/output.fasttext'

In [6]:
import xml.etree.ElementTree as ET

# Location for category data
categoriesFilename = '/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

```
<categoryPath>
            <category>
                 <id>cat00000</id>
                 <name>Best Buy</name>
            </category>
            <category>
                 <id>abcat0800000</id>
                 <name>Mobile Phones</name>
            </category>
            <category>
                 <id>pcmcat209400050001</id>
                 <name>All Mobile Phones with Plans</name>
            </category>
       </categoryPath>
```

In [35]:
xml = '''
<categoryPath>
            <category>
                 <id>cat00000</id>
                 <name>Best Buy</name>
            </category>
            <category>
                 <id>abcat0800000</id>
                 <name>Mobile Phones</name>
            </category>
            <category>
                 <id>pcmcat209400050001</id>
                 <name>All Mobile Phones with Plans</name>
            </category>
       </categoryPath>'''

tree = ET.fromstring(xml)

names = []
cat = ''
max_depth = 1
for depth, child in enumerate(tree):
    parent = cat
    cat = child.find('name').text if depth <= max_depth else parent
    names.append(cat)

In [65]:
test_eq(names, ['Best Buy', 'Mobile Phones', 'Mobile Phones'])

In [59]:
filenames = [filename for filename in os.listdir(directory) if filename.endswith(".xml")]

def process_file(filename, sample_rate=1.0, max_depth=None):
    "Replacing the categories with ancestors at max_depth 2 or 3 or" 
    items = []
    print("Processing %s" % filename)
    f = os.path.join(directory, filename)
    tree = ET.parse(f)
    root = tree.getroot()
    for child in root:
        if random.random() > sample_rate:
            continue
        
        # Check to make sure category name is valid
        cname, ccatPath = child.find('name'), child.find('categoryPath')
        
        if (not max_depth and
            cname is not None and cname.text is not None and
            ccatPath is not None and len(ccatPath) > 0 and
            ccatPath[len(ccatPath) - 1][0].text is not None):
            # Choose last element in categoryPath as the leaf categoryId
            cat = ccatPath[len(ccatPath) - 1][0].text
            # Replace newline chars with spaces so fastText doesn't complain
            name = cname.text.replace('\n', ' ')
 
        elif (cname is not None and cname.text is not None and
            ccatPath is not None and len(ccatPath) > 0):
            catStr = ''
            categories = []
            # replace the leaf categories with ancestors set at max depth
            for depth, cchild in enumerate(ccatPath):
                parent = catStr
                catStr = cchild[0].text if depth <= max_depth else parent
                categories.append(catStr)
            
            # Choose the last element since they are replaced with its ancestors
            #print(categories)
            cat = categories[-1]
            
            # Replace newline chars with spaces so fastText doesn't complain
            name = cname.text.replace('\n', ' ')
        items.append((cat, transform_name(name)))

    return items

In [68]:
import pandas as pd

all_items = []

for each in filenames:
    all_items.extend(process_file(each))

## using the leaf categoryId
df = pd.DataFrame(all_items, columns=['cat', 'name'])

### Ancestors at depth 2

In [72]:
all_items = []

for each in filenames:
    all_items.extend(process_file(each, max_depth=2))

## using the leaf categoryOd
df = pd.DataFrame(all_items, columns=['cat', 'name'])

Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml


In [73]:
write_output(df, r'/workspace/datasets/fasttext/output_depth2.fasttext')

In [74]:
path='/workspace/datasets/fasttext/output_depth2.fasttext'
with open(path) as f: data = f.readlines()

random.shuffle(data)
train = data[0:int(len(data) * 0.9)]
test = data[int(len(data) * 0.9):]

train_file = '/workspace/datasets/fasttext/products.train'
test_file = '/workspace/datasets/fasttext/products.test'
with open(train_file, 'w') as f: f.writelines(train)
with open(test_file, 'w') as f: f.writelines(test)

#using optimized params with threshold 50
model = ft.train_supervised(train_file, epoch=25, lr=1.0, wordNgrams=2)

[print_res(model.test(test_file, k), k) for k in [1, 5, 10]]

Read 1M words
Number of words:  29979
Number of labels: 124
Progress: 100.0% words/sec/thread:   73803 lr:  0.000000 avg.loss:  0.071780 ETA:   0h 0m 0s


N	: 11445 
P@1	: 0.947 
R@1	: 0.947

N	: 11445 
P@5	: 0.198 
R@5	: 0.991

N	: 11445 
P@10	: 0.099 
R@10	: 0.995



[None, None, None]

### Ancestors at depth 3

In [75]:
all_items = []

for each in filenames:
    all_items.extend(process_file(each, max_depth=3))

## using the leaf categoryOd
df = pd.DataFrame(all_items, columns=['cat', 'name'])

Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml


In [76]:
write_output(df, r'/workspace/datasets/fasttext/output_depth3.fasttext')

In [77]:
path='/workspace/datasets/fasttext/output_depth3.fasttext'
with open(path) as f: data = f.readlines()

random.shuffle(data)
train = data[0:int(len(data) * 0.9)]
test = data[int(len(data) * 0.9):]

train_file = '/workspace/datasets/fasttext/products.train'
test_file = '/workspace/datasets/fasttext/products.test'
with open(train_file, 'w') as f: f.writelines(train)
with open(test_file, 'w') as f: f.writelines(test)

#using optimized params with threshold 50
model = ft.train_supervised(train_file, epoch=25, lr=1.0, wordNgrams=2)

[print_res(model.test(test_file, k), k) for k in [1, 5, 10]]

Read 1M words
Number of words:  28326
Number of labels: 346
Progress: 100.0% words/sec/thread:   43070 lr:  0.000000 avg.loss:  0.135904 ETA:   0h 0m 0s 59.0% words/sec/thread:   42863 lr:  0.410221 avg.loss:  0.209755 ETA:   0h 0m18s100.0% words/sec/thread:   43070 lr: -0.000013 avg.loss:  0.135904 ETA:   0h 0m 0s


N	: 10809 
P@1	: 0.920 
R@1	: 0.920

N	: 10809 
P@5	: 0.197 
R@5	: 0.984

N	: 10809 
P@10	: 0.099 
R@10	: 0.990



[None, None, None]