In [1]:
import argparse
import os
import random
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np


In [2]:
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products'
output_file = r'/workspace/datasets/fasttext/titles.txt'

In [3]:

def transform_training_data(name):
    name = name.lower()
    re.sub(r'[^\w]', ' ', name)

    return name.replace('\n', ' ')


In [None]:
titles = []
sample_rate = 1

for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        f = os.path.join(directory, filename)
        tree = ET.parse(f)
        root = tree.getroot()
        for child in root:
            if random.random() > 0.1:
                continue
            if (child.find('name') is not None and child.find('name').text is not None):
                name = transform_training_data(child.find('name').text)
                titles.append(name)

In [None]:
df = pd.DataFrame(titles)

In [6]:
df.size

11448

In [7]:
df.head(3)

Unnamed: 0,0
0,lg - nitro hd 4g mobile phone - black (at&t)
1,delorme - inreach 2-way satellite communicator...
2,memorex - slim jewel cases (30-pack) - assorted


In [8]:
 import fasttext

In [11]:
np.savetxt(r"/workspace/datasets/fasttext/titles.txt", df.values, fmt='%s')

In [12]:
model = fasttext.train_unsupervised(input="/workspace/datasets/fasttext/titles.txt", model='skipgram')

Read 0M words
Number of words:  2471
Number of labels: 0
Progress: 100.0% words/sec/thread:   64564 lr:  0.000000 avg.loss:  2.702547 ETA:   0h 0m 0s


In [53]:
model.get_nearest_neighbors("iphone")

[(0.5816837549209595, 'iphone®'),
 (0.56773841381073, 'apple®'),
 (0.5520244836807251, 'iphone®,'),
 (0.5504394769668579, '3gs'),
 (0.5457756519317627, '3g/3g'),
 (0.5380159020423889, 'iphoneâ®'),
 (0.5269085168838501, 'incase'),
 (0.506147027015686, '050'),
 (0.5046433210372925, 'elan'),
 (0.49777185916900635, 'apple')]

In [68]:
model = fasttext.train_unsupervised(input="/workspace/datasets/fasttext/titles.txt", model='skipgram', lr=0.05, epoch=25, minCount=10)

Read 0M words
Number of words:  364
Number of labels: 0
Progress: 100.0% words/sec/thread:  219558 lr:  0.000000 avg.loss:  2.199552 ETA:   0h 0m 0s


In [69]:
model.get_nearest_neighbors("iphone")

[(0.9258624315261841, 'iphone®'),
 (0.8793050646781921, 'apple®'),
 (0.8637220859527588, '3gs'),
 (0.8134299516677856, 'apple'),
 (0.8021686673164368, '4'),
 (0.799534261226654, '3g/3g'),
 (0.7931521534919739, 'incase'),
 (0.7760315537452698, '3g/3gs'),
 (0.7680350542068481, '3g'),
 (0.7558871507644653, 'luxe')]

In [47]:
import json

products = [
    "iphone",
    "samsung TV",
    "LCD TV",
    "bose soundbar"
]

for product in products:
    res = model.get_nearest_neighbors(product)
    
    for item in res:
        print(item[0])
    
    
    

0.9260885715484619
0.7860994338989258
0.7013247013092041
0.6790842413902283
0.6398615837097168
0.6282010674476624
0.6272938847541809
0.6165014505386353
0.6139096617698669
0.6094344258308411
0.9882978796958923
0.6968414783477783
0.5811522603034973
0.517500638961792
0.5130367279052734
0.49001213908195496
0.48963284492492676
0.47938981652259827
0.4731709957122803
0.47283244132995605
0.276762992143631
0.2600076496601105
0.25027015805244446
0.246711865067482
0.24017852544784546
0.23896008729934692
0.23508991301059723
0.2281198650598526
0.22698526084423065
0.2262667566537857
0.8449680209159851
0.821498692035675
0.8123121857643127
0.8023239374160767
0.7758312821388245
0.745962917804718
0.6897310614585876
0.6827021241188049
0.6805792450904846
0.6768749356269836


In [44]:
model.save_model("/workspace/search_with_machine_learning_course/model_titles.bin")

In [46]:
# Trained the phone model:
# ~/fastText-0.9.2/fasttext skipgram -input /workspace/datasets/fasttext/titles.txt -output /workspace/datasets/fasttext/phone_model -epoch 25