In [1]:
import argparse
import os
import random
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np


In [2]:
# Folder that is scanned for the product *.xml files.
directory = "/workspace/search_with_machine_learning_course/data/pruned_products"
# Text file the normalized titles are written to.
output_file = "/workspace/datasets/fasttext/titles.txt"

In [3]:

# One or more characters that are not an ASCII digit or lowercase ASCII letter.
# Compiled once instead of on every call.
_NON_ALNUM_RUN = re.compile(r"[^0-9a-z]+")


def transform_training_data(name: str) -> str:
    """Normalize a product title into lowercase, space-separated ASCII tokens.

    The title is lowercased, then every run of characters outside ``[0-9a-z]``
    (punctuation, whitespace, tabs, newlines, and any non-ASCII character)
    is replaced by a single space.

    A single substitution is sufficient: once only ASCII alphanumerics and
    spaces remain, a further "replace non-word characters / tabs / newlines
    with a space" pass can only turn a space into a space.

    Args:
        name: Raw product title.

    Returns:
        The normalized title. Leading or trailing separator runs are kept as
        a single space (the result is not stripped).
    """
    return _NON_ALNUM_RUN.sub(" ", name.lower())


In [4]:
titles = []
# Fraction of product entries kept for the training corpus (~10%).
# This is the value the sampling check below actually uses.
sample_rate = 0.1
# Fixed seed so a fresh "Restart & Run All" draws the same sample.
random.seed(42)

# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the seeded sample differ between machines/runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".xml"):
        continue
    root = ET.parse(os.path.join(directory, filename)).getroot()
    for child in root:
        # Draw before inspecting the element so every child consumes exactly
        # one random number, whether or not it has a usable name.
        if random.random() > sample_rate:
            continue
        name_element = child.find('name')
        # Skip entries without a <name> element or with an empty one (text is None).
        if name_element is None or name_element.text is None:
            continue
        titles.append(transform_training_data(name_element.text))

In [5]:
# Wrap the sampled titles in a one-column frame (one title per row).
df = pd.DataFrame(data=titles)

In [6]:
# DataFrame.size is rows x columns; df is built from a flat list of strings,
# so with its single column this equals the number of sampled titles.
df.size

11455

In [7]:
# Preview the first three normalized titles.
df.head(3)

Unnamed: 0,0
0,linksys desktop 10 100 pci network card
1,pioneer 50w x 4 mosfet apple ipod hd radio rea...
2,sauder big screen tv home entertainment center


In [8]:
 import fasttext

In [9]:
# Write the titles, one row per line, to the shared `output_file` path rather
# than repeating the literal, so the corpus location is defined in one place.
# np.savetxt does not create missing directories, so make sure the parent exists.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
np.savetxt(output_file, df.values, fmt='%s')

In [10]:
# Baseline: skip-gram embeddings with fastText's default hyperparameters,
# trained on the corpus file referenced by `output_file`.
model = fasttext.train_unsupervised(input=output_file, model='skipgram')

Read 0M words
Number of words:  2357
Number of labels: 0
Progress: 100.0% words/sec/thread:   61208 lr:  0.000000 avg.loss:  2.759538 ETA:   0h 0m 0s100.1% words/sec/thread:   61232 lr: -0.000027 avg.loss:  2.759538 ETA:   0h 0m 0s


In [11]:
# Inspect the nearest neighbours of "iphone" for whichever model is currently
# bound to `model`; the result is a list of (similarity, word) tuples.
model.get_nearest_neighbors("iphone")

[(0.9867191314697266, 'phones'),
 (0.9778130650520325, 'phone'),
 (0.9739911556243896, 'earphones'),
 (0.9707987904548645, 'headphone'),
 (0.9559227228164673, 'headphones'),
 (0.9531955122947693, 'mobile'),
 (0.944339394569397, 'microphone'),
 (0.9416788220405579, 'microphones'),
 (0.9328449368476868, 'smartphones'),
 (0.9210118651390076, 'apple')]

In [12]:
# Retrain with 20 epochs and minCount=30, reading the corpus file referenced by
# `output_file`. Note: this rebinds `model`, replacing the previously trained one.
model = fasttext.train_unsupervised(input=output_file, model='skipgram', epoch=20, minCount=30)

Read 0M words
Number of words:  556
Number of labels: 0
Progress: 100.0% words/sec/thread:  122261 lr:  0.000000 avg.loss:  1.739114 ETA:   0h 0m 0s


In [13]:
# Same "iphone" probe as before, now against whichever model is currently
# bound to `model`; the result is a list of (similarity, word) tuples.
model.get_nearest_neighbors("iphone")

[(0.8645359873771667, '4s'),
 (0.7719354629516602, 'apple'),
 (0.7404201626777649, 'ipod'),
 (0.6853405237197876, 'phone'),
 (0.6746233701705933, 'shell'),
 (0.6469794511795044, 'at'),
 (0.6440153121948242, 'pangea'),
 (0.6385045051574707, 'ipad'),
 (0.6273909211158752, 'generation'),
 (0.6253519058227539, '4th')]

In [14]:
import json

# Query terms whose nearest neighbours are printed below.
products = [
    "iphone", "razr", "blackberry",
    "ipad", "lenovo", "thinkpad",
    "sony", "samsung", "sanyo",
    "xbox", "playstation", "nintendo", "mario",
]

for product in products:
    # Look up neighbours first so nothing is printed if the lookup fails.
    neighbors = model.get_nearest_neighbors(product)

    print("Product:", product)
    # Each neighbour is a (similarity, word) pair; print the word first.
    for similarity, word in neighbors:
        print(word, similarity)

    # print() appends its own newline, so this leaves two blank lines between products.
    print("\n")
    
    
    

Product: iphone
4s 0.8645359873771667
apple 0.7719354629516602
ipod 0.7404201626777649
phone 0.6853405237197876
shell 0.6746233701705933
at 0.6469794511795044
pangea 0.6440153121948242
ipad 0.6385045051574707
generation 0.6273909211158752
4th 0.6253519058227539


Product: razr
ray 0.7641907334327698
blu 0.7396956086158752
oven 0.7262805104255676
range 0.7146920561790466
hood 0.7011908292770386
cooktop 0.693711519241333
convection 0.6768168210983276
built 0.6743932366371155
dishwasher 0.6674908995628357
rack 0.6462004780769348


Product: blackberry
htc 0.8313192129135132
mobile 0.7836642861366272
phones 0.7506168484687805
motorola 0.7405303120613098
unlocked 0.7362288236618042
phone 0.6950275301933289
rocketfish 0.6755998730659485
leather 0.6607849597930908
cell 0.6602796912193298
shell 0.6408994793891907


Product: ipad
sleeve 0.7500386834144592
apple 0.7427287697792053
kindle 0.7178682684898376
generation 0.7024156451225281
cover 0.6983477473258972
targus 0.6829394698143005
accessorie

In [15]:
# Persist whichever model is currently bound to `model` to model_titles.bin.
# In notebook order that is the epoch=20 / minCount=30 run, not the baseline.
model.save_model("/workspace/search_with_machine_learning_course/model_titles.bin")

In [46]:
# Trained the phone model:
# ~/fastText-0.9.2/fasttext skipgram -input /workspace/datasets/fasttext/titles.txt -output /workspace/datasets/fasttext/phone_model -epoch 25