In [1]:
import argparse
import os
import random
import re
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np


In [2]:
directory = r'/workspace/search_with_machine_learning_course/data/pruned_products'
output_file = r'/workspace/datasets/fasttext/titles.txt'

In [107]:

def transform_training_data(name):
    name = name.lower()
    name = re.sub("[^0-9a-zA-Z]+", " ", name)
    name = re.sub(r'[^\w]|\t|\n', ' ', name)
    
    return name


In [108]:
titles = []
sample_rate = 1

for filename in os.listdir(directory):
    if filename.endswith(".xml"):
        f = os.path.join(directory, filename)
        tree = ET.parse(f)
        root = tree.getroot()
        for child in root:
            if random.random() > 0.1:
                continue
            if (child.find('name') is not None and child.find('name').text is not None):
                name = transform_training_data(child.find('name').text)
                titles.append(name)

In [109]:
df = pd.DataFrame(titles)

In [110]:
df.size

11470

In [111]:
df.head(3)

Unnamed: 0,0
0,hp 60 ink cartridge twin pack black
1,pantech pocket 4g mobile phone black at t
2,super mario 3d land game guide nintendo 3ds


In [112]:
 import fasttext

In [113]:
np.savetxt(r"/workspace/datasets/fasttext/titles.txt", df.values, fmt='%s')

In [114]:
model = fasttext.train_unsupervised(input="/workspace/datasets/fasttext/titles.txt", model='skipgram')

Read 0M words
Number of words:  2332
Number of labels: 0
Progress: 100.0% words/sec/thread:   52034 lr:  0.000000 avg.loss:  2.601377 ETA:   0h 0m 0s


In [115]:
model.get_nearest_neighbors("iphone")

[(0.9850426316261292, 'phones'),
 (0.9826255440711975, 'phone'),
 (0.9485659599304199, 'mobile'),
 (0.941804826259613, 'microphone'),
 (0.9377111196517944, 'motorola'),
 (0.9341446161270142, 'microphones'),
 (0.9266576170921326, 'smartphones'),
 (0.925964891910553, 'silicone'),
 (0.92539381980896, 'zone'),
 (0.9229100942611694, 'apple')]

In [132]:
model = fasttext.train_unsupervised(input="/workspace/datasets/fasttext/titles.txt", model='skipgram', epoch=20, minCount=30)

Read 0M words
Number of words:  545
Number of labels: 0
Progress: 100.0% words/sec/thread:  120485 lr:  0.000000 avg.loss:  1.738667 ETA:   0h 0m 0s


In [133]:
model.get_nearest_neighbors("iphone")

[(0.7442602515220642, 'apple'),
 (0.7395263910293579, 'ipod'),
 (0.7249903082847595, 'shell'),
 (0.7109566926956177, '4th'),
 (0.693930983543396, 'tribeca'),
 (0.6787998676300049, '3rd'),
 (0.6646546721458435, 'generation'),
 (0.6535366773605347, 'phone'),
 (0.646971583366394, 'nano'),
 (0.6404775381088257, 'ipad')]

In [134]:
import json

products = [
    "iphone",
    "samsung TV",
    "LCD TV",
    "xbox"
]

for product in products:
    res = model.get_nearest_neighbors(product)
    
    print("Product:", product)
    for item in res:
        print(item[1], item[0])
        
    print("\n")
    
    
    

Product: iphone
apple 0.7442602515220642
ipod 0.7395263910293579
shell 0.7249903082847595
4th 0.7109566926956177
tribeca 0.693930983543396
3rd 0.6787998676300049
generation 0.6646546721458435
phone 0.6535366773605347
nano 0.646971583366394
ipad 0.6404775381088257


Product: samsung TV
samsung 0.9917891621589661
lg 0.6236541271209717
sony 0.5464914441108704
toshiba 0.5287562608718872
galaxy 0.5186451077461243
3d 0.5137705206871033
htc 0.5132042169570923
hdtv 0.5097765326499939
zagg 0.504116415977478
120hz 0.5033263564109802


Product: LCD TV
windows 0.3449544608592987
mac 0.30808645486831665
adobe 0.2856835722923279
deluxe 0.272208034992218
kitchenaid 0.265127032995224
cup 0.2624920606613159
platinum 0.2599470913410187
stand 0.25896310806274414
biscuit 0.25886476039886475
standard 0.2516326606273651


Product: xbox
360 0.9653497934341431
guide 0.8618823885917664
playstation 0.8456032872200012
bundle 0.8282456994056702
ps2 0.7984563708305359
controller 0.7926873564720154
gamecube 0.78585

In [135]:
model.save_model("/workspace/search_with_machine_learning_course/model_titles.bin")

In [46]:
# Trained the phone model:
# ~/fastText-0.9.2/fasttext skipgram -input /workspace/datasets/fasttext/titles.txt -output /workspace/datasets/fasttext/phone_model -epoch 25