# Derive Synonyms from Content

In [1]:
from fastcore.foundation import L
from fastcore.test import *

In [2]:
import re
import string
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language='english')
_re_spec = re.compile(r'([/#\\-\\.:])')

def spec_add_spaces(t):
    "Add spaces around . : - / \ and #"
    return _re_spec.sub(r' \1 ', t)

# Causes the resulting RE to match from m to n repetitions of the preceding RE, attempting to match as many repetitions as possible.
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return _re_space.sub(' ', t)

def rm_punct(t):
    for p in string.punctuation:
        t = t.replace(p, ' ')
    return t

def transform_training_data(product_name):
    "Transform product name by replacing punctuations, removing multiple spaces, stemming, lower casing"
    name = product_name

    # replace_punct
    name = spec_add_spaces(name)
    
    # remove punct
    name = rm_punct(name)
    
    # replace multiple spaces
    name = rm_useless_spaces(name)
    
    # fix registered, trademark, copyright symbol
    # remove non-ascii characters from name
    name = name.encode(encoding='ascii', errors='ignore').decode()
    
    name = ' '.join([o for o in name.split(' ') if not o.isnumeric()])
    
    # add stemmer
    name = stemmer.stem(name)
    return name.lower()

In [53]:
name='NuForce - Icon uDAC-2 USB Audio Receiver and Digital-to-Analog Converter - Silver'

In [54]:
transform_training_data('NuForce - Icon uDAC-2 USB Audio Receiver and Digital-to-Analog Converter - Silver')

'nuforce icon udac usb audio receiver and digital to analog converter silv'

In [56]:
test_str = 'iphone®™'

In [57]:
transform_training_data(test_str)

'iphon'

The utility extractTitles.py in the week3 directory goes through all of the products in the pruned XML files (i.e., everything except music and movies) and outputs their titles as plain text.

In [1]:
!python extractTitles.py

Writing results to /workspace/datasets/fasttext/titles.txt


 By default, it will save the extracted output to /workspace/datasets/fasttext/titles.txt. Also, by default, it will only sample 10% of the product titles, but you can change that with the -sample_rate parameter. 

In [55]:
!tail /workspace/datasets/fasttext/titles.txt

maytag cu ft cycle gas dryer whit
whirlpool self cleaning drop in electric oven biscuit
maytag cu ft cycle supersize capacity plus washer whit
maytag cu ft cycle gas steam dryer whit
whirlpool self cleaning drop in electric range whit
whirlpool self cleaning drop in electric oven black
maytag cu ft cycle gas dryer whit
eric clapton clapton acoustic dvd
greg koch guitar gristle dvd
pro tools le beginner level dvd


In [3]:
!/home/gitpod/fastText-0.9.2/fasttext skipgram -input /workspace/datasets/fasttext/titles.txt -output /workspace/datasets/fasttext/title_model

Read 0M words
Number of words:  2535
Number of labels: 0
Progress: 100.0% words/sec/thread:    3663 lr:  0.000000 avg.loss:  2.528054 ETA:   0h 0m 0s 0.008222 avg.loss:  2.611043 ETA:   0h 0m 2s 2.586558 ETA:   0h 0m 1s


In [4]:
!/home/gitpod/fastText-0.9.2/fasttext nn /workspace/datasets/fasttext/title_model.bin

Query word? ^C


In [5]:
import fasttext as ft

In [6]:
model=ft.train_unsupervised('/workspace/datasets/fasttext/titles.txt')

Read 0M words
Number of words:  2535
Number of labels: 0
Progress: 100.0% words/sec/thread:   55515 lr:  0.000000 avg.loss:  2.608343 ETA:   0h 0m 0s


In [10]:
L(model.words)

(#2535) ['-','</s>','for','Black','with','and','Digital','Case','Memory','/'...]

In [12]:
model.get_nearest_neighbors('iphone')

[(0.9725721478462219, 'Phones'),
 (0.9674215316772461, 'Phone'),
 (0.9665066003799438, 'Motorola'),
 (0.9517971873283386, 'Microphone'),
 (0.9450824856758118, 'iPhone'),
 (0.9434287548065186, 'No-Contract'),
 (0.9432931542396545, 'Headphones'),
 (0.9296050071716309, 'Microphones'),
 (0.9289809465408325, 'Smartphones'),
 (0.9250837564468384, 'Nokia')]

Establish a set of 20 tokens that you’ll use for evaluation. In order to get a good overall sense, you’ll want some variety: 
e.g., 5 product types (e.g., headphones), 5 brands (e.g., sony), 5 models (e.g., thinkpad) and 5 attributes (e.g., black)

types: headphones, televisions, refrigerator, washing machines, smartwatch

brands: sony, samsung, bosch, apple, skullcandy

models: iphone, ipad, thinkpad, iwatch, macbook

attributes: black, red, waterproof, wireless, bluetooth

## Extract Titles

In [27]:
import os
import random
import xml.etree.ElementTree as ET
import argparse
from pathlib import Path

directory = r'/workspace/search_with_machine_learning_course/data/pruned_products'
output_file = "/workspace/datasets/fasttext/titles.txt"

# def transform_training_data(name):
#     # IMPLEMENT
#     return name.replace('\n', ' ')

# Directory for product data
filenames = [filename for filename in os.listdir(directory) if filename.endswith(".xml")]

def process_file(filename, sample_rate=0.1):
    items = []
    print("Processing %s" % filename)
    f = os.path.join(directory, filename)
    tree = ET.parse(f)
    root = tree.getroot()
    for child in root:
        if random.random() > sample_rate:
            continue
        if (child.find('name') is not None and child.find('name').text is not None):
            name = transform_training_data(child.find('name').text)
            items.append(name)
            #output.write(name + "\n")
    return items

# print("Writing results to %s" % output_file)
# with open(output_file, 'w') as output:
#     for filename in os.listdir(directory):
#         if filename.endswith(".xml"):
#             f = os.path.join(directory, filename)
#             tree = ET.parse(f)
#             root = tree.getroot()
#             for child in root:
#                 if random.random() > sample_rate:
#                     continue
#                 if (child.find('name') is not None and child.find('name').text is not None):
#                     name = transform_training_data(child.find('name').text)
#                     output.write(name + "\n")

In [28]:
all_items = []

for each in filenames:
    all_items.extend(process_file(each))

Processing pruned_products_1.xml
Processing pruned_products_2.xml
Processing pruned_products_3.xml
Processing pruned_products_4.xml
Processing pruned_products_5.xml
Processing pruned_products_6.xml


In [29]:
len(all_items)

11358

In [30]:
L(all_items)

(#11358) ['pioneer 50w x mosfet apple® ipod® hd radio ready in dash cd deck','pantech pocket 4g mobile phone black at t ','orphen scion of sorcery playstation ps2 ','sauder audio video storage cabinet sky ald','wipeout ps vita','jura ena one touch cappuccino and latte macchiato maker silv','apple itunes gift card','toshiba satellite laptop intel® core™ i3 processor display matrix graphit','airline tycoon window','best buy gc have a ball gift card'...]

In [31]:
import nltk
nltk.download('punkt')

tokens = L(all_items).map(nltk.word_tokenize)

[nltk_data] Downloading package punkt to /home/gitpod/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [32]:
all_tokens = []
for each in tokens: all_tokens.extend(each)

In [33]:
from collections import Counter
tok_count = Counter(all_tokens)

In [34]:
tok_count.most_common(20)

[('black', 2037),
 ('for', 1669),
 ('with', 1106),
 ('and', 995),
 ('digital', 798),
 ('case', 683),
 ('camera', 669),
 ('memory', 662),
 ('window', 617),
 ('whit', 603),
 ('card', 575),
 ('cu', 573),
 ('ft', 573),
 ('the', 563),
 ('apple®', 532),
 ('in', 514),
 ('electric', 454),
 ('nintendo', 439),
 ('laptop', 426),
 ('sony', 415)]

In [35]:
# 10 rare words
tok_count.most_common()[-10:-1]

[('satellites', 1),
 ('sparkcash', 1),
 ('ovation', 1),
 ('celebrity', 1),
 ('h550', 1),
 ('cannon', 1),
 ('position', 1),
 ('capcell', 1),
 ('positive', 1)]

In [36]:
# 20 rare words
tok_count.most_common()[-20:-1]

[('humminbird', 1),
 ('343c', 1),
 ('painter', 1),
 ('northstar', 1),
 ('blueant', 1),
 ('supertooth', 1),
 ('spectrobes', 1),
 ('portals', 1),
 ('allman', 1),
 ('recovering', 1),
 ('satellites', 1),
 ('sparkcash', 1),
 ('ovation', 1),
 ('celebrity', 1),
 ('h550', 1),
 ('cannon', 1),
 ('position', 1),
 ('capcell', 1),
 ('positive', 1)]

In [38]:
# 50 rare words
print(tok_count.most_common()[-50:-1])

[('clash', 1), ('chili', 1), ('uno', 1), ('radiant', 1), ('reckoning', 1), ('girlz', 1), ('really', 1), ('o2', 1), ('greas', 1), ('s9hd', 1), ('beaten', 1), ('nickelback', 1), ('reasons', 1), ('some', 1), ('mothership', 1), ('lips', 1), ('1950s', 1), ('pay', 1), ('scorpio', 1), ('eide', 1), ('designjet', 1), ('cradle', 1), ('linx', 1), ('trailblazer', 1), ('binoculars', 1), ('enclosures', 1), ('f150', 1), ('supercab', 1), ('hatchbacks', 1), ('sale', 1), ('humminbird', 1), ('343c', 1), ('painter', 1), ('northstar', 1), ('blueant', 1), ('supertooth', 1), ('spectrobes', 1), ('portals', 1), ('allman', 1), ('recovering', 1), ('satellites', 1), ('sparkcash', 1), ('ovation', 1), ('celebrity', 1), ('h550', 1), ('cannon', 1), ('position', 1), ('capcell', 1), ('positive', 1)]


## Sample Rate 0.1

In [39]:
!python extractTitles.py

Writing results to /workspace/datasets/fasttext/titles.txt


In [40]:
import fasttext as ft
model=ft.train_unsupervised('/workspace/datasets/fasttext/titles.txt')

Read 0M words
Number of words:  2348
Number of labels: 0
Progress: 100.0% words/sec/thread:   30747 lr:  0.000000 avg.loss:  2.642882 ETA:   0h 0m 0s100.0% words/sec/thread:   30752 lr: -0.000011 avg.loss:  2.642882 ETA:   0h 0m 0s


In [41]:
print(model.words[:50])

['</s>', 'black', 'for', 'with', 'and', 'digital', 'camera', 'case', 'memory', 'window', 'whit', 'card', 'cu', 'ft', 'the', 'in', 'apple®', 'nintendo', 'electric', 'laptop', 'sony', 'wireless', 'series', 'steel', 'stainless', 'playstation', 'guitar', 'hard', 'ge', 'refrigerator', 'system', 'pack', 'side', 'mobile', 's', 'samsung', 'extra', 'range', 'kit', 'on', 'game', 'battery', 'usb', 'bag', 'select', 'd', 'built', 'silv', 'xbox', 'r']


In [42]:
model.get_nearest_neighbors('iphone')

[(0.9884212613105774, 'phones'),
 (0.987588107585907, 'iphone®'),
 (0.9804374575614929, 'gophone'),
 (0.9657481908798218, 'phone'),
 (0.9601891040802002, 'smartphones'),
 (0.9589815139770508, 'mobile'),
 (0.9587773084640503, 'headphone'),
 (0.9553311467170715, 'phon'),
 (0.947689950466156, 'microphone'),
 (0.9473929405212402, 'microphones')]

In [7]:
def evaluate(model, k=4):
    types='headphones,televisions,refrigerator,washing machines,smartwatch'
    brands='sony,samsung,bosch,apple,skullcandy'
    models='iphone,ipad,thinkpad,iwatch,macbook'
    attributes='black,red,waterproof,wireless,bluetooth'

    evaluation=[]
    for o in [types, brands, models, attributes]: evaluation.extend(o.split(','))
    
    return [(o, model.get_nearest_neighbors(o, k=k)) for o in evaluation]

In [43]:
types='headphones,televisions,refrigerator,washing machines,smartwatch'
brands='sony,samsung,bosch,apple,skullcandy'
models='iphone,ipad,thinkpad,iwatch,macbook'
attributes='black,red,waterproof,wireless,bluetooth'
evaluation=[]
for o in [types, brands, models, attributes]: evaluation.extend(o.split(','))

In [44]:
[(o, model.get_nearest_neighbors(o, k=4)) for o in evaluation]

[('headphones',
  [(0.9919744729995728, 'headphone'),
   (0.9893032908439636, 'microphones'),
   (0.987011194229126, 'headphon'),
   (0.9685732126235962, 'microphone')]),
 ('televisions',
  [(0.9955301284790039, 'dolphins'),
   (0.99502032995224, 'anaheim'),
   (0.9947879314422607, 'ibm'),
   (0.9944508075714111, 'battlefield')]),
 ('refrigerator',
  [(0.9975049495697021, 'by'),
   (0.9957539439201355, 'thru'),
   (0.9954434037208557, 'refriger'),
   (0.993172287940979, 'side')]),
 ('washing machines',
  [(0.9951592087745667, 'maple'),
   (0.9936034679412842, 'vertical'),
   (0.9935685992240906, 'washington'),
   (0.9935431480407715, 'marshmallow')]),
 ('smartwatch',
  [(0.9890354871749878, 'fat'),
   (0.98868328332901, 'bravo'),
   (0.9883300065994263, 'house'),
   (0.9879694581031799, 'patch')]),
 ('sony',
  [(0.9773331880569458, 'dcr'),
   (0.9768596291542053, 'everio'),
   (0.9755401611328125, 'pink'),
   (0.9712700247764587, 'kodak')]),
 ('samsung',
  [(0.9749250411987305, 'sharp'

## Sample Rate 1.0

In [58]:
!python extractTitles.py --sample_rate 1.0

Writing results to /workspace/datasets/fasttext/titles.txt


In [59]:
!wc -l /workspace/datasets/fasttext/titles.txt

115358 /workspace/datasets/fasttext/titles.txt


In [5]:
import fasttext as ft
model=ft.train_unsupervised('/workspace/datasets/fasttext/titles.txt', lr=0.09, epoch=15)

Read 1M words
Number of words:  8651
Number of labels: 0
Progress: 100.0% words/sec/thread:   51909 lr:  0.000000 avg.loss:  1.027117 ETA:   0h 0m 0s 62.9% words/sec/thread:   52041 lr:  0.033368 avg.loss:  1.101303 ETA:   0h 0m 7s 84.8% words/sec/thread:   51983 lr:  0.013650 avg.loss:  1.050967 ETA:   0h 0m 2s


In [65]:
model.lr, model.epoch, model.minCount, model.wordNgrams

(0.05, 5, 5, 1)

In [49]:
ft.train_unsupervised??

[0;31mSignature:[0m [0mft[0m[0;34m.[0m[0mtrain_unsupervised[0m[0;34m([0m[0;34m*[0m[0mkargs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mtrain_unsupervised[0m[0;34m([0m[0;34m*[0m[0mkargs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""[0m
[0;34m    Train an unsupervised model and return a model object.[0m
[0;34m[0m
[0;34m    input must be a filepath. The input text does not need to be tokenized[0m
[0;34m    as per the tokenize function, but it must be preprocessed and encoded[0m
[0;34m    as UTF-8. You might want to consult standard preprocessing scripts such[0m
[0;34m    as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html[0m
[0;34m[0m
[0;34m    The input field must not contain any labels or use the specified label prefix[0m
[0;34m    unless it is ok for those words to be ignored. For an example

In [8]:
evaluate(model, k=10)

[('headphones',
  [(0.8894093632698059, 'earbud'),
   (0.847713828086853, 'headphon'),
   (0.835752546787262, 'headphone'),
   (0.7844434976577759, 'ear'),
   (0.679629921913147, 'neckband'),
   (0.675812840461731, 'piiq'),
   (0.670875608921051, 'adidas'),
   (0.6686697602272034, 'superbudz'),
   (0.6669088006019592, 'earpollution'),
   (0.6554350256919861, 'bud')]),
 ('televisions',
  [(0.9134686589241028, 'television'),
   (0.8775928616523743, 'televis'),
   (0.6211571097373962, 'purevision'),
   (0.6000802516937256, 'tvs'),
   (0.5959171056747437, 'sunbritetv'),
   (0.5912543535232544, 'visionmount'),
   (0.5735495090484619, 'panel'),
   (0.5721613764762878, 'sb'),
   (0.5587849617004395, 'highboy'),
   (0.5579285025596619, 'pole')]),
 ('refrigerator',
  [(0.8945291638374329, 'refrigerators'),
   (0.8129922151565552, 'side'),
   (0.7540073990821838, 'cu'),
   (0.7342646718025208, 'ft'),
   (0.7291808128356934, 'freezer'),
   (0.7241199612617493, 'satina'),
   (0.7237869501113892, '

In [67]:
model10=ft.train_unsupervised('/workspace/datasets/fasttext/titles.txt', minCount=10)

Read 1M words
Number of words:  5743
Number of labels: 0
Progress: 100.0% words/sec/thread:   52409 lr:  0.000000 avg.loss:  1.410350 ETA:   0h 0m 0s


In [71]:
evaluate(model10, k=5)

[('headphones',
  [(0.93272465467453, 'headphon'),
   (0.9270111322402954, 'headphone'),
   (0.8778688311576843, 'earbud'),
   (0.8217599391937256, 'ear'),
   (0.8085653185844421, 'earphones')]),
 ('televisions',
  [(0.8794519305229187, 'television'),
   (0.8161981701850891, 'visionmount'),
   (0.7831764817237854, 'whalen'),
   (0.7460930347442627, 'sunbritetv'),
   (0.7412470579147339, 'wega')]),
 ('refrigerator',
  [(0.9694816470146179, 'refrigerators'),
   (0.880591869354248, 'side'),
   (0.8579696416854858, 'monochromatic'),
   (0.8534564971923828, 'ice2o'),
   (0.84491366147995, 'panorama')]),
 ('washing machines',
  [(0.821678638458252, 'washington'),
   (0.7737298607826233, 'panthers'),
   (0.773579478263855, 'machines'),
   (0.7730887532234192, 'dolphins'),
   (0.7478132247924805, 'chinese')]),
 ('smartwatch',
  [(0.8082565665245056, 'watch'),
   (0.7544666528701782, 'smartpens'),
   (0.7412256598472595, 'smarthome'),
   (0.7376159429550171, 'smartpen'),
   (0.7237333655357361,

In [72]:
model20=ft.train_unsupervised('/workspace/datasets/fasttext/titles.txt', minCount=20)

Read 1M words
Number of words:  3749
Number of labels: 0
Progress: 100.0% words/sec/thread:   38912 lr:  0.000000 avg.loss:  1.406713 ETA:   0h 0m 0s


In [73]:
evaluate(model20, k=5)

[('headphones',
  [(0.9300025105476379, 'headphon'),
   (0.919276773929596, 'headphone'),
   (0.8799359202384949, 'earbud'),
   (0.8216578960418701, 'earphones'),
   (0.8151381611824036, 'ear')]),
 ('televisions',
  [(0.7140693664550781, 'nhl'),
   (0.7081529498100281, 'tekken'),
   (0.7047365307807922, 'chiefs'),
   (0.7033829092979431, 'tennis'),
   (0.7004234790802002, 'wwe')]),
 ('refrigerator',
  [(0.8941556215286255, 'side'),
   (0.8849811553955078, 'monochromatic'),
   (0.8507714867591858, 'satina'),
   (0.8274998664855957, 'customstyle'),
   (0.8258361220359802, 'counter')]),
 ('washing machines',
  [(0.8135136365890503, 'washington'),
   (0.7707059383392334, 'emachines'),
   (0.7659292221069336, 'texans'),
   (0.7599278092384338, 'toronto'),
   (0.754831075668335, 'jets')]),
 ('smartwatch',
  [(0.7960307002067566, 'watch'),
   (0.6941384077072144, 'smart'),
   (0.6786313652992249, 'smarthome'),
   (0.6567648649215698, 'armitron'),
   (0.6511904001235962, 'schedule')]),
 ('sony

In [74]:
model50=ft.train_unsupervised('/workspace/datasets/fasttext/titles.txt', minCount=50)
evaluate(model50, k=5)

Read 1M words
Number of words:  2062
Number of labels: 0
Progress: 100.0% words/sec/thread:   41776 lr:  0.000000 avg.loss:  1.400826 ETA:   0h 0m 0s100.0% words/sec/thread:   41777 lr: -0.000001 avg.loss:  1.400826 ETA:   0h 0m 0s


[('headphones',
  [(0.936922550201416, 'headphon'),
   (0.8321928381919861, 'earbud'),
   (0.8034393191337585, 'ear'),
   (0.6846233010292053, 'microphones'),
   (0.6836632490158081, 'bud')]),
 ('televisions',
  [(0.7210134267807007, 'activision'),
   (0.7100964784622192, 'seahawks'),
   (0.7100086808204651, 'orleans'),
   (0.7087095975875854, 'lions'),
   (0.7044817805290222, 'cleveland')]),
 ('refrigerator',
  [(0.8487700819969177, 'side'),
   (0.8279436826705933, 'refriger'),
   (0.8248531222343445, 'monochromatic'),
   (0.7899876236915588, 'satina'),
   (0.7783686518669128, 'counter')]),
 ('washing machines',
  [(0.7989990711212158, 'washington'),
   (0.7803131341934204, 'ohio'),
   (0.7518882155418396, 'iowa'),
   (0.7500224113464355, 'georgia'),
   (0.7487159371376038, 'emachines')]),
 ('smartwatch',
  [(0.7698683142662048, 'watch'),
   (0.7587125301361084, 'smart'),
   (0.6855758428573608, 'heart'),
   (0.642951488494873, 'patch'),
   (0.6298967599868774, 'gigabit')]),
 ('sony',