In [None]:
from gensim import models
from nltk.corpus import stopwords
from database import Database
import re
import pickle
import numpy
from collections import OrderedDict
import string
import io
from gensim.models.wrappers import FastText
from nltk.stem import PorterStemmer
import numpy

In [None]:
class Helper:
    """Helper class for text pre-processing tasks."""

    # Matches any single character that is neither a word character nor whitespace.
    p = re.compile(r'[^\w\s]')

    @classmethod
    def sanitise_corpus(cls, corpus, stop_words=None):
        """Turn raw sentences into lists of unique, cleaned tokens.

        Each sentence is split on whitespace, stop words are dropped,
        punctuation is stripped from the remaining words, and duplicates
        are removed while keeping the order of first occurrence.

        Args:
            corpus: iterable of sentence strings.
            stop_words: optional iterable of words to drop. When omitted,
                NLTK's English stop-word list is used.

        Returns:
            A list with one list of tokens per input sentence.
        """
        if stop_words is None:
            stop_words = stopwords.words('english')
        # Build the lookup once: a set gives O(1) membership tests and avoids
        # re-reading the stop-word list for every single word.
        stop_words = frozenset(stop_words)

        sanitised_corpus = []
        for raw_sentence in corpus:
            cleaned_words = []
            for word in raw_sentence.split():
                if word in stop_words:
                    continue
                cleaned = cls.p.sub('', word)
                # Re-check after stripping so tokens such as "the," or "(and"
                # are recognised as stop words too; also skip empty leftovers.
                if cleaned and cleaned not in stop_words:
                    cleaned_words.append(cleaned)
            # De-duplicate deterministically (a plain set would shuffle the
            # tokens differently on every interpreter run).
            sanitised_corpus.append(list(OrderedDict.fromkeys(cleaned_words)))
        return sanitised_corpus

# Loading event data for model evaluation.
# Each stage gets its own name so that re-running a later line never depends on
# which intermediate value `event_data` happened to hold.
event_companies = Database.get_instance().list_companies_by_event('ijl_18')
event_summaries = [str(company['summary']).lower() for company in event_companies]
event_data = Helper.sanitise_corpus(event_summaries)


# Here, I am using the Word2Vec and FastText model wrappers provided by Gensim.
# Gensim ported the original C++ Word2vec and FastText libraries into Python.

# Initialising Word2Vec model
word2vec_model = models.Word2Vec(size=300, window=10, min_count=1, workers=4, hs=1, sg=1)
# Building model vocabulary using words from event data
word2vec_model.build_vocab(event_data)
# Training Word2Vec model and persisting it in memory
# NOTE(review): `.iter` looks like the older spelling of `.epochs` - confirm against the installed gensim version.
word2vec_model.train(event_data, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.iter)

# Initialising FastText model
fasttext_model = models.FastText(size=300, window=10, min_count=1, workers=4, hs=1, sg=1)
# Building model vocabulary using words from event data
fasttext_model.build_vocab(event_data)
# Training FastText model and persisting it in memory
fasttext_model.train(event_data, total_examples=fasttext_model.corpus_count, epochs=fasttext_model.iter)

# The word vectors are stored in a KeyedVectors instance in *_model.wv.
# This separates the read-only word vector lookup operations in KeyedVectors
# from the training code in Word2Vec and FastText.
# To save memory we grab the KeyedVectors instances and drop the references
# to the full training models.
word2vec_model_wv = word2vec_model.wv
del word2vec_model
fasttext_model_wv = fasttext_model.wv
del fasttext_model

In [20]:
# Getting the vector of a single word from the Word2Vec keyed vectors
word2vec_model_wv['diamond']  # returns the word's vector as a numpy array

array([ 2.48286307e-01, -5.64355180e-02,  4.46890742e-02, -4.88192327e-02,
       -1.98240116e-01,  3.49513665e-02, -2.98750512e-02,  4.62635532e-02,
       -1.49095133e-01, -5.30804619e-02,  7.88651481e-02,  1.57252297e-01,
       -2.35441312e-01,  1.71836503e-02,  2.62964442e-02, -1.01933852e-01,
        1.71097308e-01,  1.13690436e-01,  9.75958407e-02, -2.38527442e-04,
       -2.36815140e-02,  8.89937114e-03,  1.60161391e-01, -1.84518039e-01,
       -4.57111560e-02, -1.30511925e-01,  1.15983278e-01,  2.81220134e-02,
       -1.54188335e-01,  2.03590333e-01, -1.31132044e-02, -3.71631682e-02,
        1.10348044e-02,  2.56354511e-02,  1.30372182e-01, -6.34903163e-02,
       -2.36494374e-03, -1.51552960e-01,  1.04012601e-02, -2.24542208e-02,
        9.31991190e-02, -1.48600459e-01,  1.18603028e-01,  1.06553361e-01,
       -4.59983461e-02,  4.40131612e-02, -1.02719478e-01, -3.17772180e-02,
        1.07664071e-01,  9.26099569e-02,  1.06270544e-01, -1.25118792e-01,
       -9.16944966e-02,  

In [21]:
# Getting the words most similar to 'diamond' using word2vec
word2vec_model_wv.most_similar(['diamond'])  # this could also be used to generate relevant tags

[('finest', 0.9987034797668457),
 ('since', 0.9986259341239929),
 ('old', 0.9983115196228027),
 ('manufacturer', 0.9981980919837952),
 ('etc', 0.9978218078613281),
 ('vintage', 0.9977965354919434),
 ('gemstone', 0.9977301359176636),
 ('visit', 0.9976707100868225),
 ('specialize', 0.9976336359977722),
 ('sourced', 0.997613787651062)]

In [23]:
# Getting the similarity score between two words
word2vec_model_wv.similarity('ring', 'necklace')

0.9968213082781366

In [24]:
"""
However, word2vec model is not best for unseen words
for example if we try to find the similary score for the word that was not in training set,
it will break.
"""
word2vec_model_wv.most_similar('microsoft')

KeyError: "word 'microsoft' not in vocabulary"

In [26]:
"""
fasttext comes handy in this case
fasttext is provides an optimised implementatioln of skip-gram and cbow algorithms.
for the words not present in vocabulary it trys to break it down to revelant n-grams.
"""
fasttext_model_wv.most_similar('microsoft') 

[('studded', 0.9993187785148621),
 ('maintained', 0.9993143081665039),
 ('german', 0.9993093609809875),
 ('pursuit', 0.9993017911911011),
 ('japanese', 0.999301552772522),
 ('male', 0.9992967247962952),
 ('folds', 0.9992942214012146),
 ('oneoff', 0.9992929697036743),
 ('cuban', 0.9992919564247131),
 ('ullmann', 0.9992835521697998)]

In [33]:
"""
we know that word 'microsoft' is not within the training set, therefore the most similar words returned 
are at the far end of the vector space (with no context).
On the experimentation side, fasttext could also be used to find sectence similarity (like Ollie)
"""
event_data = Database.get_instance().list_companies_by_event('ijl_18')
event_data = [str(c['summary']).lower() for c in event_data]
tag_similarity = []
for i in range(len(event_data[:5])):
    new_match = {}
    new_match['summary'] = corpus_raw[i]
    new_match['match'] = fasttext_model_wv.n_similarity('traditional handmade diamonds'.lower().split(),
                                               corpus_raw[i].lower().split())
    tag_similarity.append(new_match)
    tag_similarity = sorted(tag_similarity, key=lambda k: k['match'])
    tag_similarity.reverse()
print(tag_similarity[:2])

[{'summary': 'amber hall jewellery are an amber wholesalers who are truly passionate abour bringing you contemporary silver and gold, and baltic amber set jewellery. we also stock semi precious stone set and plain silver pieces in both traditional and more contemporary designs.', 'match': 0.9992067313422733}, {'summary': 'manufacturers of natural fancy color diamond and bridal jewelry. our natural fancy color diamond collection boasts an incredible array of blue, pink, yellow, green, and multicolor diamond pieces, documented with g.i.a. certificates. we also carry an a line of bridal jewelry for any sizes, shapes of colored and white diamonds.', 'match': 0.9989219551041687}]


In [34]:
"""
According to Tomas Mikolov Word2Vec embedding has many advantages 
compared to earlier algorithms such as latent semantic analysis.
Sources for reading:
 - https://fasttext.cc/docs/en/support.html
 - https://www.tensorflow.org/tutorials/word2vec
 - http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
"""

In [39]:
# On the experimental side, I downloaded the pretrained vectors by Google to explore the word2vec wrapper in gensim further.
google_vectors = 'GoogleNews-vectors-negative300.bin'

# Load the vectors from the binary word2vec-format file named above.
pretrained_model = models.KeyedVectors.load_word2vec_format(google_vectors, binary=True)
print(pretrained_model)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x112143780>


In [40]:
# `load_word2vec_format` already returned a KeyedVectors object, so there is no
# separate training model to discard: `.wv` on it only handed back the very same
# object. Simply rebind the vectors to the shorter name used by the cells below.
model = pretrained_model
del pretrained_model
model

  


<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x112143780>

In [41]:
# A preview of the model vocabulary: show only the first 20 entries instead of
# rendering the entire vocabulary dictionary in the notebook output.
{word: model.vocab[word] for word in model.index2word[:20]}

{'</s>': <gensim.models.keyedvectors.Vocab at 0x112143cc0>,
 'in': <gensim.models.keyedvectors.Vocab at 0x1121435c0>,
 'for': <gensim.models.keyedvectors.Vocab at 0x112143828>,
 'that': <gensim.models.keyedvectors.Vocab at 0x1121435f8>,
 'is': <gensim.models.keyedvectors.Vocab at 0x112143a58>,
 'on': <gensim.models.keyedvectors.Vocab at 0x112143160>,
 '##': <gensim.models.keyedvectors.Vocab at 0x1121432e8>,
 'The': <gensim.models.keyedvectors.Vocab at 0x1121434a8>,
 'with': <gensim.models.keyedvectors.Vocab at 0x1121433c8>,
 'said': <gensim.models.keyedvectors.Vocab at 0x112143f60>,
 'was': <gensim.models.keyedvectors.Vocab at 0x112143ef0>,
 'the': <gensim.models.keyedvectors.Vocab at 0x112143c50>,
 'at': <gensim.models.keyedvectors.Vocab at 0x112143c88>,
 'not': <gensim.models.keyedvectors.Vocab at 0x1121431d0>,
 'as': <gensim.models.keyedvectors.Vocab at 0x112147828>,
 'it': <gensim.models.keyedvectors.Vocab at 0x112147b70>,
 'be': <gensim.models.keyedvectors.Vocab at 0x1121474a8>,
 

In [43]:
# The same unseen-word query as before, this time answered by the pretrained Google vectors.
print(model.most_similar('microsoft'))

[('adobe_photoshop', 0.8042364716529846), ('microsoft_office', 0.7978680729866028), ('windows_xp', 0.7926486134529114), ('buy_microsoft', 0.7902629375457764), ('cs4', 0.7494896650314331), ('autocad', 0.7432770729064941), ('photoshop', 0.7404437065124512), ('windows_vista', 0.7382057309150696), ('quickbooks', 0.7320874929428101), ('adobe_photoshop_cs4', 0.7269179821014404)]


In [44]:
"""
Transform learning mechanism could be applied to extend the knowledge base of custom word2vec model
potential use cases using Google's pretrained model are:
 - auto correct spellings
 - predict next word in the scentence
The use case above can be used in pre-processing stage to transform the user input 
closer to the context of trained model.
"""

In [71]:
# Predicting an output word using the gensim Word2Vec wrapper
training_questions = [
    question.split()
    for question in (
        "i want_to buy diamond jewellery",
        "handcrafted jewellery is good",
        "expensive diamonds are often handcrafted",
        "some exhibitors sell hand_made rings",
        "people are minions and they love diamond",
    )
]

# A small Word2Vec model configured with sg=1 and hs=1
model = models.Word2Vec(sg=1, hs=1, size=50, window=10, min_count=1, workers=4)
# The vocabulary is built from the tokenised toy questions above
model.build_vocab(training_questions)
# Train on the same questions; the trained model stays in memory
model.train(
    training_questions,
    epochs=model.iter,
    total_examples=model.corpus_count,
)

# Report the probability distribution of the center word given the context words as input to the trained model.
model.predict_output_word('i want to sell'.split(), topn=1)

  from ipykernel import kernelapp as app


[('jewellery', 0.04545477)]

In [72]:
# Auto correction using Google's pretrained model
model = models.KeyedVectors.load_word2vec_format(google_vectors, binary=True)

# Map every vocabulary word to its position in the model's word list;
# that position serves as the word's rank in the spelling corrector.
words = model.index2word
w_rank = {word: i for i, word in enumerate(words)}

WORDS = w_rank

In [73]:
import re
from collections import Counter

def words(text):
    """Return every run of word characters found in `text`, lower-cased."""
    lowered = text.lower()
    return re.compile(r'\w+').findall(lowered)

def P(word):
    """Score `word` so that a higher value means a more probable word.

    The negated rank stored in WORDS is used as a proxy for probability:
    rank 0 scores 0, and larger ranks score progressively lower.

    A word missing from WORDS scores `-len(WORDS)`, i.e. strictly below every
    known word. (Defaulting to 0 would tie an unknown word with the
    top-ranked entry.)
    """
    return -WORDS.get(word, len(WORDS))

def correction(word):
    """Return the best-scoring spelling candidate for `word`."""
    options = candidates(word)
    best = max(options, key=P)
    return best

def candidates(word):
    """Return the possible spelling corrections for `word`.

    Preference order: the word itself if known, then known words one edit
    away, then known words two edits away, and finally the word unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    # Nothing recognised: hand the input back as the only candidate.
    return [word]

def known(words):
    """Return the set of entries in `words` that are keys of WORDS."""
    recognised = set()
    for candidate in words:
        if candidate in WORDS:
            recognised.add(candidate)
    return recognised

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement by a lowercase ASCII letter, or an insertion
    of a lowercase ASCII letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    variants = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertions are possible at every cut point, including the end.
        for ch in alphabet:
            variants.add(head + ch + tail)

        if not tail:
            continue

        # Deletion and replacement both consume the first character of the tail.
        rest = tail[1:]
        variants.add(head + rest)
        for ch in alphabet:
            variants.add(head + ch + rest)

        # Transposition needs at least two characters after the cut.
        if len(tail) > 1:
            variants.add(head + tail[1] + tail[0] + tail[2:])
    return variants

def edits2(word):
    """Lazily yield the strings reachable from `word` by two successive edits."""
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

# Quick demonstration of the corrector on two misspelled words.
print(correction('dimond'))
print(correction('androd'))

diamond
android


In [None]:
"""
Conclusion:
Word2Vec is great for word embeddings and it also comes handy before taking the words into embedding stage.
By using public vector sets like GoogleKeyedVectors, we can improve the preprocessing stage. The input
sentence can be parsed as according top the context of training set. For example, if the training set is all about
Jewellery expo and the user passed request something like -> 'i am looking to auto cad software' we can try to
parse this at preprocessing stage which would mean that the user might be looking for auto cad software to 
design Jewellery. On the top of that the pretrained vectors can be used to build micro models which can be exposed as
APIs for spelling check, auto corrections, suggesting next word, etc.
"""