# Exercise Sentence Similarity with word embeddings


### Let's go

We will use the text of *Moby Dick* from the NLTK Gutenberg corpus
and a couple of test sentences.

We are building a sentence-similarity ranker: each sentence is represented by the
average of its word2vec word vectors, and sentences are compared by cosine similarity.

In [2]:
import gensim
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg
import numpy as np
from scipy import spatial
from operator import itemgetter
from nltk.corpus import stopwords
from pprint import pprint


In [3]:
def train_model(fileid):
    """
        Fit a gensim Word2Vec model on the sentences of one NLTK Gutenberg text,
        see also: https://radimrehurek.com/gensim/models/word2vec.html

        fileid: Gutenberg corpus file id, passed to `gutenberg.sents`.
        Returns the trained `gensim.models.Word2Vec` instance.

        NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
        appears to have renamed them (`vector_size` / `epochs`) -- confirm against
        the installed gensim version before upgrading.
    """
    corpus_sentences = gutenberg.sents(fileid)
    # 300-dimensional vectors, skip-gram (sg=1) with 5 negative samples,
    # 10-token context window, words seen fewer than 5 times are dropped.
    hyperparams = dict(min_count=5, size=300, workers=4,
                       window=10, sg=1, negative=5, iter=5)
    return gensim.models.Word2Vec(corpus_sentences, **hyperparams)


In [4]:
def avg_feature_vector(sentence, model, num_features, index2word_set):
    """
        Average the embedding vectors of the in-vocabulary words of a sentence.

        sentence:       iterable of tokens (strings).
        model:          object mapping a word to its vector via `obj[word]`; if it
                        exposes a `wv` attribute, that attribute is used for lookups.
        num_features:   dimensionality of the returned vector.
        index2word_set: collection of known words; tokens not in it are skipped.

        Returns a 1-D numpy array of length `num_features`. When no token of the
        sentence is in `index2word_set` the result is the all-zero vector.
    """
    # Prefer the `wv` attribute when present so the lookup does not go through
    # indexing the model object itself; fall back to `model` so plain mappings
    # and bare vector containers keep working.
    vectors = getattr(model, 'wv', model)

    # Size the accumulator from the argument instead of a hard-coded 300.
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in sentence:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, vectors[word])
    # Guard against division by zero for sentences without any known word.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


In [5]:
def compute_similarity(sent1, sent2, model, index2word_set):
    """
        Cosine similarity between the averaged word vectors of two sentences.

        sent1, sent2:   token lists to compare.
        model:          embedding model handed through to `avg_feature_vector`.
        index2word_set: collection of words known to the model.

        Returns `1 - cosine distance` of the two averaged vectors, or 0.0 when
        either sentence has no in-vocabulary word (its average is then the zero
        vector, for which the cosine is undefined).
    """
    # Take the dimensionality from the model when it advertises one; 300 is the
    # value this function always passed before.
    num_features = getattr(model, 'vector_size', 300)
    s1_afv = avg_feature_vector(sent1, model=model, num_features=num_features,
                                index2word_set=index2word_set)
    s2_afv = avg_feature_vector(sent2, model=model, num_features=num_features,
                                index2word_set=index2word_set)

    # A zero vector has no direction; returning a plain 0.0 keeps NaN out of the
    # scores so callers can sort them reliably.
    if not np.any(s1_afv) or not np.any(s2_afv):
        return 0.0

    sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
    return sim



In [6]:
def get_sim_sents(target, sents, model, index2word_set):
    """
        Rank every sentence in `sents` by its similarity to `target`.

        Prints the target sentence, then returns a list of
        `(sentence_index, similarity)` tuples ordered from most to least similar.
    """
    print("Target sentence", target)

    # Score each candidate against the target, remembering its position in `sents`.
    scored = [
        (position, compute_similarity(target, candidate, model, index2word_set))
        for position, candidate in enumerate(sents)
    ]

    # Highest similarity first.
    scored.sort(key=itemgetter(1), reverse=True)
    return scored



In [7]:
fileid = 'melville-moby_dick.txt'
raw_sents = gutenberg.sents(fileid)

# Fetch the stop-word list once and keep it as a set: calling
# stopwords.words('english') for every token repeats the lookup and scans a
# list on each membership test.
english_stopwords = set(stopwords.words('english'))

# Tokenised sentences with English stop words removed (case-insensitive match).
sents = [
    [word for word in s if word.lower() not in english_stopwords]
    for s in raw_sents
]
print(len(sents))

        

10059


In [9]:
# Train the embeddings on the selected Gutenberg text.
model = train_model(fileid)

# Quick sanity check: nearest neighbours of a frequent character name.
ahab_neighbours = model.wv.most_similar(positive=['Ahab'])
print(ahab_neighbours)

# Vocabulary as a set, for fast membership tests when averaging word vectors.
index2word_set = set(model.wv.index2word)


[('Starbuck', 0.8346410989761353), ('captain', 0.8299128413200378), ('Stubb', 0.829764723777771), ('mate', 0.8275545835494995), ('Peleg', 0.8258814811706543), ('Flask', 0.8169136643409729), ('chief', 0.8151313066482544), ('Captain', 0.8103663921356201), ('voice', 0.8071380853652954), ('Guernsey', 0.7917664051055908)]


In [14]:
res = get_sim_sents(['Ahab', 'boat'], sents, model, index2word_set)

# Slice once: unlike indexing with range(10), this cannot raise IndexError
# when fewer than ten sentences were ranked.
top_hits = res[:10]
print(top_hits)
print()

# Show each of the best-matching sentences next to its (index, similarity) pair.
for sent_idx, score in top_hits:
    print(sents[sent_idx], (sent_idx, score))


Target sentence ['Ahab', 'boat']


  import sys


[(2034, 0.9045610427856445), (5602, 0.9045560359954834), (9348, 0.8859259486198425), (8557, 0.8800023198127747), (7543, 0.8781915903091431), (7633, 0.8777551651000977), (8588, 0.8763377070426941), (3636, 0.8760122060775757), (9446, 0.8752614855766296), (8140, 0.8733166456222534)]

['Meanwhile', 'Captain', 'Ahab', 'remained', 'invisibly', 'enshrined', 'within', 'cabin', '.'] (2034, 0.9045610427856445)
['demanded', 'Ahab', ',', 'boat', 'drifted', 'back', '.'] (5602, 0.9045560359954834)
['Ahab', 'turned', '.'] (9348, 0.8859259486198425)
['cried', 'Starbuck', 'crew', ',', 'suddenly', 'admonished', 'vigilance', 'vivid', 'lightning', 'darting', 'flambeaux', ',', 'light', 'Ahab', 'post', '.'] (8557, 0.8800023198127747)
['cried', 'Ahab', ',', 'suddenly', 'letting', 'suspended', 'breath', '.'] (7543, 0.8781915903091431)
['back', 'stranger', 'ship', ',', 'face', 'set', 'like', 'flint', ',', 'Ahab', 'stood', 'upright', 'till', 'alongside', 'Pequod', '.'] (7633, 0.8777551651000977)
['moment', 'Sta

In [15]:
res = get_sim_sents(['sperm', 'whale'], sents, model, index2word_set)

# Slice once: unlike indexing with range(10), this cannot raise IndexError
# when fewer than ten sentences were ranked.
top_hits = res[:10]
print(top_hits)
print()

# Show each of the best-matching sentences next to its (index, similarity) pair.
for sent_idx, score in top_hits:
    print(sents[sent_idx], (sent_idx, score))


Target sentence ['sperm', 'whale']


  import sys


[(9530, 0.9259545803070068), (2640, 0.9251436591148376), (2711, 0.9109616279602051), (2637, 0.8996851444244385), (2667, 0.8916807174682617), (2735, 0.8815829157829285), (6825, 0.8801481127738953), (5888, 0.8799620270729065), (2552, 0.8761884570121765), (2164, 0.8756001591682434)]

['*', 'motion', 'peculiar', 'sperm', 'whale', '.'] (9530, 0.9259545803070068)
['right', 'whale', 'elsewhere', 'treated', 'length', ',', 'reference', 'elucidating', 'sperm', 'whale', '.'] (2640, 0.9251436591148376)
['fishermen', 'approach', 'regarded', 'premonitory', 'advance', 'great', 'sperm', 'whale', '.'] (2711, 0.9109616279602051)
['pretend', 'see', 'difference', 'Greenland', 'whale', 'English', 'right', 'whale', 'Americans', '.'] (2637, 0.8996851444244385)
['Yet', 'seen', 'baleen', 'impossible', 'correctly', 'classify', 'Greenland', 'whale', '.'] (2667, 0.8916807174682617)
['Narwhale', 'heard', 'called', 'Tusked', 'whale', ',', 'Horned', 'whale', ',', 'Unicorn', 'whale', '.'] (2735, 0.8815829157829285)
[