### Document Similarity

In [None]:
!pip install -U -q gensim 

In [None]:
import codecs
import os
import re
import string
import tempfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import casual_tokenize

from sklearn import feature_extraction

import gensim

from IPython.display import display, Image
from IPython.core.interactiveshell import InteractiveShell

%matplotlib inline

Let's create some documents.

In [None]:
# Toy corpus: short banking/finance phrases, one phrase per document.
raw_documents = [
    "to open a bank account.",
    "to pay in a cheque",
    "to cash a cheque.",
    "to transfer money.",
    "to go into liquidation.",
    "(for a company) to go into administration",
    "to be in debt",
    "to owe money (to someone)",
    "to take out a loan",
    "to insure something against fire/theft/accidental damage",
    "to pay into a savings account/pension",
    "to borrow money from someone",
    "to pay by installments",
    "to lend money to someone",
    "to invest in something/someone",
    "to get a return on an investment",
    "to change some money",
    "to borrow money – to take money from someone that you will pay back later",
    "to lend money – to give someone money that they will pay back later",
]

# Report the corpus size as a quick sanity check.
doc_count = len(raw_documents)
print(f"Number of documents: {doc_count}")

Number of documents: 19


We will use NLTK to tokenize.  
A document will now be a list of tokens.

In [None]:
# Tokenize each phrase with NLTK's casual tokenizer, then lowercase every
# token; each document becomes a list of lowercase tokens.
gen_docs = [
    list(map(str.lower, casual_tokenize(text)))
    for text in raw_documents
]

print(gen_docs)

[['to', 'open', 'a', 'bank', 'account', '.'], ['to', 'pay', 'in', 'a', 'cheque'], ['to', 'cash', 'a', 'cheque', '.'], ['to', 'transfer', 'money', '.'], ['to', 'go', 'into', 'liquidation', '.'], ['(', 'for', 'a', 'company', ')', 'to', 'go', 'into', 'administration'], ['to', 'be', 'in', 'debt'], ['to', 'owe', 'money', '(', 'to', 'someone', ')'], ['to', 'take', 'out', 'a', 'loan'], ['to', 'insure', 'something', 'against', 'fire', '/', 'theft', '/', 'accidental', 'damage'], ['to', 'pay', 'into', 'a', 'savings', 'account', '/', 'pension'], ['to', 'borrow', 'money', 'from', 'someone'], ['to', 'pay', 'by', 'installments'], ['to', 'lend', 'money', 'to', 'someone'], ['to', 'invest', 'in', 'something', '/', 'someone'], ['to', 'get', 'a', 'return', 'on', 'an', 'investment'], ['to', 'change', 'some', 'money'], ['to', 'borrow', 'money', '–', 'to', 'take', 'money', 'from', 'someone', 'that', 'you', 'will', 'pay', 'back', 'later'], ['to', 'lend', 'money', '–', 'to', 'give', 'someone', 'money', 'that'

We will create a dictionary from a list of documents.  
A dictionary maps every word to a number.

In [None]:
# Build the vocabulary: every distinct token in gen_docs gets an integer id.
dictionary = gensim.corpora.Dictionary(documents=gen_docs)

# Look up in both directions: id -> token, then token -> id.
token_at_5 = dictionary[5]
print(token_at_5)
money_id = dictionary.token2id["money"]
print(money_id)

vocab_size = len(dictionary)
print(f"Number of words in dictionary: {vocab_size}")

to
10
Number of words in dictionary: 58


In [None]:
# Dump the whole id -> token table, one "id token" pair per line.
vocab_size = len(dictionary)
for token_id in range(vocab_size):
    token = dictionary[token_id]
    print(token_id, token)

0 .
1 a
2 account
3 bank
4 open
5 to
6 cheque
7 in
8 pay
9 cash
10 money
11 transfer
12 go
13 into
14 liquidation
15 (
16 )
17 administration
18 company
19 for
20 be
21 debt
22 owe
23 someone
24 loan
25 out
26 take
27 /
28 accidental
29 against
30 damage
31 fire
32 insure
33 something
34 theft
35 pension
36 savings
37 borrow
38 from
39 by
40 installments
41 lend
42 invest
43 an
44 get
45 investment
46 on
47 return
48 change
49 some
50 back
51 later
52 that
53 will
54 you
55 –
56 give
57 they


Now we will create a corpus. A corpus is a list of bags of words.  
A bag-of-words representation for a document just lists the number of times each word occurs in the document.

In [None]:
# Convert every tokenized document to its bag-of-words form:
# a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, gen_docs))

for bow in corpus:
    print(bow)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]
[(1, 1), (5, 1), (6, 1), (7, 1), (8, 1)]
[(0, 1), (1, 1), (5, 1), (6, 1), (9, 1)]
[(0, 1), (5, 1), (10, 1), (11, 1)]
[(0, 1), (5, 1), (12, 1), (13, 1), (14, 1)]
[(1, 1), (5, 1), (12, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
[(5, 1), (7, 1), (20, 1), (21, 1)]
[(5, 2), (10, 1), (15, 1), (16, 1), (22, 1), (23, 1)]
[(1, 1), (5, 1), (24, 1), (25, 1), (26, 1)]
[(5, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]
[(1, 1), (2, 1), (5, 1), (8, 1), (13, 1), (27, 1), (35, 1), (36, 1)]
[(5, 1), (10, 1), (23, 1), (37, 1), (38, 1)]
[(5, 1), (8, 1), (39, 1), (40, 1)]
[(5, 2), (10, 1), (23, 1), (41, 1)]
[(5, 1), (7, 1), (23, 1), (27, 1), (33, 1), (42, 1)]
[(1, 1), (5, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1)]
[(5, 1), (10, 1), (48, 1), (49, 1)]
[(5, 2), (8, 1), (10, 2), (23, 1), (26, 1), (37, 1), (38, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)]
[(5, 2), (8, 1), (10, 2), (23, 1), (41, 1), 

Now we create a tf-idf model from the corpus.

In [None]:
# Fit the tf-idf weighting on the bag-of-words corpus; the model's repr
# reports how many documents and non-zero entries it was fitted on.
tf_idf = gensim.models.TfidfModel(corpus=corpus)
print(tf_idf)

TfidfModel<num_docs=19, num_nnz=121>


In [None]:
# Apply the tf-idf model to the whole corpus and show each document as
# (token_id, weight) pairs. Note that id 5 ('to'), which occurs in every
# document, no longer appears in the printed vectors.
weighted_corpus = tf_idf[corpus]
for weighted_doc in weighted_corpus:
    print(weighted_doc)

[(0, 0.3065656160153156), (1, 0.19646097182933306), (2, 0.4429426184684284), (3, 0.5793196209215412), (4, 0.5793196209215412)]
[(1, 0.2976418942353754), (6, 0.6710660075175084), (7, 0.5502047973206687), (8, 0.39793768024127935)]
[(0, 0.3761077648158145), (1, 0.24102669421538023), (6, 0.5434208843743974), (9, 0.7107340039329801)]
[(0, 0.44803515490559687), (10, 0.2871209860079739), (11, 0.8466557974540344)]
[(0, 0.35219232005009604), (12, 0.5088665535135118), (13, 0.41721800210818183), (14, 0.6655407869769275)]
[(1, 0.14783978813511728), (12, 0.3333208741697226), (13, 0.2732886809357804), (15, 0.3333208741697226), (16, 0.3333208741697226), (17, 0.4359465863477392), (18, 0.4359465863477392), (19, 0.4359465863477392)]
[(7, 0.40524560474105453), (20, 0.6464425727928418), (21, 0.6464425727928418)]
[(10, 0.21721483485220752), (15, 0.48973445882780997), (16, 0.48973445882780997), (22, 0.6405181375893392), (23, 0.2507479822700011)]
[(1, 0.2063991903606659), (24, 0.6086252124525078), (25, 0.608

In [None]:
# Build a similarity index over the tf-idf weighted corpus.
# The first argument is the filename prefix gensim uses for the index shards
# it writes to disk. A hard-coded "/tmp/" is POSIX-only and shared between
# runs and users, so the shards go into a fresh private temp directory instead.
# NOTE: every run of this cell creates a new directory; remove index_dir when
# the index is no longer needed.
index_dir = tempfile.mkdtemp(prefix="gensim_sims_")
sims = gensim.similarities.Similarity(os.path.join(index_dir, "shard"),
                                      tf_idf[corpus],
                                      num_features=len(dictionary))
print(type(sims))
print(sims)

<class 'gensim.similarities.docsim.Similarity'>
Similarity<19 documents in 0 shards stored under /tmp/>


Now create a query document and convert it to tf-idf.

In [None]:
# Tokenize and lowercase the query exactly the way the corpus documents were.
query_text = "I’d like to open a savings account"
query_doc = list(map(str.lower, casual_tokenize(query_text)))
print(query_doc)

['i', '’', 'd', 'like', 'to', 'open', 'a', 'savings', 'account']


In [None]:
# Bag-of-words for the query, using the corpus dictionary. Only tokens already
# in the dictionary are counted; the rest of the query is dropped.
query_doc_bow = dictionary.doc2bow(document=query_doc)
print(query_doc_bow)

[(1, 1), (2, 1), (4, 1), (5, 1), (36, 1)]


Convert the query's bag of words to tf-idf, then compute an array of its similarities to every document in the corpus.

In [None]:
# Re-weight the query's bag of words with the tf-idf model fitted on the
# corpus, so the query and the indexed documents use the same weighting.
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

[(1, 0.2063991903606659), (2, 0.4653494125415236), (4, 0.6086252124525078), (36, 0.6086252124525078)]


In [None]:
# Query the index: the result holds one similarity score per indexed document.
# Left as the last expression so the notebook displays the array.
sims[query_doc_tf_idf]

Out[17]: array([0.599261  , 0.06143305, 0.04974772, 0.        , 0.        ,
       0.03051401, 0.        , 0.        , 0.04260062, 0.        ,
       0.5384151 , 0.        , 0.        , 0.        , 0.        ,
       0.03094874, 0.        , 0.        , 0.        ], dtype=float32)

Finally, print the document that is most similar to the query.

In [None]:
# Score the query once and pick the best-scoring document.
query_sims = sims[query_doc_tf_idf]
best_idx = int(np.argmax(query_sims))

# np.argmax returns index 0 when every score is 0, which would wrongly report
# the first document as a match for a query sharing no terms with the corpus.
if query_sims[best_idx] > 0:
    print(raw_documents[best_idx])
else:
    print("No document shares any terms with the query.")

to open a bank account.
