In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [6]:
# Define the documents
#
# Three short texts on the same political topic. Each literal is wrapped as
# adjacent string pieces, which Python joins into the same single string.

doc1 = (
    "Mr. Trump became president after winning the political election. "
    "Though he lost the support of some republican friends, "
    "Trump is friends with President Putin"
)

doc2 = (
    "President Trump says Putin had no political interference is the election outcome. "
    "He says it was a witchhunt by political parties. "
    "He claimed President Putin is a friend who had nothing to do with the election"
)

doc3 = (
    "Post elections, Vladimir Putin became President of Russia. "
    "President Putin had served as the Prime Minister earlier in his political career"
)


In [7]:
# Corpus for the TF-IDF comparison: the three political documents, in order.
documents = [
    doc1,
    doc2,
    doc3,
]

In [11]:
# Create the Document Term Matrix
#
# One row per document, one column per vocabulary term; values are the TF-IDF
# weights computed by the vectorizer with English stop words removed.

tfidf = TfidfVectorizer(stop_words='english')
sparse_matrix = tfidf.fit_transform(documents)

# `toarray()` gives a plain ndarray; `todense()` returns the legacy `np.matrix`.
doc_matrix = sparse_matrix.toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Derive the row labels from the corpus size so the cell keeps working when
# `documents` holds more (or fewer) than three entries.
doc_labels = ['doc{}'.format(pos) for pos in range(1, len(documents) + 1)]

df = pd.DataFrame(doc_matrix, columns=feature_names, index=doc_labels)
df

Unnamed: 0,career,claimed,earlier,election,elections,friend,friends,interference,lost,minister,mr,outcome,parties,political,post,president,prime,putin,republican,russia,says,served,support,trump,vladimir,winning,witchhunt
doc1,0.0,0.0,0.0,0.203368,0.0,0.0,0.53481,0.0,0.267405,0.0,0.267405,0.0,0.0,0.157934,0.0,0.315867,0.0,0.157934,0.267405,0.0,0.0,0.0,0.267405,0.406737,0.0,0.267405,0.0
doc2,0.0,0.241982,0.0,0.368067,0.0,0.241982,0.0,0.241982,0.0,0.0,0.0,0.241982,0.241982,0.285837,0.0,0.285837,0.0,0.285837,0.0,0.0,0.483963,0.0,0.0,0.184033,0.0,0.0,0.241982
doc3,0.287012,0.0,0.287012,0.0,0.287012,0.0,0.0,0.0,0.0,0.287012,0.0,0.0,0.0,0.169514,0.287012,0.339028,0.287012,0.339028,0.0,0.287012,0.0,0.287012,0.0,0.0,0.287012,0.0,0.0


In [12]:
# Compute cosine similarity
#
# Every TF-IDF row of `df` is compared against every other row, so the result
# is a square matrix with one row and one column per document.

from sklearn.metrics.pairwise import cosine_similarity

tfidf_cosine = cosine_similarity(df, df)
print(tfidf_cosine)

[[1.         0.33027897 0.18740386]
 [0.33027897 1.         0.24226661]
 [0.18740386 0.24226661 1.        ]]


In [13]:
# Three more texts, this time about food. Each literal is wrapped as adjacent
# string pieces, which Python joins into the same single string.
# The trailing space at the end of doc4 is kept on purpose so the value is unchanged.
doc4 = (
    "Soup is a primarily liquid food, generally served warm or hot "
    "(but may be cool or cold), that is made by combining ingredients "
    "of meat or vegetables with stock, juice, water, or another liquid. "
)

doc5 = (
    "Noodles are a staple food in many cultures. "
    "They are made from unleavened dough which is stretched, extruded, "
    "or rolled flat and cut into one of a variety of shapes."
)

doc6 = (
    "Dosa is a type of pancake from the Indian subcontinent, made from a fermented batter. "
    "It is somewhat similar to a crepe in appearance. "
    "Its main ingredients are rice and black gram."
)


In [14]:
# Rebind `documents` to the combined corpus: three political texts followed by
# three food texts. Cells below read this six-document list.
documents = [
    doc1, doc2, doc3,
    doc4, doc5, doc6,
]

Suppose you have another set of documents on a completely different topic, say ‘food’. You want a similarity metric that gives higher scores for documents belonging to the same topic and lower scores when comparing documents from different topics.

To achieve this, the semantic meaning of the words must be considered. That is, words similar in meaning should be treated as similar.

To do this, we can convert the words into their respective word vectors and then compute the similarities between those vectors.

So, to get the word vectors, we need a word embedding model.
We can use a pretrained FastText model loaded through Gensim.

In [17]:
import gensim
from gensim.matutils import softcossim
from gensim import corpora
import gensim.downloader as api
from gensim.utils import simple_preprocess

In [18]:
# Download the fasttext model
#
# Loaded by name through gensim's downloader API (`api`).
# NOTE(review): presumably a large download on first use — confirm before
# running this cell on a metered connection.

FASTTEXT_MODEL_NAME = 'fasttext-wiki-news-subwords-300'
fasttext = api.load(FASTTEXT_MODEL_NAME)



In [19]:
# Tokenize every document and build the token <-> id dictionary from the result.
dictionary = corpora.Dictionary(list(map(simple_preprocess, documents)))

# Ask the embedding model for a term-similarity matrix over the dictionary.
# NOTE(review): `similarity_matrix` on the loaded vectors looks like gensim 3.x
# API — confirm the pinned gensim version before upgrading.
similarity_matrix = fasttext.similarity_matrix(
    dictionary,
    tfidf=None,
    threshold=0.0,
    exponent=2.0,
    nonzero_limit=100,
)

In [22]:
# Convert each document to its bag-of-words form using the shared dictionary.
# The six names are bound in the same order as the source documents.
sent1, sent2, sent3, sent4, sent5, sent6 = (
    dictionary.doc2bow(simple_preprocess(text))
    for text in (doc1, doc2, doc3, doc4, doc5, doc6)
)

In [23]:
# Bag-of-words documents in corpus order: political texts first, then food.
sentences = [
    sent1, sent2, sent3,
    sent4, sent5, sent6,
]

In [24]:
# Soft cosine similarity between the first two documents, using the
# term-similarity matrix computed from the embeddings.
soft_score_1_2 = softcossim(sent1, sent2, similarity_matrix)
print(soft_score_1_2)

0.5842470477718544


In [25]:
import numpy as np
import pandas as pd

# Pairwise soft cosine similarity for all documents in `sentences`.
# Entry [i][j] compares document i with document j, rounded to 2 decimals.
# Plain index iteration is used: a meshgrid over the indices paired with zip
# yields exactly these (i, j) pairs, only less readably.
doc_count = len(sentences)
cossim_mat = pd.DataFrame(
    [
        [
            round(softcossim(sentences[i], sentences[j], similarity_matrix), 2)
            for j in range(doc_count)
        ]
        for i in range(doc_count)
    ]
)
cossim_mat

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.58,0.56,0.28,0.34,0.4
1,0.58,1.0,0.54,0.25,0.31,0.43
2,0.56,0.54,1.0,0.19,0.25,0.36
3,0.28,0.25,0.19,1.0,0.5,0.38
4,0.34,0.31,0.25,0.5,1.0,0.56
5,0.4,0.43,0.36,0.38,0.56,1.0
