In [11]:
sentences = ["It was the best of times",
             "it was the worst of times",
             "it was the age of wisdom",
             "it was the age of foolishness"]

tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]
print(tokenized_sentences)

[['It', 'was', 'the', 'best', 'of', 'times'], ['it', 'was', 'the', 'worst', 'of', 'times'], ['it', 'was', 'the', 'age', 'of', 'wisdom'], ['it', 'was', 'the', 'age', 'of', 'foolishness']]


In [16]:
vocabulary = set([w for s in tokenized_sentences for w in s])
print(vocabulary)

{'was', 'the', 'age', 'times', 'It', 'best', 'of', 'foolishness', 'worst', 'wisdom', 'it'}


In [17]:
import pandas as pd
pd.DataFrame([[w,i] for i,w in enumerate(vocabulary)])

Unnamed: 0,0,1
0,was,0
1,the,1
2,age,2
3,times,3
4,It,4
5,best,5
6,of,6
7,foolishness,7
8,worst,8
9,wisdom,9


In [22]:
def one_hot(tokens):
    return [1 if w in tokens else 0 for w in vocabulary]

onehot = [one_hot(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

for (i,w) in zip(sentences,onehot):
    print("%s: %s" % (w,i))

[1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0]: It was the best of times
[1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1]: it was the worst of times
[1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1]: it was the age of wisdom
[1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1]: it was the age of foolishness


In [23]:
pd.DataFrame(onehot, columns=vocabulary)   # Document Term Matrix. 

Unnamed: 0,was,the,age,times,It,best,of,foolishness,worst,wisdom,it
0,1,1,0,1,1,1,1,0,0,0,0
1,1,1,0,1,0,0,1,0,1,0,1
2,1,1,1,0,0,0,1,0,0,1,1
3,1,1,1,0,0,0,1,1,0,0,1


In [30]:
import numpy as np
np.dot(onehot[0],onehot[1])    # Calculating the similarity 

4

In [33]:
np.dot(onehot,np.transpose(onehot))

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

In [39]:
more_sentences = sentences+ ["I Like to Watch Movies. Akash likes movies too.","I like to watch cricket too."]

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

cv.fit(more_sentences)

In [43]:
print(cv.get_feature_names())

['age', 'akash', 'best', 'cricket', 'foolishness', 'it', 'like', 'likes', 'movies', 'of', 'the', 'times', 'to', 'too', 'was', 'watch', 'wisdom', 'worst']




In [45]:
dt = cv.transform(more_sentences)

In [46]:
pd.DataFrame(dt.toarray(), columns= cv.get_feature_names())   # Getting the Document-Term Matrix.

Unnamed: 0,age,akash,best,cricket,foolishness,it,like,likes,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,1,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,0,1,1,0,0,0,1,1,0,0,0,1,0,0,0
4,0,1,0,0,0,0,1,1,2,0,0,0,1,1,0,1,0,0
5,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0


In [50]:
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity(dt,dt))    # Similarity Matrix

[[1.         0.83333333 0.66666667 0.66666667 0.         0.        ]
 [0.83333333 1.         0.66666667 0.66666667 0.         0.        ]
 [0.66666667 0.66666667 1.         0.83333333 0.         0.        ]
 [0.66666667 0.66666667 0.83333333 1.         0.         0.        ]
 [0.         0.         0.         0.         1.         0.56568542]
 [0.         0.         0.         0.         0.56568542 1.        ]]


In [53]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()
tfidf_df = tfidf.fit_transform(dt)

In [55]:
pd.DataFrame(tfidf_df.toarray(), columns=cv.get_feature_names())



Unnamed: 0,age,akash,best,cricket,foolishness,it,like,likes,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.56978,0.0,0.0,0.338027,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.338027,0.338027,0.467228,0.0,0.0,0.338027,0.0,0.0,0.56978
2,0.467228,0.0,0.0,0.0,0.0,0.338027,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.56978,0.0
3,0.467228,0.0,0.0,0.0,0.56978,0.338027,0.0,0.0,0.0,0.338027,0.338027,0.0,0.0,0.0,0.338027,0.0,0.0,0.0
4,0.0,0.339233,0.0,0.0,0.0,0.0,0.278176,0.339233,0.678465,0.0,0.0,0.0,0.278176,0.278176,0.0,0.278176,0.0,0.0
5,0.0,0.0,0.0,0.520601,0.0,0.0,0.4269,0.0,0.0,0.0,0.0,0.0,0.4269,0.4269,0.0,0.4269,0.0,0.0


In [56]:
pd.DataFrame(cosine_similarity(tfidf_df,tfidf_df))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.675351,0.457049,0.457049,0.0,0.0
1,0.675351,1.0,0.457049,0.457049,0.0,0.0
2,0.457049,0.457049,1.0,0.675351,0.0,0.0
3,0.457049,0.457049,0.675351,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.475013
5,0.0,0.0,0.0,0.0,0.475013,1.0
