In [15]:
'''
Basic cosine similarity metric example.

In this example, we first build a representation lexicon, and then, 
for each sentence, we compute the cosine similarity with respect to the others.
The current implementation is not efficient but is easy to understand. For
real applications replace it by one that uses, for example, a defaultdict data
structure.

Author: Fabrício Galende M. de Carvalho, DSc
'''

import numpy as np
import pandas as pd

# Toy corpus (Portuguese sentences): every sentence is turned into a count
# vector and compared against all the others with cosine similarity.
sentences = ["gostaria de obter ajuda e essa ajuda é importante",
             "acho que vai chover e está nublado",
             "não acho que vai chover hoje pois faz sol",
             "como será que posso obter ajuda",
             "não consigo obter ajuda de jeito nenhum"]

def build_representation_lexicon(sentences, representation_lexicon):
    """Extend ``representation_lexicon`` in place with the new words of ``sentences``.

    Each sentence is tokenized by splitting on whitespace. Every word that is
    not yet in the lexicon is appended, so the lexicon keeps first-occurrence
    order and never contains a word twice because of this function.

    Args:
        sentences: Iterable of sentence strings.
        representation_lexicon: List of words; it is mutated in place. Words
            already present are kept and are not added again.

    Returns:
        None. The result is the mutated ``representation_lexicon``.
    """
    # Shadow the list with a set: membership tests on the list itself are
    # linear, which makes the whole build quadratic in the vocabulary size.
    known_words = set(representation_lexicon)
    for sentence in sentences:
        # In a real application, replace split() by the output of a
        # preprocessing tokenizer.
        for word in sentence.split():
            if word not in known_words:
                known_words.add(word)
                representation_lexicon.append(word)

def build_feature_vector(sentence, representation_lexicon):
    """Return the word-count vector of ``sentence`` over ``representation_lexicon``.

    Args:
        sentence: Sentence string; it is tokenized by splitting on whitespace.
        representation_lexicon: Sequence of words. Position ``k`` of the
            result counts the occurrences of ``representation_lexicon[k]``.

    Returns:
        A 1-D float NumPy array with one entry per lexicon word. Sentence
        words that are not in the lexicon are ignored.
    """
    # Tokenize and count once, instead of re-splitting and rescanning the
    # sentence for every lexicon position.
    token_counts = {}
    for token in sentence.split():
        token_counts[token] = token_counts.get(token, 0) + 1

    feature_vector = np.zeros(len(representation_lexicon))
    for pos, word in enumerate(representation_lexicon):
        feature_vector[pos] = token_counts.get(word, 0)
    return feature_vector

# Build the shared vocabulary; the helper fills the list in place.
representation_lexicon = []
build_representation_lexicon(sentences, representation_lexicon)

# One count vector and one "sent_<index>" label per sentence, in corpus order.
feature_vectors = [
    build_feature_vector(text, representation_lexicon) for text in sentences
]
sentence_labels = ["sent_" + str(idx) for idx in range(len(sentences))]

# Show the raw vectors, one per line, followed by blank lines as a separator.
print("Feature vectors corresponding to each sentence: ")
for vector in feature_vectors:
    print(vector)
print("\n")

# Each vector's Euclidean norm is needed for every pair it takes part in, so
# compute all norms once up front instead of inside the pairwise loop.
vector_norms = [np.linalg.norm(vector) for vector in feature_vectors]

# cos(a, b) = (a . b) / (|a| * |b|), evaluated for every ordered pair of
# sentences; row x / column y holds the similarity of sentence x to sentence y.
# NOTE: an all-zero vector (e.g. from an empty sentence) has norm 0, so its
# row and column come out as nan.
similarity_matrix = [
    [
        np.dot(vec_x, vec_y) / (norm_x * norm_y)
        for vec_y, norm_y in zip(feature_vectors, vector_norms)
    ]
    for vec_x, norm_x in zip(feature_vectors, vector_norms)
]

# Label rows and columns with the sentence labels so cells can be looked up by name.
similarity_matrix_df = pd.DataFrame(data=similarity_matrix, index=sentence_labels, columns=sentence_labels)
print("Similarity cosine matrix: \n")
print(similarity_matrix_df)
print("\n")

print("Similarity cosine between sentence 1 and sentence 2: ", similarity_matrix_df.loc["sent_1", "sent_2"])

Feature vectors corresponding to each sentence: 
[1. 1. 1. 2. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.
 0.]
[0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0.]
[0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 1.]


Similarity cosine matrix: 

          sent_0    sent_1    sent_2    sent_3    sent_4
sent_0  1.000000  0.113961  0.000000  0.369274  0.455842
sent_1  0.113961  1.000000  0.503953  0.154303  0.000000
sent_2  0.000000  0.503953  1.000000  0.136083  0.125988
sent_3  0.369274  0.154303  0.136083  1.000000  0.308607
sent_4  0.455842  0.000000  0.125988  0.308607  1.000000


Similarity cosine between sentence 1 and sentence 2:  0.5039526306789696
