# Import Libraries:

In [1]:
import numpy as np
import math
import re
from numpy.linalg import norm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
def similarity(s1,s2):
    """
    Compute the cosine similarity between two vectors (e.g. sentence embeddings).

    Parameters
    ----------
    s1, s2 : array-like
        1-D vectors of the same length.

    Returns
    -------
    float
        dot(s1, s2) / (||s1|| * ||s2||). The value lies in [-1, 1]: 1 for
        vectors pointing the same way, 0 for orthogonal vectors, -1 for
        opposite vectors. If either vector has zero norm the cosine is
        undefined; 0.0 is returned instead of dividing by zero.

    """

    product = np.dot(s1, s2) # dot product between the two vectors
    denominator = norm(s1) * norm(s2) # product of the Euclidean norms

    # A zero-length vector has no direction; avoid the 0/0 -> nan (and the
    # accompanying RuntimeWarning) by reporting "no similarity".
    if denominator == 0:
        return 0.0

    return product / denominator

# Word2Vec

In [3]:
from gensim.models import KeyedVectors
from gensim import models

In [4]:
# Pretrained Word2Vec embeddings trained on Google News data, stored in the
# binary word2vec format (hence binary=True).
W2V_MODEL_PATH = 'GoogleNews-vectors-negative300.bin'

w2v_model = KeyedVectors.load_word2vec_format(W2V_MODEL_PATH, binary=True)

In [5]:
def get_w2v(sentence, keyed_vectors=None):
    """
    Return the average Word2Vec vector of a sentence.

    The sentence is tokenised on whitespace and the vectors of all tokens
    found in the embedding vocabulary are averaged.

    Parameters
    ----------
    sentence : str
        Raw sentence; tokens are obtained with ``str.split()``.
    keyed_vectors : optional
        Embedding lookup supporting ``word in kv``, ``kv[word]`` and a
        ``vector_size`` attribute. Defaults to the notebook-level
        ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``keyed_vectors.vector_size``. Tokens missing from
        the vocabulary are skipped (they do not count towards the mean). If
        the sentence is empty or contains no known token, the zero vector is
        returned.

    """

    if keyed_vectors is None:
        keyed_vectors = w2v_model # pretrained model loaded earlier in the notebook

    # Take the dimension from the model instead of hard-coding 300.
    sent_vector = np.zeros(keyed_vectors.vector_size)
    known_tokens = 0

    for word in sentence.split(): # whitespace tokenisation
        if word not in keyed_vectors:
            # Out-of-vocabulary token: indexing would raise KeyError, so skip it.
            continue
        sent_vector += keyed_vectors[word] # accumulate for the mean
        known_tokens += 1

    if known_tokens == 0:
        # Nothing to average; avoid dividing by zero.
        return sent_vector

    return sent_vector / known_tokens # mean over the tokens actually embedded

In [6]:
def final_score(sentence1,sentence2):
    """
    Score two raw sentences: each one is turned into its averaged Word2Vec
    vector via get_w2v, and the cosine similarity of the two vectors is returned.

    """
    # Vectorise in argument order, then compare the pair.
    sentence_vectors = [get_w2v(raw_text) for raw_text in (sentence1, sentence2)]
    return similarity(*sentence_vectors)

In [7]:
# Example pair: the two sentences differ only in the instrument being played.
s1 = "girl is playing harp"
s2 =  "girl is playing keyboard"

# Echo each sentence followed by a 100-character separator line.
for line in (s1, "*"*100, s2, "*"*100):
    print(line)

girl is playing harp
****************************************************************************************************
girl is playing keyboard
****************************************************************************************************


In [8]:
# Cosine similarity of the averaged Word2Vec vectors for the example pair.
w2v_score = final_score(s1,s2)
print("Similarity score through Average (W2V) :",w2v_score)

Similarity score through Average (W2V) : 0.8422383754578019


# SentenceTransformer:

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
model_name = 'bert-base-nli-mean-tokens' # "mean-tokens" checkpoint: presumably the sentence vector is the mean of the token embeddings rather than the CLS token — confirm against the model card

In [11]:
# Instantiate the sentence encoder for the checkpoint named in model_name.
model = SentenceTransformer(model_name)

# Testing

In [12]:
# Three test sentences: (1) and (2) share most of their wording but describe
# different activities (studying vs. partying); (3) is about studying for exams
# like (1), but phrased differently.
testing = ["Last night I was studying the whole night because of upcoming exam",
          "Last night I was partying the whole night because of postponed exam",
          "As exams are approaching I must study at long stretch at night"]

# Encode the whole list in one call; the result has one embedding row per sentence.
tut_vectors = model.encode(testing)

In [13]:
tut_vectors.shape # (number of sentences, embedding dimension)

(3, 768)

In [14]:
similarity(tut_vectors[0],tut_vectors[1]) # sentences (1) and (2): studying vs. partying — near-identical wording, different meaning

0.7606491

In [15]:
similarity(tut_vectors[0],tut_vectors[2]) # sentences (1) and (3): both about studying at night for exams, different wording

0.69074416

In [16]:
similarity(tut_vectors[1],tut_vectors[2]) # sentences (2) and (3): partying vs. studying for exams

0.5401103

In [17]:
# Same harp/keyboard pair as the Word2Vec example, here written with articles.
lst_bert = ["A girl is playing a harp","A girl is playing a keyboard"]
bert_vectors = model.encode(lst_bert) # one embedding row per sentence
bert_vectors.shape

(2, 768)

In [18]:
# Cosine similarity of the two sentence-transformer embeddings for the harp/keyboard pair.
bert_score = similarity(bert_vectors[0],bert_vectors[1])
print("Similarity score through (Sentence_Transformer) :",bert_score)

Similarity score through (Sentence_Transformer) : 0.583888


In [19]:
def sentence_similarity(text_1,text_2):
    """
    Embed both input texts with the notebook-level sentence-transformer model
    (768-dimensional vectors for the checkpoint used in this notebook) and
    return the cosine similarity of the two embeddings.

    """

    # Encode each text separately, in argument order.
    embeddings = [model.encode(text) for text in (text_1, text_2)]
    return similarity(embeddings[0], embeddings[1])

In [20]:
text_1 = "A group of men play soccer on the beach."
text_2 = "A group of boys are playing soccer on the beach."

# Show both inputs, each followed by a 125-character separator line, then the score.
for line in (text_1, "*"*125, text_2, "*"*125):
    print(line)
print("Similarity Score :",sentence_similarity(text_1,text_2))

A group of men play soccer on the beach.
*****************************************************************************************************************************
A group of boys are playing soccer on the beach.
*****************************************************************************************************************************
Similarity Score : 0.97384673
