In [1]:
import numpy as np
import pandas as pd

import re
from tqdm import tqdm

import collections

from sklearn.cluster import KMeans

from nltk.stem import WordNetLemmatizer  # For Lemmetization of words
from nltk.corpus import stopwords  # Load list of stopwords
from nltk import word_tokenize # Convert paragraph in tokens

import pickle
import sys

from gensim.models import word2vec # For represent words in vectors
import gensim

In [2]:

text_data = pd.read_csv("Precily_Text_Similarity.csv")
print("Shape of text_data : ", text_data.shape)
text_data.head(3)

Shape of text_data :  (3000, 2)


Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...


In [3]:
text_data.isnull().sum() # Check if text data have any null values

text1    0
text2    0
dtype: int64

In [4]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [7]:
import nltk


In [8]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\ganta karthik
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
# Combining all the above stundents 

preprocessed_text1 = []

# tqdm is for printing the status bar

for sentance in tqdm(text_data['text1'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)

    sent = ' '.join(e for e in sent.split() if e not in stopwords.words('english'))
    preprocessed_text1.append(sent.lower().strip())

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [06:55<00:00,  7.22it/s]


In [10]:
# Merging preprocessed_text1 in text_data

text_data['text1'] = preprocessed_text1
text_data.head(3)

Unnamed: 0,text1,text2
0,broadband challenges tv viewing number europea...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested drug find rap mogul marion s...,amnesty chief laments war failure the lack of ...
2,player burn worries robinson england coach and...,hanks greeted at wintry premiere hollywood sta...


In [11]:
# Combining all the above stundents 
from tqdm import tqdm
preprocessed_text2 = []

# tqdm is for printing the status bar
for sentance in tqdm(text_data['text2'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
   
    sent = ' '.join(e for e in sent.split() if e not in stopwords.words('english'))
    preprocessed_text2.append(sent.lower().strip())

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [05:45<00:00,  8.68it/s]


In [12]:
# Merging preprocessed_text2 in text_data

text_data['text2'] = preprocessed_text2

text_data.head(3)

Unnamed: 0,text1,text2
0,broadband challenges tv viewing number europea...,gardener wins double glasgow britain jason gar...
1,rap boss arrested drug find rap mogul marion s...,amnesty chief laments war failure lack public ...
2,player burn worries robinson england coach and...,hanks greeted wintry premiere hollywood star t...


In [13]:
def word_tokenizer(text):
            #tokenizes and stems the text
            tokens = word_tokenize(text)
            lemmatizer = WordNetLemmatizer() 
            tokens = [lemmatizer.lemmatize(t) for t in tokens]
            return tokens

In [20]:
wordmodelfile="GoogleNews-vectors-negative300.bin.gz"
wordmodel= gensim.models.KeyedVectors.load_word2vec_format(wordmodelfile, binary=True)

In [22]:
import nltk

In [23]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\ganta karthik
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [25]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\ganta karthik
[nltk_data]     kumar\AppData\Roaming\nltk_data...


True

In [27]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to C:\Users\ganta karthik
[nltk_data]     kumar\AppData\Roaming\nltk_data...


True

In [35]:
similarity = []  # List to store similarity scores

for ind in text_data.index:
    s1 = text_data['text1'][ind]
    s2 = text_data['text2'][ind]

    if s1 == s2:
        similarity.append(0.0)  # 0 means highly similar
    else:
        s1words = word_tokenizer(s1)
        s2words = word_tokenizer(s2)

        vocab = wordmodel.key_to_index  # Use the key_to_index dictionary instead of vocab

        if len(s1words) == 0 or len(s2words) == 0:
            similarity.append(1.0)  # Consider two empty sentences as highly similar
        else:
            # Remove sentence words not found in the vocab
            s1words = [word for word in s1words if word in vocab]
            s2words = [word for word in s2words if word in vocab]

            # Compute and append the similarity score
            similarity_score = 1 - wordmodel.n_similarity(s1words, s2words)
            similarity.append(similarity_score)  # 1 means highly dissimilar & 0 means highly similar


In [38]:
import pandas as pd

# Assuming similarity is the list of similarity scores.
similarity = [...]  # Your list of similarity scores

# Create a new DataFrame with the similarity scores and assign a unique identifier as the index
final_score = pd.DataFrame({'Similarity_score': similarity})

# Display the first three rows of the final_score DataFrame
print(final_score.head(3))


  Similarity_score
0         Ellipsis


In [39]:
final_score.to_csv('final_score.csv',index=False)