**Mount Drive**

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Import Libraries**

In [None]:
import os
import sys
import numpy as np
from numpy import mean,std
import pandas as pd
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec
from nltk import sent_tokenize
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
import tensorflow as tf
import tensorflow_hub as hub
!pip install tensorflow_text
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer

# **Code**

**TF-IDF**

In [None]:
def process_tfidf_similarity(x_base, y_comp, threshold=0.05):
    """Flag whether any comparison document is TF-IDF-similar to the base document.

    The two lists are concatenated and vectorized together so that every
    document is embedded with the same vocabulary. The first row of the
    resulting matrix is the base document; it is compared against all
    remaining rows with cosine similarity.

    Args:
        x_base: List whose first element is the base document (the caller
            passes a one-element list). Any further elements end up among
            the comparison rows.
        y_comp: List of documents to compare against the base document.
        threshold: Minimum cosine similarity for a match (default 0.05).

    Returns:
        1 if the best cosine similarity is >= ``threshold``, otherwise 0.
        0 is also returned when there is nothing to compare against.
    """
    documents = x_base + y_comp
    # With fewer than two rows there is no comparison row to score.
    if len(documents) < 2:
        return 0

    # Fit once on the combined corpus so all vectors share one feature space.
    embeddings = TfidfVectorizer().fit_transform(documents)
    scores = cosine_similarity(embeddings[0:1], embeddings[1:]).flatten()

    return 1 if scores.max() >= threshold else 0

**Jaccard**

In [None]:
def preprocess(text):
	"""Tokenize `text` and return its lemmatized content words.

	The input is coerced with str() (its case is left unchanged) and split
	with NLTK's word_tokenize. A token is kept only when it is not an
	English stop word, is not found in string.punctuation, and is longer
	than one character. Kept tokens are run through the WordNet lemmatizer.
	"""
	raw_text = str(text)
	english_stop_words = set(stopwords.words('english'))
	candidates = word_tokenize(raw_text)

	return [
		lemmatizer.lemmatize(token)
		for token in candidates
		if token not in english_stop_words
		and token not in string.punctuation
		and len(token) > 1
	]

def calculate_jaccard(word_tokens1, word_tokens2):
	"""Return the Jaccard similarity of two token collections.

	The score is |A ∩ B| / |A ∪ B| computed over the distinct tokens of each
	input, so repeated tokens do not change the result.

	Args:
		word_tokens1: Iterable of hashable tokens for the first document.
		word_tokens2: Iterable of hashable tokens for the second document.

	Returns:
		A float in [0, 1]. When both inputs are empty the union is empty and
		0.0 is returned instead of dividing by zero.
	"""
	tokens1 = set(word_tokens1)
	tokens2 = set(word_tokens2)

	union = tokens1 | tokens2
	# Two empty documents share nothing measurable; avoid ZeroDivisionError.
	if not union:
		return 0.0

	return len(tokens1 & tokens2) / len(union)

def process_jaccard_similarity(base_document, documents, threshold=0.05):
	"""Flag whether any document is Jaccard-similar to the base document.

	Every text is tokenized with `preprocess`, each document is scored
	against the base document with `calculate_jaccard`, and the best score
	is compared with the threshold.

	Args:
		base_document: Text of the document being compared against.
		documents: Sequence of texts to compare with `base_document`.
		threshold: Minimum Jaccard score for a match (default 0.05).

	Returns:
		1 if the highest Jaccard score is >= `threshold`, otherwise 0.
		0 is also returned when `documents` is empty.
	"""
	# Nothing to compare against: skip tokenization entirely.
	if len(documents) == 0:
		return 0

	# Tokenize the base document once and reuse it for every comparison.
	base_tokens = preprocess(base_document)
	scores = [
		calculate_jaccard(base_tokens, preprocess(document))
		for document in documents
	]

	return 1 if max(scores) >= threshold else 0

**Doc2Vec**

In [None]:
def preprocess(text):
	"""Tokenize `text` and return its lemmatized content words.

	NOTE(review): this redefines the `preprocess` already declared in the
	Jaccard cell with the same logic; the later definition wins once both
	cells have run.

	The input is coerced with str() (its case is left unchanged) and split
	with NLTK's word_tokenize. A token is kept only when it is not an
	English stop word, is not found in string.punctuation, and is longer
	than one character. Kept tokens are run through the WordNet lemmatizer.
	"""
	raw_text = str(text)
	english_stop_words = set(stopwords.words('english'))
	candidates = word_tokenize(raw_text)

	return [
		lemmatizer.lemmatize(token)
		for token in candidates
		if token not in english_stop_words
		and token not in string.punctuation
		and len(token) > 1
	]

def process_doc2vec_similarity(base_document, documents, model_path='', threshold=0.50):
	"""Flag whether any document is similar to the base document under Doc2Vec.

	Each text is tokenized with `preprocess`, restricted to tokens present in
	the model vocabulary, and turned into a vector with `infer_vector`. The
	base vector is compared with every document vector by cosine similarity.

	Args:
		base_document: Text of the document being compared against.
		documents: Sequence of texts to compare with `base_document`.
		model_path: Path of a saved Doc2Vec model. The default is an empty
			placeholder, so a real path must be supplied to score anything.
		threshold: Minimum cosine similarity for a match (default 0.50).

	Returns:
		1 if the highest cosine similarity is >= `threshold`, otherwise 0.
		0 is also returned when `documents` is empty.

	Raises:
		ValueError: If `documents` is non-empty and no `model_path` is given.
	"""
	# Nothing to compare against: avoid loading the model at all.
	if len(documents) == 0:
		return 0

	if not model_path:
		raise ValueError(
			"process_doc2vec_similarity needs the path of a saved Doc2Vec "
			"model; pass it as model_path."
		)

	model = Doc2Vec.load(model_path)

	# gensim >= 4 exposes the vocabulary as `key_to_index`; older releases
	# only have `vocab`. Accessing `vocab` on gensim 4 raises, so probe first.
	vocabulary = getattr(model.wv, 'key_to_index', None)
	if vocabulary is None:
		vocabulary = model.wv.vocab

	def infer(text):
		# Only handle words that appear in the pretrained model vocabulary.
		known_tokens = [token for token in preprocess(text) if token in vocabulary]
		return model.infer_vector(known_tokens)

	base_vector = infer(base_document)
	vectors = [infer(document) for document in documents]

	scores = cosine_similarity([base_vector], vectors).flatten()

	return 1 if scores.max() >= threshold else 0

**BERT**

In [None]:
# Loaded SentenceTransformer models keyed by name, so repeated calls reuse them.
_BERT_MODELS = {}


def _get_bert_model(model_name):
	"""Return the SentenceTransformer for `model_name`, loading it on first use only."""
	if model_name not in _BERT_MODELS:
		_BERT_MODELS[model_name] = SentenceTransformer(model_name)
	return _BERT_MODELS[model_name]


def _mean_sentence_embedding(model, text):
	"""Encode each sentence of `text` and average the vectors into one document vector."""
	sentences = sent_tokenize(str(text))
	return np.mean(np.array(model.encode(sentences)), axis=0)


def process_bert_similarity(base_document, documents, threshold=0.50):
	"""Flag whether any document is similar to the base document under Sentence-BERT.

	Texts are split into sentences because the model is fed sentence by
	sentence rather than whole documents; the sentence embeddings of each
	text are averaged and the averaged vectors are compared with cosine
	similarity.

	Args:
		base_document: Text of the document being compared against.
		documents: Sequence of texts to compare with `base_document`.
		threshold: Minimum cosine similarity for a match (default 0.50).

	Returns:
		1 if the highest cosine similarity is >= `threshold`, otherwise 0.
		0 is also returned when `documents` is empty.
	"""
	# Nothing to compare against: avoid downloading/loading the model.
	if len(documents) == 0:
		return 0

	# The pretrained model is loaded once and reused by later calls.
	model = _get_bert_model('bert-base-nli-mean-tokens')

	base_embeddings = _mean_sentence_embedding(model, base_document)
	vectors = [_mean_sentence_embedding(model, document) for document in documents]

	scores = cosine_similarity([base_embeddings], vectors).flatten()

	return 1 if scores.max() >= threshold else 0

**Universal Sentence Encoder**

In [None]:
# Loaded TF-Hub modules keyed by handle, so repeated calls reuse them.
_USE_MODELS = {}


def _get_use_model(handle):
	"""Return the TF-Hub module for `handle`, loading it on first use only."""
	if handle not in _USE_MODELS:
		_USE_MODELS[handle] = hub.load(handle)
	return _USE_MODELS[handle]


def process_use_similarity(base_document, documents, threshold=0.50):
	"""Flag whether any document is similar to the base document under USE.

	The base document and the comparison documents are embedded with the
	multilingual Universal Sentence Encoder and compared with cosine
	similarity.

	Args:
		base_document: Text of the document being compared against; it is
			wrapped in a one-element list before being passed to the model.
		documents: Sequence of texts to compare with `base_document`.
		threshold: Minimum cosine similarity for a match (default 0.50).

	Returns:
		1 if the highest cosine similarity is >= `threshold`, otherwise 0.
		0 is also returned when `documents` is empty.
	"""
	# Nothing to compare against: avoid downloading/loading the model.
	if len(documents) == 0:
		return 0

	# The module is loaded once and reused by later calls.
	model = _get_use_model("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

	base_embeddings = model([base_document])
	embeddings = model(documents)
	scores = cosine_similarity(base_embeddings, embeddings).flatten()

	return 1 if scores.max() >= threshold else 0

**Text Preprocessing**

In [None]:
# NLTK resources for the text-cleaning step.
# NOTE(review): nltk, the wordnet/stopwords downloads, WordNetLemmatizer,
# stopwords and `lemmatizer` already appear in the "Import Libraries" cell
# and are repeated here.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Module-level lemmatizer and stemmer instances.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def text_preprocessing(sentence):
  """Clean a raw text and return it as a space-joined string of kept tokens.

  Steps, in order: coerce to str and lower-case, drop the literal '{html}',
  strip '<...>' tags, strip 'http...' URLs, strip digit runs, tokenize on
  word characters, then keep tokens longer than two characters that are not
  English stop words.

  NOTE(review): the returned tokens are neither stemmed nor lemmatized. The
  previous code computed stemmed/lemmatized lists but never used them, so
  the output is unchanged - confirm whether stemming was meant to apply.

  Args:
    sentence: Any value; it is converted with str() before cleaning.

  Returns:
    The kept tokens joined by single spaces (may be an empty string).
  """
  text = str(sentence).lower().replace('{html}', "")
  text = re.sub(r'<.*?>', '', text)      # HTML-like tags
  text = re.sub(r'http\S+', '', text)    # URLs
  text = re.sub('[0-9]+', '', text)      # digit runs

  tokens = RegexpTokenizer(r'\w+').tokenize(text)

  # Build the stop-word set once instead of re-reading the list per token.
  english_stop_words = set(stopwords.words('english'))
  filtered_words = [w for w in tokens if len(w) > 2 and w not in english_stop_words]

  return " ".join(filtered_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Main Code
# Score the first MAX_ROWS text pairs of the CSV with each similarity method
# and store the 0/1 decisions in one result column per method.
df = pd.read_csv('./Text_Similarity.csv')
cols = ['Tf-Idf', 'Jaccard', 'Doc2Vec', 'BERT', 'Universal_Sentence_Encoder']
# Add the result columns as empty object columns so unscored rows stay NaN.
for col in cols:
  df[col] = pd.Series(index=df.index, dtype=object)
list1 = df['text1'].tolist()
list2 = df['text2'].tolist()
length = len(list1)

# Only a small sample is scored; never run past the end of the data.
MAX_ROWS = 5
rows_to_score = min(MAX_ROWS, length)

#Main Loop starts here
for i in range(rows_to_score):
  # Clean both texts of the pair.
  base_text = text_preprocessing(list1[i])
  other_text = text_preprocessing(list2[i])

  # TF-IDF concatenates its two arguments, so both are passed as lists.
  # Results are written with .loc to avoid chained assignment.
  df.loc[i, 'Tf-Idf'] = process_tfidf_similarity([base_text], [other_text])

  # The remaining scorers take the base document as a plain string and the
  # comparison documents as a list. Passing a list as the base document
  # would be stringified (brackets and quotes included) or nested.
  df.loc[i, 'Jaccard'] = process_jaccard_similarity(base_text, [other_text])

  # Doc2Vec scoring is disabled: process_doc2vec_similarity has no model file configured.
  # df.loc[i, 'Doc2Vec'] = process_doc2vec_similarity(base_text, [other_text])

  df.loc[i, 'BERT'] = process_bert_similarity(base_text, [other_text])

  df.loc[i, 'Universal_Sentence_Encoder'] = process_use_similarity(base_text, [other_text])

.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Preview the first rows, including the similarity result columns.
df.head()

Unnamed: 0,text1,text2,Tf-Idf,Jaccard,Doc2Vec,BERT,Universal_Sentence_Encoder
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...,1,1,,1,0
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...,0,0,,1,0
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...,0,0,,1,0
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...,0,0,,1,0
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...,0,1,,1,0


**Writing to a CSV File**

In [None]:
# Persist the scored frame. to_csv writes the DataFrame index by default,
# so the file gets a leading unnamed index column.
df.to_csv('/content/text_similarity.csv')