In [1]:
import requests
import re
import math
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import string
import pandas as pd

# Fetch every NLTK resource requested by this notebook (a no-op when the
# package is already up to date locally)
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

# Web pages (both on ifc.org) whose similarity is measured below
url_doc1 = 'https://www.ifc.org/en/insights-reports/2023/building-green-in-emerging-markets'
url_doc2 = 'https://www.ifc.org/en/insights-reports/2024/emerging-market-green-bonds-2023'

# fetch the text from the given URLs
def get_text_from_url(url, timeout=30):
    """Download a URL with an HTTP GET request and return the body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so a stalled server would
            hang the whole notebook.

    Returns:
        The decoded response body (``response.text``). No HTML parsing is
        done here, so for a web page the markup is part of the returned text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of comparing their content later
    response.raise_for_status()
    return response.text

# Download both pages once; every similarity method below reuses these raw
# response bodies (get_text_from_url returns response.text unmodified).
text_doc1 = get_text_from_url(url_doc1)
text_doc2 = get_text_from_url(url_doc2)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paschalinaparaschou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paschalinaparaschou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/paschalinaparaschou/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/paschalinaparaschou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Method 1: TF-IDF + Cosine Similarity
def preprocess_text_enhanced(text):
    """Turn raw text into a space-separated string of lemmatized words.

    The text is lowercased, every non-word character and every digit is
    replaced by a space, the result is split on whitespace, English stop
    words are dropped, and each remaining word is lemmatized with
    WordNetLemmatizer (no part-of-speech tag is supplied).

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Blank out punctuation/symbols first, then digits
    cleaned = re.sub(r'\d', ' ', re.sub(r'\W', ' ', text.lower()))
    candidates = cleaned.split()

    english_stop_words = set(stopwords.words('english'))
    content_words = [token for token in candidates if token not in english_stop_words]

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in content_words:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Clean both documents with the same pipeline so their vocabularies line up
preprocessed_text_doc1 = preprocess_text_enhanced(text_doc1)
preprocessed_text_doc2 = preprocess_text_enhanced(text_doc2)

# Fit the vectorizer on the two cleaned documents and vectorize them in one
# step; row 0 of the matrix is document 1 and row 1 is document 2
vectorizer3 = TfidfVectorizer()
tfidf_corpus = [preprocessed_text_doc1, preprocessed_text_doc2]
tfidf_matrix3 = vectorizer3.fit_transform(tfidf_corpus)

# Range slices keep each row two-dimensional; the comparison of one row with
# one row yields a 1x1 result whose single entry is the score
tfidf_row_doc1 = tfidf_matrix3[0:1]
tfidf_row_doc2 = tfidf_matrix3[1:2]
cosine_sim_tf = cosine_similarity(tfidf_row_doc1, tfidf_row_doc2)[0][0]

In [3]:
# Method 2: Jaccard Similarity
def preprocess_text_simple(text):
    """Tokenize text into lowercase tokens with English stop words removed.

    Runs of non-word characters are collapsed into a single space before the
    text is handed to word_tokenize.

    Args:
        text: Raw document text.

    Returns:
        A list of tokens in document order; duplicates are kept.
    """
    candidate_tokens = word_tokenize(re.sub(r'\W+', ' ', text.lower()))

    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in candidate_tokens:
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the token sets of two texts.

    Both texts are tokenized with preprocess_text_simple and reduced to sets
    of unique tokens. The score is |intersection| / |union|.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        A float in [0.0, 1.0]. When neither text yields any token (both sets
        are empty) the texts are treated as identical and 1.0 is returned,
        instead of failing with a division by zero.
    """
    set1 = set(preprocess_text_simple(text1))
    set2 = set(preprocess_text_simple(text2))

    # unique tokens from both texts
    union = set1 | set2
    if not union:
        # Two empty token sets: the ratio is 0/0, so fall back to the
        # convention that two empty sets are fully similar
        return 1.0

    # tokens the two texts have in common
    intersection = set1 & set2
    return len(intersection) / len(union)

# Jaccard similarity of the two documents' token sets
jaccard_sim = jaccard_similarity(text_doc1, text_doc2)

In [4]:
# Method 3: Cosine Similarity with Bag of Words

# Table for str.translate: each ASCII punctuation character maps to a space and
# each ASCII uppercase letter maps to its lowercase counterpart
_chars_to_replace = string.punctuation + string.ascii_uppercase
_replacement_chars = " " * len(string.punctuation) + string.ascii_lowercase
translation_table = str.maketrans(_chars_to_replace, _replacement_chars)

def get_words_from_text(text):
    """Split text into a list of words after applying translation_table.

    The module-level translation_table is applied with str.translate, then
    the result is split on whitespace.

    Args:
        text: Raw document text.

    Returns:
        The list of words, in document order.
    """
    return text.translate(translation_table).split()

def count_frequency(word_list):
    """Count how often each word occurs in word_list.

    Args:
        word_list: Iterable of hashable words.

    Returns:
        A plain dict mapping each word to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for word in word_list:
        # A word seen for the first time starts from zero
        counts[word] = counts.get(word, 0) + 1
    return counts

def word_frequencies_for_text(text):
    """Build the word-frequency table of a text together with its word counts.

    Args:
        text: Raw document text.

    Returns:
        A 3-tuple ``(frequencies, total_words, distinct_words)``: the
        word -> count mapping from count_frequency, the length of the word
        list from get_words_from_text, and the number of keys in the mapping.
    """
    words = get_words_from_text(text)
    frequencies = count_frequency(words)
    total_words, distinct_words = len(words), len(frequencies)
    return frequencies, total_words, distinct_words

def dot_product(D1, D2):
    """Return the dot product of two sparse vectors stored as dicts.

    Only keys present in both mappings contribute; each contributes the
    product of its two values.

    Args:
        D1: Mapping of key -> numeric value.
        D2: Mapping of key -> numeric value.

    Returns:
        The sum of the products as a float (0.0 when no key is shared).
    """
    total = 0.0
    for term, weight in D1.items():
        if term not in D2:
            continue
        total += weight * D2[term]
    return total

def vector_angle(D1, D2):
    """Return the angle, in radians, between two sparse frequency vectors.

    The cosine of the angle is dot(D1, D2) / sqrt(dot(D1, D1) * dot(D2, D2)).
    A smaller angle means the two vectors point in more similar directions.

    Args:
        D1: Mapping of word -> frequency for the first document.
        D2: Mapping of word -> frequency for the second document.

    Returns:
        The angle in radians, as returned by math.acos.

    Raises:
        ZeroDivisionError: If either vector has zero magnitude (for example
            an empty mapping), because the angle is undefined in that case.
    """
    numerator = dot_product(D1, D2)

    # product of the magnitudes of the two frequency vectors
    denominator = math.sqrt(dot_product(D1, D1) * dot_product(D2, D2))
    if denominator == 0:
        raise ZeroDivisionError(
            "cannot compute the angle for a zero-magnitude (e.g. empty) frequency vector"
        )

    cosine = numerator / denominator
    # Floating-point rounding can push the ratio marginally outside [-1, 1],
    # which would make math.acos raise "math domain error"; clamp it back.
    # Plain comparisons are used so a NaN ratio is passed through unchanged.
    if cosine > 1.0:
        cosine = 1.0
    elif cosine < -1.0:
        cosine = -1.0

    return math.acos(cosine)

# Frequency table, total word count and distinct word count for each document
freq_mapping1, words1, distinct_words1 = word_frequencies_for_text(text_doc1)
freq_mapping2, words2, distinct_words2 = word_frequencies_for_text(text_doc2)

# Angle between the two frequency vectors, in radians
distance = vector_angle(freq_mapping1, freq_mapping2)

# results
print(f"Bag of Words + Cosine Similarity (Radians): {distance:.4f}")

# Same three-line report for each document
for file_heading, total_words, distinct_words in (
    ("File 1:", words1, distinct_words1),
    ("File 2:", words2, distinct_words2),
):
    print(file_heading)
    print(f"{total_words} words,")
    print(f"{distinct_words} distinct words")

Bag of Words + Cosine Similarity (Radians): 0.1789
File 1:
13259 words,
1301 distinct words
File 2:
10634 words,
1079 distinct words


In [5]:
# Method 4: Doc2Vec + Cosine Similarity
def preprocess_text_for_doc2vec(text):
    """Lowercase text and split it on whitespace into a list of tokens.

    Args:
        text: Raw document text.

    Returns:
        The list of lowercase tokens, in document order.
    """
    lowered = text.lower()
    return lowered.split()

# Doc2Vec training and inference are randomized, so without a fixed seed the
# similarity score changes on every run. gensim documents that a reproducible
# run needs a seed AND a single worker thread; across interpreter launches
# PYTHONHASHSEED must additionally be set in the environment.
DOC2VEC_SEED = 42

# tokenize each document once and reuse the tokens for training and inference
tokens_doc1 = preprocess_text_for_doc2vec(text_doc1)
tokens_doc2 = preprocess_text_for_doc2vec(text_doc2)

# wrap the token lists in TaggedDocument objects
documents_for_doc2vec = [TaggedDocument(words=tokens_doc1, tags=['doc1']),
                         TaggedDocument(words=tokens_doc2, tags=['doc2'])]

# initialize the Doc2Vec model
model = Doc2Vec(vector_size=100, min_count=2, epochs=40, seed=DOC2VEC_SEED, workers=1)

# build the vocabulary from the preprocessed documents
model.build_vocab(documents_for_doc2vec)

# train the model
model.train(documents_for_doc2vec, total_examples=model.corpus_count, epochs=model.epochs)

# infer a vector for each document from its tokens
vector1 = model.infer_vector(tokens_doc1)
vector2 = model.infer_vector(tokens_doc2)

# cosine similarity
cosine_sim_doc2vec = cosine_similarity([vector1], [vector2])[0][0]

In [6]:
# Summary of results table
from prettytable import PrettyTable

# two-column table: method name and its score formatted to four decimals
myTable = PrettyTable(["Method", "Result"])

# NOTE(review): the "Bag of Words" row is an angle in radians (smaller means
# more alike), while the other rows are similarity scores (larger means more
# alike), so the values in the Result column are not directly comparable.
summary_rows = [
    ("Enhanced Preprocessed TF-IDF + Cosine Similarity", cosine_sim_tf),
    ("Jaccard Similarity", jaccard_sim),
    ("Bag of Words + Cosine Similarity (Radians)", distance),
    ("Doc2Vec + Cosine Similarity", cosine_sim_doc2vec),
]
for method_name, score in summary_rows:
    myTable.add_row([method_name, f"{score:.4f}"])
print(myTable)

# NOTE(review): this prints the TF-IDF score, not the name of a method.
print("The best method for similarity check is:", f"{cosine_sim_tf:.4f}")

+--------------------------------------------------+--------+
|                      Method                      | Result |
+--------------------------------------------------+--------+
| Enhanced Preprocessed TF-IDF + Cosine Similarity | 0.9802 |
|                Jaccard Similarity                | 0.4956 |
|    Bag of Words + Cosine Similarity (Radians)    | 0.1789 |
|           Doc2Vec + Cosine Similarity            | 0.7845 |
+--------------------------------------------------+--------+
The best method for similarity check is: 0.9802
Please refer to the accompanying Word document for further explanation.
