In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

text = "Jim is my enemy. But, Jim is also his own greatest enemy. And, if the enemy of my enemy is my friend, then that means Jim is my friend. But, if he's my friend then the enemy of my friend is also my enemy. Therefore, Jim is my enemy!"

lem = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
sentences = sent_tokenize(text)

res = list(map(' '.join, [[lem.lemmatize(word) for word in word_tokenize(re.sub(r'[^\w]', ' ', sentence)) if word not in stop_words] for sentence in sentences]))

def count_vectorize(corpus):
    cv = CountVectorizer(max_features=100)
    transform = cv.fit_transform(corpus).toarray()
    return pd.DataFrame(columns=cv.get_feature_names_out(), data=transform)

def tfidf_vectorize(corpus):
    # term frequency (TF) is the amount of times a term 't' appears in a document divided by total terms in the document
    # inverse document frequency (IDF) is the whole log of total documents divided by number of documents that contain 't'
    # TF-IDF is the product of the two and a high value for it indicates that the term is rare across the entire corpus except the current document

    tfidf = TfidfVectorizer()
    transform = tfidf.fit_transform(corpus).toarray()
    return np.round(pd.DataFrame(columns=tfidf.get_feature_names_out(), data=transform), 3)

def cosine_similarity(A, B):
    return np.round(np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B)), 3)

print('Bag of Words/Count Vectorization:')
display(count_vectorize(res))
print('TF-IDF vectorization:')
tfidf_df = tfidf_vectorize(res)
display(tfidf_df)

print('Sentences:', *res, sep='\n', end='\n\n')
for i in range(len(tfidf_df)):
    for j in range(i+1, len(tfidf_df)):
        similarity = cosine_similarity(tfidf_df.iloc[i].values, tfidf_df.iloc[j].values)
        print(f'Cosine similarity between sentence {i+1} and {j+1}: {similarity}')


Bag of Words/Count Vectorization:


Unnamed: 0,also,and,but,enemy,friend,greatest,jim,mean,therefore
0,0,0,0,1,0,0,1,0,0
1,1,0,1,1,0,1,1,0,0
2,0,1,0,2,2,0,1,1,0
3,1,0,1,2,2,0,0,0,0
4,0,0,0,1,0,0,1,0,1


TF-IDF vectorization:


Unnamed: 0,also,and,but,enemy,friend,greatest,jim,mean,therefore
0,0.0,0.0,0.0,0.646,0.0,0.0,0.764,0.0,0.0
1,0.478,0.0,0.478,0.282,0.0,0.593,0.334,0.0,0.0
2,0.0,0.414,0.0,0.395,0.668,0.0,0.233,0.414,0.0
3,0.368,0.0,0.368,0.434,0.735,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.383,0.0,0.0,0.453,0.0,0.805


Sentences:
Jim enemy
But Jim also greatest enemy
And enemy enemy friend mean Jim friend
But friend enemy friend also enemy
Therefore Jim enemy

Cosine similarity between sentence 1 and 2: 0.437
Cosine similarity between sentence 1 and 3: 0.433
Cosine similarity between sentence 1 and 4: 0.28
Cosine similarity between sentence 1 and 5: 0.593
Cosine similarity between sentence 2 and 3: 0.189
Cosine similarity between sentence 2 and 4: 0.474
Cosine similarity between sentence 2 and 5: 0.259
Cosine similarity between sentence 3 and 4: 0.663
Cosine similarity between sentence 3 and 5: 0.257
Cosine similarity between sentence 4 and 5: 0.166


In [25]:
words = list(map(word_tokenize, res))
w2v = Word2Vec(words, min_count=1)

words_to_compare = ['enemy', 'friend', 'Jim']
print('\nCosine similarities between word pairs:')
for i in range(len(words_to_compare)):
    for j in range(i + 1, len(words_to_compare)):
        similarity = cosine_similarity(w2v.wv[words_to_compare[i]], w2v.wv[words_to_compare[j]])
        print(f'Cosine similarity between "{words_to_compare[i]}" and "{words_to_compare[j]}": {similarity}')


Cosine similarities between word pairs:
Cosine similarity between "enemy" and "friend": -0.010999999940395355
Cosine similarity between "enemy" and "Jim": -0.052000001072883606
Cosine similarity between "friend" and "Jim": -0.024000000208616257
