In [7]:
from gensim.models import Word2Vec
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

nltk.download("punkt")
nltk.download("stopwords")

from google.colab import files
files.upload()

In [9]:
with open("TAGRBook.txt", "r") as file:
  text = file.read()

In [20]:
def preprocess_text(text):
  text = text.lower()
  text = re.sub(r"[^a-z\s]", "", text)
  words = word_tokenize(text)
  stop_words = set(stopwords.words("english"))
  words = [word for word in words if word not in stop_words]
  return words

preprocessed_tokens = preprocess_text(text)
model = Word2Vec([preprocessed_tokens], vector_size = 100, window = 6, min_count = 2)

In [28]:
model.wv.get_vector("think")

array([-0.01267777,  0.01796552, -0.00219926,  0.00161425, -0.00084724,
       -0.0185977 ,  0.00697886,  0.03065144,  0.00154894, -0.01933925,
        0.007396  , -0.01195818, -0.00290479,  0.00213485,  0.00998054,
       -0.0107914 ,  0.01256396, -0.02263052, -0.00796313, -0.01872309,
        0.0104015 ,  0.00902501,  0.02184552,  0.00580603, -0.01936772,
        0.00967211, -0.00110262, -0.0087715 , -0.01063613,  0.00077085,
        0.01981343, -0.00094367, -0.00576713, -0.01195841,  0.00026407,
        0.02141009,  0.01308144, -0.01505016, -0.02391373, -0.00155212,
        0.0013496 , -0.00348373,  0.00040865, -0.00634377,  0.0224602 ,
       -0.00689913, -0.00772354, -0.00088608,  0.01557755,  0.00542442,
        0.00758914, -0.01879852, -0.0172336 , -0.00968712, -0.0190212 ,
        0.00103403,  0.00831111,  0.00861824, -0.00458203, -0.00194486,
        0.01220685,  0.00679059,  0.01068457, -0.00334453, -0.00770796,
        0.00954027, -0.01234034,  0.02077216, -0.01120212,  0.00

In [29]:
model.wv.most_similar("think")

[('grow', 0.7852849960327148),
 ('rich', 0.7720537185668945),
 ('faith', 0.6684492826461792),
 ('wwwthinkandgrowrichebookcom', 0.6601950526237488),
 ('never', 0.6595385670661926),
 ('riches', 0.6583791971206665),
 ('work', 0.6465651988983154),
 ('mind', 0.6456536650657654),
 ('thought', 0.6428698301315308),
 ('one', 0.6423202753067017)]

In [43]:
think_and_grow_rich_chapters = re.split(r"CHAPTER \d", text)[16:]

In [68]:
def get_document_vector(model, document_tokens):
  modified_doc = [model.wv.get_vector(word) for word in document_tokens if word in model.wv.index_to_key]
  return np.array(modified_doc).mean(axis = 0)

In [73]:
preprocessed_chapters = [preprocess_text(chapter) for chapter in think_and_grow_rich_chapters]
chapters_document_vector = [get_document_vector(model, chapter) for chapter in preprocessed_chapters]

In [80]:
query = "desire is the most impotant step towards attaining your goal"
query_modified = preprocess_text(query)
query_vector = get_document_vector(model, query_modified)

In [81]:
similarities = cosine_similarity([query_vector], chapters_document_vector)

In [85]:
ranked_chapters = sorted(enumerate(similarities[0]), key = lambda item: item[1], reverse=True)

In [88]:
think_and_grow_rich_chapters[ranked_chapters[0][0]]

'\nINTRODUCTION\nTHE MAN WHO "THOUGHT"\nHIS WAY INTO PARTNERSHIP\nWITH THOMAS A. EDISON\nTRULY, "thoughts are things," and powerful things at that, when they are\nmixed with definiteness of purpose, persistence, and a BURNING DESIRE for\ntheir translation into riches, or other material objects.\nA little more than thirty years ago, Edwin C. Barnes discovered how true\nit is that men really do THINK AND GROW RICH. His discovery did not come\nabout at one sitting. It came little by little, beginning with a BURNING DESIRE\nto become a business associate of the great Edison.\nOne of the chief characteristics of Barnes\' Desire was that it was definite.\nHe wanted to work with Edison, not for him. Observe, carefully, the description\nof how he went about translating his DESIRE into reality, and you will have a\nbetter understanding of the thirteen principles which lead to riches.\nWhen this DESIRE, or impulse of thought, first flashed into his mind he\nwas in no position to act upon it. Two