In this notebook, I follow an introductory tutorial on using Gensim's Doc2Vec class for topic modeling.

In [5]:
%%capture
!pip install gensim

# Importing dependencies
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import nltk
from nltk.tokenize import word_tokenize

# Download NLTK's 'punkt' tokenizer data, which word_tokenize relies on
nltk.download('punkt')

In [3]:
# Grabbing quotes
%%capture
import requests                     # To make 'get' requests through chrome browser
from bs4 import BeautifulSoup       # To parse html file in python tree object
import pandas as pd                 # To create Dataframe and save data into JSON file

import random
import time
from pprint import pprint

# There are 33 pages of quotes we want to collect from
urls = []
base_url = "https://www.goodreads.com/author/quotes/850512.J_Krishnamurti?page={}"
search_pages = [base_url.format(str(i)) for i in range(1,34)]

plain_quotes = []

for search_page in search_pages:
  # Grabbing a list of quotes:
  quote_list = []
  page = requests.get(search_page)
  soup = BeautifulSoup(page.content, 'html.parser')

  # Using CSS Selector to grab all quotes
  quotes = [quote.get_text() for quote in soup.find_all("div", class_="quoteText")]
  quotes = [quote.split("\n")[1].strip() for quote in quotes]
  plain_quotes.extend(quotes)


In [6]:
# Loading in Data (Quote Documents)
data = plain_quotes

# Tagging data: each quote is lower-cased, tokenized, and wrapped in a
# TaggedDocument whose single tag is the quote's position as a string.
tagged_data = [
    TaggedDocument(words=word_tokenize(quote.lower()), tags=[str(idx)])
    for idx, quote in enumerate(data)
]

In [7]:
# Initializing doc2vec model (untrained): 40-dimensional vectors, words seen
# fewer than 2 times are ignored, 80 training epochs by default.
model = Doc2Vec(
    vector_size=40,
    min_count=2,
    epochs=80,
)

In [8]:
# Building the model's vocabulary from the tagged quotes; this must happen
# before training.
model.build_vocab(tagged_data)

In [9]:
# Training doc2vec model.
# Reuse the epoch count the model was constructed with instead of repeating
# the literal, so the two values cannot drift apart.
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
# Saving the trained doc2vec model to "d2v.model" in the current working directory
model.save("d2v.model")

### Let's compute similarity between sentences

In [11]:
# Loading the doc2vec model back from disk (replaces the in-memory `model`)
model = Doc2Vec.load("d2v.model")

In [12]:
# Finding the documents most similar to the first quote.
# most_similar() needs a target document; tags were created as str(i), so the
# first quote is tag '0'. The result is a list of (tag, similarity) pairs,
# best match first.
similar_doc = model.docvecs.most_similar(positive=['0'])
print(similar_doc[0])

('736', 0.7872136831283569)


In [14]:
# Show the first quote next to the quote ranked most similar to it.
# The tag is read from the query result rather than hard-coded, because the
# best match can change between training runs. Tags are stringified positions
# into plain_quotes, so convert back to int to index the list.
best_tag = similar_doc[0][0]
print(plain_quotes[0], plain_quotes[int(best_tag)])

“It is no measure of health to be well adjusted to a profoundly sick society.” “The ending of sorrow is the beginning of wisdom”


In [15]:
# Inferring a vector for unseen text: tokenize it the same way as the
# training quotes (lower-cased, word_tokenize) before calling infer_vector.
question = "why do I feel angry and upset"
test_data = word_tokenize(question.lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

V1_infer [ 0.32743317 -0.11341423 -0.05435495 -0.17307845 -0.51034623 -0.5700644
 -0.08266234  0.23617806 -0.33206156  0.51621675 -0.0356981  -0.3887038
  0.9533508  -0.14532383 -0.27231976 -0.4267477   0.06877901 -0.10505804
  0.46134123 -0.03594124 -0.43858963 -0.41964036 -0.66404575  0.01620666
  0.12459861 -0.33795166 -0.13260795 -0.07641975  0.3626482   0.18010819
 -0.08157532  0.29649046 -0.08267613  0.2792874   0.11534569 -0.81928325
 -1.109061   -0.37997323  0.21994264  0.26516956]


In [17]:
# Words whose vectors are closest to "life".
# Word-level queries go through the model's word vectors (model.wv); calling
# most_similar directly on the model is deprecated and emits a warning.
model.wv.most_similar(positive=["life"])

  """Entry point for launching an IPython kernel.


[('daily', 0.6577692031860352),
 ('this', 0.6107522249221802),
 ('art', 0.6048147678375244),
 ('secret', 0.5460391044616699),
 ('part', 0.54332435131073),
 ('problem', 0.5106348991394043),
 ('pleasure', 0.5057826042175293),
 ('conduct', 0.5043268203735352),
 ('it', 0.5028849840164185),
 ('escape', 0.4822239279747009)]