###### 1 - Getting list of quotes



In [5]:
# Grabbing quotes
%%capture
import requests                     # To make 'get' requests through chrome browser
from bs4 import BeautifulSoup       # To parse html file in python tree object
import pandas as pd                 # To create Dataframe and save data into JSON file

import random
import time
from pprint import pprint

# There are 33 pages of quotes we want to collect from
urls = []
base_url = "https://www.goodreads.com/author/quotes/850512.J_Krishnamurti?page={}"
search_pages = [base_url.format(str(i)) for i in range(1,34)]

plain_quotes = []

for search_page in search_pages:
  # Grabbing a list of quotes:
  quote_list = []
  page = requests.get(search_page)
  soup = BeautifulSoup(page.content, 'html.parser')

  # Using CSS Selector to grab all quotes
  quotes = [quote.get_text() for quote in soup.find_all("div", class_="quoteText")]
  quotes = [quote.split("\n")[1].strip() for quote in quotes]
  plain_quotes.extend(quotes)


In [6]:
plain_quotes = [quote[1:-1] for quote in plain_quotes]
plain_quotes[0]

'It is no measure of health to be well adjusted to a profoundly sick society.'

###### 2 - Preprocessing

In [8]:
# Creating one long string
processed_article = ""
for quote in plain_quotes:
  processed_article += quote

In [12]:
# Cleaing the text
import re

processed_article = processed_article.lower() # making words lowercase
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article) # removing all digits
processed_article = re.sub(r'\s+', ' ', processed_article) # Removing punctuation

In [None]:
%%capture
# Dividing text into sentences and removing stop words
import nltk
nltk.download('punkt')
nltk.download('stopwords')

all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

###### Training word2vec model

In [17]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)



In [18]:
vocabulary = word2vec.wv.vocab
print(vocabulary)

{'measure': <gensim.models.keyedvectors.Vocab object at 0x7f7680011890>, 'health': <gensim.models.keyedvectors.Vocab object at 0x7f7680011c90>, 'well': <gensim.models.keyedvectors.Vocab object at 0x7f7680013b50>, 'adjusted': <gensim.models.keyedvectors.Vocab object at 0x7f767ff46350>, 'profoundly': <gensim.models.keyedvectors.Vocab object at 0x7f767fef25d0>, 'sick': <gensim.models.keyedvectors.Vocab object at 0x7f767fefbe90>, 'society': <gensim.models.keyedvectors.Vocab object at 0x7f767fefbe50>, 'ability': <gensim.models.keyedvectors.Vocab object at 0x7f767fefbf90>, 'observe': <gensim.models.keyedvectors.Vocab object at 0x7f767fef2510>, 'without': <gensim.models.keyedvectors.Vocab object at 0x7f767fef4f50>, 'evaluating': <gensim.models.keyedvectors.Vocab object at 0x7f767fefbfd0>, 'highest': <gensim.models.keyedvectors.Vocab object at 0x7f767fefc050>, 'form': <gensim.models.keyedvectors.Vocab object at 0x7f767fefc0d0>, 'intelligence': <gensim.models.keyedvectors.Vocab object at 0x7f76

dict

In [None]:
v1 = word2vec.wv['ideas']
v2 = word2vec.wv['mind']
v1

In [49]:
conflict = word2vec.wv['innocent']
struggle = word2vec.wv['young']
print(type(conflict))

<class 'numpy.ndarray'>


In [43]:
sim_words = word2vec.wv.most_similar('joy')

In [44]:
print(sim_words)

[('must', 0.6857403516769409), ('want', 0.6840440630912781), ('truth', 0.6839771270751953), ('pleasure', 0.683704137802124), ('action', 0.680683970451355), ('really', 0.6761115789413452), ('fear', 0.6720224618911743), ('love', 0.6720192432403564), ('mind', 0.6659005284309387), ('human', 0.6640744209289551)]


###### 4 - Associating vectors with documents

In [47]:

random_quote = plain_quotes[0]
print(random_quote)

It is no measure of health to be well adjusted to a profoundly sick society.


In [102]:
import numpy as np
from nltk.tokenize import wordpunct_tokenize

x = np.array(25 * [float(0)])
y = np.array(25 * [float(0)])


for word in wordpunct_tokenize(random_quote.lower()):
  x += glove_vectors[word]
  
for word in wordpunct_tokenize("gah gah gah gah gah gah gah".lower()):
  y += glove_vectors[word]


In [103]:
y

array([  5.11356997,   6.21845007,   1.51851001,  -1.41056998,
        -6.75380987,  -2.20416009,   6.14788979,  -0.93653001,
        -1.35121   ,   8.34539962,  -7.02030003,   1.66677004,
       -11.77679992,  -2.25994992,   2.19751003,   2.93748003,
         4.5456602 ,  -6.69340003,  -2.49962997,   1.51508005,
         3.17582995,   9.46679962,   2.23671007,   5.15480006,
         2.95448998])

In [101]:
from numpy import dot
from numpy.linalg import norm

#calculate Cosine Similarity python
result = dot(x, y)/(norm(x)*norm(y))
print("The Cosine Similarity between two vectors is: ",result)


The Cosine Similarity between two vectors is:  0.5127677116113161


###### solving some problems

In [71]:
# Importing a pretrianed model
import gensim.downloader

In [72]:
import gensim.downloader as api

info = api.info()  # show info about available models/datasets

In [None]:
# Looking at available datasets
from pprint import pprint 
pprint(info)

In [104]:
# Choosing text8 dataset (50d word embeddings), more info here: http://mattmahoney.net/dc/textdata.html 
# Download the "text8" embeddings
glove_vectors = gensim.downloader.load('text8')

In [106]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

corpus = api.load('text8')  # download the corpus and return it opened as an iterable
model = Word2Vec(corpus)  # train a model from the corpus
model.most_similar("car")

  


[('driver', 0.8011038303375244),
 ('motorcycle', 0.7185970544815063),
 ('cars', 0.7168217897415161),
 ('taxi', 0.7089043259620667),
 ('truck', 0.7045259475708008),
 ('racing', 0.670089840888977),
 ('vehicle', 0.6648951172828674),
 ('passenger', 0.6544437408447266),
 ('automobile', 0.6478562355041504),
 ('jaguar', 0.6357719898223877)]

In [77]:
# Download the "glove-twitter-25" embeddings
glove_vectors = gensim.downloader.load('glove-twitter-25')



[('facebook', 0.9480051398277283),
 ('tweet', 0.9403422474861145),
 ('fb', 0.9342358708381653),
 ('instagram', 0.9104823470115662),
 ('chat', 0.8964964747428894),
 ('hashtag', 0.8885936141014099),
 ('tweets', 0.8878157734870911),
 ('tl', 0.8778461813926697),
 ('link', 0.877821147441864),
 ('internet', 0.8753897547721863)]

In [79]:
glove_vectors.most_similar('money')

[('pay', 0.9287682175636292),
 ('some', 0.9172768592834473),
 ('need', 0.9159395694732666),
 ('cash', 0.9132940173149109),
 ('any', 0.9130824208259583),
 ('get', 0.9045761227607727),
 ('these', 0.9019135236740112),
 ('them', 0.9014986157417297),
 ('give', 0.8999203443527222),
 ('bring', 0.8983911871910095)]

In [82]:
glove_vectors['life']

array([-0.11048  ,  0.13274  , -0.34893  ,  0.48077  ,  0.17184  ,
        0.21367  ,  1.9979   , -0.065145 , -0.45437  , -0.0089287,
       -0.91636  ,  0.49681  , -5.0971   ,  0.38244  , -0.085692 ,
        0.54055  , -0.12858  , -0.079061 ,  0.4292   , -0.12321  ,
       -0.13697  , -0.19238  , -0.25163  ,  0.45215  ,  0.051584 ],
      dtype=float32)

###### Hello
