### Sentimal Analysis for goodreads quotes



In [67]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [291]:
url = "https://www.goodreads.com/quotes/tag/{}?page={}"
emotions = ['inspirational', 'love', 'knowledge']
complete_url = url.format(emotions[0], 1)


In [292]:
def get_quote(complete_url):
    data = requests.get(complete_url)
    soup = BeautifulSoup(data.text)
    divs = soup.find_all('div', attrs={'class': 'quoteText'})
    quotes = [d.text.strip().split('\n')[0][1:-1] for d in divs]
    return quotes

In [247]:
print(get_quote(complete_url))

['Be yourself; everyone else is already taken.', "You've gotta dance like there's nobody watching,Love like you'll never be hurt,Sing like there's nobody listening,And live like it's heaven on earth.", 'Be the change that you wish to see in the world.', 'Live as if you were to die tomorrow. Learn as if you were to live forever.', 'Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that.', 'Without music, life would be a mistake.', 'We accept the love we think we deserve.', "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.", 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.', 'We are all in the gutter, but some of us are looking at the stars.', 'Fairy tales are more than true: not because they tell us that dragons exist, but because they tell us that dragons can be beaten.', 'Yesterday is histo

#### Get quotes for each emotion

In [293]:
x, y = [], []

for emo in emotions:
    for i in range(1, 10):
        complete_url = url.format(emo, i)
        quotes = get_quote(complete_url)
        x.extend(quotes)
        y.extend([emo] * len(quotes))
        print(f'Page {i} for {emo} tag processed.')

Page 1 for inspirational tag processed.
Page 2 for inspirational tag processed.
Page 3 for inspirational tag processed.
Page 4 for inspirational tag processed.
Page 5 for inspirational tag processed.
Page 6 for inspirational tag processed.
Page 7 for inspirational tag processed.
Page 8 for inspirational tag processed.
Page 9 for inspirational tag processed.
Page 1 for love tag processed.
Page 2 for love tag processed.
Page 3 for love tag processed.
Page 4 for love tag processed.
Page 5 for love tag processed.
Page 6 for love tag processed.
Page 7 for love tag processed.
Page 8 for love tag processed.
Page 9 for love tag processed.
Page 1 for knowledge tag processed.
Page 2 for knowledge tag processed.
Page 3 for knowledge tag processed.
Page 4 for knowledge tag processed.
Page 5 for knowledge tag processed.
Page 6 for knowledge tag processed.
Page 7 for knowledge tag processed.
Page 8 for knowledge tag processed.
Page 9 for knowledge tag processed.


In [225]:
x[3], y[3], x[-2], y[-3], len(x)

('Live as if you were to die tomorrow. Learn as if you were to live forever.',
 'inspirational',
 'You are, and always have been, my dream.',
 'love',
 120)

In [294]:
df = pd.DataFrame(list(zip(y, x)), columns=['emotion', 'quotes'])
df.head(-1)
df.to_csv('emotion.csv', index=False)

#### ML part

In [176]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [296]:
vect = CountVectorizer(max_features=1000)
vect.fit(x)

CountVectorizer(max_features=1000)

In [218]:
len(vect.vocabulary_)
# vect.vocabulary_

500

In [297]:
x_mod = vect.transform(x).todense()
# x_mod[-1]

In [298]:
x[-1]

'The study of truth requires a considerable effort - which is why few are willing to undertake it out of love of knowledge - despite the fact that God has implanted a natural appetite for such knowledge in the minds of men.'

#### if want to check how many time some of the word is apearing in quotes, you could check it with `vect.transform(x).todense()`, for example 'as' in quote `x[-1]` is appearing 3 times `x_mod[-1]`

#### if you want to replace different meanings of same word, like run, running, ran to just 'run' or love, loving, loved to 'love' you could use `PorterStemmer().stem('loved')` which give you 'love'

In [299]:
tokenizer = RegexpTokenizer('\w+')
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [300]:
def getStemmedQuote(quote):
    quote = quote.lower()

    # tokenize, or just making a list from a quote
    tokens = tokenizer.tokenize(quote)

    # removed stopwords
    new_tokens = [token for token in tokens if token not in sw]

    # stemming words, bad thing - this internal stemmer is sometines removing or chaning words endings
    stemmed_tokens = [ps.stem(token) for token in new_tokens]

    clean_quote = ' '.join(stemmed_tokens)

    return clean_quote

In [301]:
def getStemmedQuotes(quotes):
    return [getStemmedQuote(q) for q in quotes]


In [302]:
X = getStemmedQuotes(x)
vect.fit(X)

CountVectorizer(max_features=1000)

In [303]:
X_mod = vect.transform(X).todense()
X_mod.shape

(810, 1000)

In [304]:
X_train, X_test, y_train, y_test = train_test_split(X_mod, y, test_size=0.33, random_state=42)

In [305]:
model = BernoulliNB()

In [306]:
model.fit(X_train, y_train)

BernoulliNB()

In [307]:
model.score(X_test, y_test)

0.6343283582089553

In [308]:
# lets test our model with random quote
line_inspirational = "The pessimist sees difficulty in every opportunity."
line_love = "You call it madness, but I call it love."
line_love2 = "We can only learn to love by loving."
X_vec = vect.transform([line_inspirational, line_love, line_love2]).todense()
X_vec2 = vect.transform([line_love]).todense()

In [309]:
model.predict(X_vec)

array(['inspirational', 'inspirational', 'inspirational'], dtype='<U13')

In [217]:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]

2