In [5]:
import requests
from bs4 import BeautifulSoup

In [1]:
# URL template for a Goodreads tag listing. The two {} placeholders are filled
# positionally by str.format: first the tag name, then the page number.
url = "https://www.goodreads.com/quotes/tag/{}?page={}"

In [2]:
# Tags to scrape. Each tag is substituted into the URL template and is also
# used verbatim as the class label for every quote scraped under it.
emotions = ["love", "religion"]

In [3]:
# Build the URL for page 1 of the first tag to check the template by eye.
complete = url.format(emotions[0], 1)

In [4]:
complete

'https://www.goodreads.com/quotes/tag/love?page=1'

In [17]:
def get_quotes(complete, timeout=10):
    """Download one quote-listing page and return the quote texts on it.

    Parameters
    ----------
    complete : str
        Fully formatted page URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the cell forever.

    Returns
    -------
    list of str
        One entry per ``<div class="quoteText">`` element: the first line of
        the element's stripped text with its first and last character removed
        (presumably the surrounding quotation marks -- confirm against the
        page markup).

    Raises
    ------
    requests.RequestException
        On connection failure, timeout, or an HTTP error status. Failing
        loudly avoids silently collecting zero quotes from an error page.
    """
    response = requests.get(complete, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.text, "html.parser")
    divs = soup.find_all("div", attrs={"class": "quoteText"})
    return [div.text.strip().split("\n")[0][1:-1] for div in divs]

In [18]:
# Parallel accumulators: X holds the quote texts, y the tag each one came from.
X = []
y = []

for emotion in emotions:
    # Pages 1 through 5 of each tag.
    for i in range(1, 6):
        complete = url.format(emotion, i)
        quotes = get_quotes(complete)
        X += quotes
        # One label per quote so X and y stay the same length.
        y += [emotion for _quote in quotes]
        print("processes {} for {}".format(i, emotion))

processes 1 for love
processes 2 for love
processes 3 for love
processes 4 for love
processes 5 for love
processes 1 for religion
processes 2 for religion
processes 3 for religion
processes 4 for religion
processes 5 for religion


In [19]:
import pandas as pd

In [22]:
# One row per scraped quote: its tag label first, then the quote text.
df = pd.DataFrame(
    data=[(label, text) for label, text in zip(y, X)],
    columns=["emotion", "quote"],
)

In [24]:
# Save the labelled quotes to disk; index=False keeps the row index out of the file.
df.to_csv("emotions.csv", index=False)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [103]:
# Bag-of-words vectorizer; max_features=500 limits the learned vocabulary to
# 500 terms.
vect = CountVectorizer(max_features=500)

In [104]:
# Learn the vocabulary from every scraped quote.
# NOTE(review): this runs on all of X before the train/test split, so the
# vocabulary is influenced by quotes that later end up in the test set.
vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [105]:
len(vect.vocabulary_)

500

In [106]:
X[0]

"I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best."

In [107]:
# Encode every quote as one row of term counts in a dense array.
# .toarray() returns a plain numpy.ndarray; .todense() would return a
# numpy.matrix, which scikit-learn deprecated as estimator input and newer
# releases reject with a TypeError.
X_mod = vect.transform(X).toarray()

In [108]:
len(X)

300

In [109]:
X_mod.shape

(300, 500)

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [111]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
# (The continuation line used to start with a pasted "..." REPL prompt, which
# only ran because IPython strips such prompts; it is invalid plain Python.)
X_train, X_test, y_train, y_test = train_test_split(
    X_mod, y, test_size=0.2, random_state=42)

In [112]:
# First classifier: k-nearest neighbours with all default hyperparameters.
model = KNeighborsClassifier()

In [113]:
# Fit the k-NN classifier on the training portion of the split.
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [114]:
# Score the fitted k-NN classifier on the held-out rows.
model.score(X_test, y_test)

0.7

In [115]:
from sklearn.naive_bayes import BernoulliNB

In [116]:
# Second classifier: Bernoulli naive Bayes with default settings.
# NOTE(review): this rebinds `model`, so the k-NN classifier fitted earlier is
# no longer reachable under that name; every later use of `model` means this one.
model = BernoulliNB()

In [117]:
# Fit the naive Bayes classifier on the same training rows used for k-NN.
model.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [118]:
# Score the naive Bayes classifier on the same held-out rows, for comparison
# with the k-NN score.
model.score(X_test, y_test)

0.8833333333333333

In [122]:
# Ad-hoc example text used below as a single prediction input.
line = "To me, Fearless is not the absense of fear. It's not being completely unafraid. To me, Fearless is having fears. Fearless is having doubts. Lots of them. To me, Fearless is living in spite of those things that scare you to death."

In [123]:
# Encode the single example with the already-fitted vocabulary. transform
# expects an iterable of documents, hence the one-element list.
# .toarray() returns a plain numpy.ndarray; .todense() would return a
# numpy.matrix, which newer scikit-learn releases reject as estimator input.
X_vec = vect.transform([line]).toarray()

In [124]:
# Predict the tag for the example; `model` here is the most recently fitted
# classifier (the BernoulliNB instance).
model.predict(X_vec)

array(['religion'], dtype='<U8')

In [125]:
vect.vocabulary_

{'selfish': 369,
 'and': 15,
 'little': 229,
 'make': 248,
 'mistakes': 270,
 'am': 13,
 'out': 317,
 'of': 297,
 'control': 69,
 'at': 27,
 'times': 443,
 'hard': 165,
 'to': 444,
 'but': 52,
 'if': 192,
 'you': 497,
 'can': 56,
 'me': 257,
 'my': 280,
 'worst': 492,
 'then': 429,
 'sure': 416,
 'as': 25,
 'hell': 177,
 'don': 90,
 'deserve': 78,
 'best': 43,
 've': 460,
 'like': 228,
 'there': 430,
 'nobody': 291,
 'love': 241,
 'll': 233,
 'never': 286,
 'be': 33,
 'hurt': 189,
 'sing': 384,
 'live': 230,
 'it': 202,
 'heaven': 175,
 'on': 301,
 'earth': 99,
 'know': 214,
 're': 344,
 'in': 195,
 'when': 476,
 'fall': 119,
 'asleep': 26,
 'because': 37,
 'is': 200,
 'finally': 126,
 'better': 44,
 'than': 423,
 'your': 498,
 'dreams': 96,
 'friend': 136,
 'someone': 393,
 'who': 479,
 'knows': 217,
 'all': 8,
 'about': 1,
 'still': 407,
 'loves': 244,
 'darkness': 74,
 'cannot': 57,
 'only': 305,
 'light': 227,
 'do': 86,
 'that': 424,
 'hate': 167,
 'we': 471,
 'the': 425,
 'think'