## Importing Libraries & getting Data

In [1]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.goodreads.com/quotes/tag/{}?page={}'
sentiments = ['death', 'love']

In [3]:
death_quotes = url.format(sentiments[0],1)
death_quotes

'https://www.goodreads.com/quotes/tag/death?page=1'

In [4]:
def get_quotes(death_quotes):
    data = requests.get(death_quotes)
    soup = BeautifulSoup(data.text)
    divs = soup.find_all('div',attrs={'class' : 'quoteText'})

    quotes = [div.text.strip().split('\n')[0][1:-1] for div in divs]
    return quotes

quotes = get_quotes(death_quotes)

In [5]:
X, y =[] , []

for sentiment in sentiments:
    for i in range(1, 4):
        death_quotes = url.format(sentiment ,i)
        quotes = get_quotes(death_quotes)

        X.extend(quotes)
        y.extend([sentiment] * len(quotes))
        print(f'Processed page {i} for {sentiment} ')

Processed page 1 for death 
Processed page 2 for death 
Processed page 3 for death 
Processed page 1 for love 
Processed page 2 for love 
Processed page 3 for love 


In [6]:
X[69] , y[69]

('A man with outward courage dares to die; a man with inner courage dares to live.',
 'death')

## Converting to Dataframe

In [7]:
df = pd.DataFrame(list(zip(y,X)), columns=['sentiment' , 'quotes'])
df.to_csv('emotions.csv', index=False)

## NLP pipelining

In [8]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)

In [10]:
tokenizer = RegexpTokenizer('\w+')
ps = PorterStemmer()
sw = set(stopwords.words('english'))

In [11]:
def getStemmedQuote(quote):
    quote = quote.lower()

    # tokenization --> breaking down the document i.e doc->sents->words )
    tokens = tokenizer.tokenize(quote)

    # stop-word removal
    new_tokens = [token for token in tokens if token not in sw]

    # Stemming --> convert diff forms of same word into single word
    stemmed_token = [ps.stem(token) for token in new_tokens]

    # Joining the stemmed quotes
    clean_quote = ' '.join(stemmed_token)
    return clean_quote


def getStemmedQuotes(quotes):
    d =[]
    for quote in quotes:
        d.append(getStemmedQuote(quote))
    return d


In [12]:
X = getStemmedQuotes(X)

In [13]:
cv.fit(X)

CountVectorizer(max_features=500)

In [14]:
print('Length of vocabulary : {}'.format(len(cv.vocabulary_)))

Length of vocabulary : 500


In [15]:
X_mod = cv.transform(X).todense()
X_mod

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Train Test Split

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_mod, y, test_size=0.33)

## Naive Bayes Model

In [17]:
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train, y_train)

BernoulliNB()

In [18]:
print('Model Score --> {}'.format(model.score(X_test, y_test)))

Model Score --> 0.7833333333333333


## Checking Sample Quote

In [19]:
sample_line = "Gone from our sight, but never from our hearts"

X_sample = cv.transform([sample_line]).todense()

model.predict(X_sample)

array(['death'], dtype='<U5')