In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import gensim as gm
import nltk
import re 

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
page = 1

authors_lst = []
quotes_lst = []
tags_lst = []


while page < 11: 
    url = 'http://quotes.toscrape.com/page/{}'.format(page)
    scrape = requests.get(url)
    parsed = BeautifulSoup(scrape.content, "html.parser")
    
    curr_authors = list(map(lambda x: x.text, parsed.find_all(class_ = "author")))
    curr_quotes = list(map(lambda x: x.text, parsed.find_all(class_ = 'text')))
    
    tag_groups = parsed.find_all("div", class_ = "tags")

    curr_tags = []
    for i in range(len(tag_groups)):

        tags = tag_groups[i].find_all("a", class_ = "tag")

        curr_group = []
        for tag in tags:

            curr_group.append(tag.text)

        curr_tags.append(curr_group)
    
    authors_lst += curr_authors
    quotes_lst += curr_quotes
    tags_lst += curr_tags
    
    page += 1
    
data = {"Author":authors_lst, "Quote":quotes_lst, "Tags":tags_lst}
df = pd.DataFrame(data)

In [3]:
def quotes_cleaning(text):
    
    text = text.lower()
    
    text = re.sub('[^A-Za-z\s]', '', text)
    
    return text

df['Quote'] = df['Quote'].apply(quotes_cleaning)
df['Author'] = df['Author'].apply(quotes_cleaning)

In [4]:
def tags_cleaning(text):
    
    if len(text) == 0:
        
        return np.NaN
    
    text = re.sub('[\[ \]]', ' ', str(text))
    text = re.sub('[^\w]', ' ', text)
    text = re.sub('[\s]', ' ', text)
    
    return text

df['Tags'] = df['Tags'].apply(tags_cleaning)

In [5]:
# list(df['Tags'])
pd.set_option('display.max_rows', 100)

df = df.dropna()
#df['Tags'] = df['Tags'].apply(lambda x: ' '.join(x.split())).apply(lambda x: x.split(' '))

In [6]:
x = df.drop(columns = ['Tags'])
y = df['Tags']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [7]:
to_be_encoded = ['Quote']

onehot = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
transformed = ColumnTransformer([('onehot', onehot, to_be_encoded)])

prediction_pl = Pipeline([('transformer', transformed), ('classifier', DecisionTreeClassifier())])

prediction_pl.fit(x_train, y_train)
prediction_pl.predict(x_test)#, y_test)


array(['  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  ',
       '  inspirational  ', '  inspirational  ', '  inspirational  '],
      dtype=object)