In [2]:
! pip install requests
! pip install beautifulsoup4
! pip install pandas
! pip install gensim
! pip install spacy
! python -m spacy download en_core_web_sm
! pip install scikit-learn
! pip install numpy
! pip install scipy==1.12


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
from collections import Counter

import requests
from bs4 import BeautifulSoup
import pandas as pd


# Scrape quotes and their tags from quotes.toscrape.com, then keep a single
# label per quote: the tag that occurs most often across the whole scrape.
baseUrl = 'https://quotes.toscrape.com/page/'
FIRST_PAGE = 1
LAST_PAGE = 10
REQUEST_TIMEOUT_SECONDS = 10

quotes_data = []           # one {"quote": str, "tags": list[str]} record per quote
tag_frequency = Counter()  # tag -> number of quotes carrying that tag

for page in range(FIRST_PAGE, LAST_PAGE + 1):
    url = f"{baseUrl}{page}/"
    # A timeout stops the cell from hanging forever on a stalled connection,
    # and raise_for_status() turns an HTTP error page into an explicit failure
    # instead of silently producing zero quotes for that page.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    for quote_block in soup.find_all('div', class_='quote'):
        text = quote_block.find('span', class_='text').get_text()
        tags = [tag.get_text() for tag in quote_block.find_all('a', class_='tag')]
        quotes_data.append({"quote": text, "tags": tags})
        tag_frequency.update(tags)


# Collapse each quote's tag list to its globally most frequent tag.
# Quotes without any tag are dropped because they have no label to learn from.
# max() returns the first maximal element, so ties go to the earliest listed tag.
updated_quotes_data = [
    {
        "quote": quote["quote"],
        "tags": max(quote["tags"], key=lambda tag: tag_frequency[tag]),
    }
    for quote in quotes_data
    if quote["tags"]
]

# Explicit columns keep the frame well-formed even if nothing was scraped.
# Note that after this step the "tags" column holds ONE tag string per row.
data = pd.DataFrame(updated_quotes_data, columns=["quote", "tags"])

print(f"Number of quotes: {len(data)}")
print(f"Number of unique labels: {len(data['tags'].unique())}")



ModuleNotFoundError: No module named 'bs4'

* The possible options are BoW, Tf-Idf and Word Embeddings. 
    BoW: This is the simplest option; it tokenizes the words and builds a vector from the frequency of each word in a sentence.
    Tf-Idf: This is a method that I have mainly used for SEO keyword extraction, so I am experienced with it. It is also a frequency-based model: it assigns each word a statistical weight computed from its term frequency and inverse document frequency. The weight signifies the importance of the word.
    Word Embedding: This is the most advanced method; it uses NLP and pretrained models to create vectors. I also commonly use this method for QA retrieval tasks.
* I think the general and most common approach would be to choose word embeddings among these three. Since we have very limited and not very well-labeled data, using a model already trained on a very large dataset would be beneficial. We would also lose any semantic relationships between words if word embeddings were not used. The frequency-based models would be enough for some part of the data, especially when the label word exists inside the quote; however, that is not the case for all quotes. The quotes are short and data 

In [15]:
import gensim.downloader
from gensim.models import KeyedVectors
import numpy as np

# Fetch the pretrained word vectors through gensim's downloader API; the
# resulting object is what `vectorize_sentence` looks words up in.
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
embedding_model = gensim.downloader.load(WORD2VEC_MODEL_NAME)


ImportError: cannot import name 'triu' from 'scipy.linalg' (/Users/emrekoc/.pyenv/versions/3.12.2/lib/python3.12/site-packages/scipy/linalg/__init__.py)

In [None]:
# Adapted from a Medium article: https://medium.com/@reddyyashu20/build-text-classification-model-using-word2vec-nlp-part2-52aa2839e8f4
def vectorize_sentence(sentence, model=None):
    """Average the embedding vectors of the in-vocabulary words of a sentence.

    Args:
        sentence: Iterable of word tokens (e.g. the output of ``tokenizer``).
        model: Object exposing ``vector_size``, ``word in model`` and
            ``model[word]``. Defaults to the notebook-level ``embedding_model``.

    Returns:
        A float64 NumPy array of length ``model.vector_size`` holding the mean
        of the vectors of every token found in the model. If no token is in
        the vocabulary (or the sentence is empty), a zero vector is returned.
    """
    if model is None:
        model = embedding_model

    sentence_vector = np.zeros(model.vector_size)
    matched = 0
    for word in sentence:
        if word in model:
            sentence_vector += model[word]
            matched += 1

    # Divide by the real number of matched words. The previous version started
    # its counter at 1 to dodge division by zero, which shrank every vector by
    # a factor of matched / (matched + 1).
    if matched:
        sentence_vector /= matched
    return sentence_vector

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Filters consulted by the tokenizer: spaCy's English stop-word set and the
# ASCII punctuation characters from the standard library.
punctuations = string.punctuation
stop_words = STOP_WORDS

# Small English pipeline; install it first with:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def tokenizer(sentence):
    """Split a sentence into lowercase lemmas, dropping noise tokens.

    The sentence is run through the notebook-level spaCy pipeline ``nlp``.
    Punctuation, whitespace, and stop words are discarded.

    Args:
        sentence: Raw text of one quote.

    Returns:
        List of lowercase, stripped lemma strings in their original order.
    """
    tokens = []
    for token in nlp(sentence):
        # spaCy's own flags also catch Unicode punctuation such as curly
        # quotes, which a lookup in ASCII-only `string.punctuation` misses.
        if token.is_punct or token.is_space:
            continue

        lemma = token.lemma_.lower().strip()
        if not lemma or lemma in stop_words:
            continue

        # `lemma in punctuations` would be a substring test against a str, so
        # check character by character to also drop runs like "--" or "...".
        if all(char in punctuations for char in lemma):
            continue

        tokens.append(lemma)
    return tokens


In [None]:
from sklearn.preprocessing import LabelEncoder

# Build the model inputs: tokenize each quote, then average the word vectors
# of its tokens into one fixed-length feature vector.
data['quote_token'] = data['quote'].apply(tokenizer)
data['vector_embedding'] = data['quote_token'].apply(vectorize_sentence)

# Encode the string labels as integer class ids. The scraping cell stores each
# quote's label in the "tags" column; there is no "tag" column, so indexing
# data["tag"] raised a KeyError.
label_encoder = LabelEncoder()
data["label"] = label_encoder.fit_transform(data["tags"])




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

# Fixed seed so the train/test split (and therefore the reported accuracy)
# is the same on every Restart & Run All.
RANDOM_STATE = 42

X = data['vector_embedding'].to_list()
y = data['label'].to_list()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# MultinomialNB models non-negative count-like features and rejects negative
# input values. Averaged word-embedding vectors are real-valued with negative
# components, so a Gaussian likelihood is the matching Naive Bayes variant.
model = GaussianNB()
model.fit(X_train, y_train)



In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split and report the share of exact matches
# as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_report = "Accuracy: {:.2f}%".format(accuracy * 100)
print(accuracy_report)