In [14]:
# preprocessing & utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# text handling 
import nltk
import re
import spacy
import string
from nltk.corpus import stopwords

# models for classification
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
def plot_cm(pipeline, X_test, y_test):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(pipeline, X_test, y_test)  
    plt.xticks(rotation=45, fontsize = 10)
    plt.yticks(rotation=0, fontsize = 8)
    plt.title("Confusion matrix")
    plt.show()
    
def eval_accuracy (pred, test):
    correct = 0
    for prediction,true_label in zip(predictions, y_test):
        if prediction==true_label:
            correct += 1
    return (correct/len(predictions))

In [4]:
df = pd.read_csv("data/books_def.csv", index_col=0)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36638 entries, Suzanne Collins to Mimi Baird|Eve Claxton
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   book_desc    36638 non-null  object 
 1   book_rating  36638 non-null  float64
 2   book_title   36638 non-null  object 
 3   genres       36638 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.4+ MB


In [29]:
df.head()

Unnamed: 0,book_authors,book_desc,book_rating,book_title,genres
0,Suzanne Collins,Winning will make you famous. Losing means cer...,4.33,The Hunger Games,Young Adult
1,J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,4.48,Harry Potter and the Order of the Phoenix,Fantasy
2,Harper Lee,The unforgettable novel of a childhood in a sl...,4.27,To Kill a Mockingbird,Classics
3,Stephenie Meyer,About three things I was absolutely positive.F...,3.58,Twilight,Young Adult
4,Markus Zusak,Trying to make sense of the horrors of World W...,4.36,The Book Thief,Historical


In [7]:
df = df.reset_index()

In [8]:
df["book_desc"]

0        Winning will make you famous. Losing means cer...
1        There is a door at the end of a silent corrido...
2        The unforgettable novel of a childhood in a sl...
3        About three things I was absolutely positive.F...
4        Trying to make sense of the horrors of World W...
                               ...                        
36633    A brilliant, provocative novel about an artist...
36634    Avi Steinberg is stumped. After defecting from...
36635    In this fearless and half-crazy story, Howard ...
36636    From the icons of the game to the players who ...
36637    Soon to be a major motion picture, from Brad P...
Name: book_desc, Length: 36638, dtype: object

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
# leggo testi & etichette
X = df["book_desc"]
y = df["genres"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=1)

In [11]:
sample_idx = 50
X_train[sample_idx]

'Winner of the 2007 BookBrowse Award for Most Popular Book.An atmospheric, gritty, and compelling novel of star-crossed lovers, set in the circus world circa 1932, by the bestselling author of Riding Lessons. When Jacob Jankowski, recently orphaned and suddenly adrift, jumps onto a passing train, he enters a world of freaks, drifters, and misfits, a second-rate circus struggling to survive during the Great Depression, making one-night stands in town after endless town. A veterinary student who almost earned his degree, Jacob is put in charge of caring for the circus menagerie. It is there that he meets Marlena, the beautiful young star of the equestrian act, who is married to August, the charismatic but twisted animal trainer. He also meets Rosie, an elephant who seems untrainable until he discovers a way to reach her. Beautifully written, Water for Elephants is illuminated by a wonderful sense of time and place. It tells a story of a love between two people that overcomes incredible o

In [12]:
y_train[sample_idx]


'Fiction'

In [15]:
stopword_list = stopwords.words('english')
punctuations = string.punctuation

def spacy_nlp_tokenizer(text):
    # substituting all space characters with a single space
    text = re.sub('\s+', ' ', text)
    text = re.sub('\r+', ' ', text)
    # removing Url
    text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    #removing mention
    text = re.sub(r'@[\w]+', "", text, flags=re.MULTILINE)
    doc = nlp(text)
    # lemmatizing tokens and lowering case
    lemmas = [token.lemma_.lower() for token in doc]
    
    # removing stopwords and punctuations
    lemmas_nostop = [token for token in lemmas if token not in stopword_list and token not in punctuations]

    # creating ngrams
    lemma_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(lemmas_nostop,2)]
    lemma_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(lemmas_nostop,3)]
    
    all_tokens = list()
    all_tokens.extend(lemmas_nostop)
    all_tokens.extend(lemma_bigrams)
    all_tokens.extend(lemma_trigrams)
    return all_tokens

In [16]:
with open('data/pickle/svm_train_tok.pkl',mode='br') as inputfile:
    X_train_tok = pickle.load(inputfile)
with open('data/pickle/svm_test_tok.pkl',mode='br') as inputfile:
    X_test_tok = pickle.load(inputfile)
with open('data/pickle/vect.pkl',mode='br') as inputfile:
    vect = pickle.load(inputfile)

In [18]:
print("Summary: ", len(X_train), "Genres: ", len(y_train))

Summary:  25646 Genres:  25646


In [19]:
print("Summary: ", len(X_test), "Genres: ", len(y_test))

Summary:  10992 Genres:  10992


In [20]:
print(set(y_test), "\n", set(y_train))

{'Historical', 'Nonfiction', 'Science Fiction', 'Fiction', 'Sequential Art', 'Fantasy', 'Mystery', 'Young Adult', 'Romance', 'Classics'} 
 {'Historical', 'Nonfiction', 'Science Fiction', 'Fiction', 'Sequential Art', 'Fantasy', 'Mystery', 'Young Adult', 'Romance', 'Classics'}


In [21]:
len(vect.vocabulary_)


69255

## Decision tree

In [27]:
dt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=1000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', DecisionTreeClassifier())  # learning algorithm
])

dt_pipeline.fit(X_train_tok,y_train)
predictions = dt_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.27      0.28      0.28       477
        Fantasy       0.44      0.44      0.44      1891
        Fiction       0.35      0.38      0.37      2134
     Historical       0.22      0.19      0.20       643
        Mystery       0.33      0.34      0.34       612
     Nonfiction       0.56      0.57      0.56      1951
        Romance       0.38      0.40      0.39      1362
Science Fiction       0.37      0.32      0.35       525
 Sequential Art       0.37      0.32      0.34       399
    Young Adult       0.26      0.25      0.26       998

       accuracy                           0.39     10992
      macro avg       0.36      0.35      0.35     10992
   weighted avg       0.39      0.39      0.39     10992

Confusion matrix:
[[ 134   41  124   31   17   84   14    6    6   20]
 [  41  835  247   73   62  149  198   68   55  163]
 [ 122  228  809  113   94  267  225   65   22  189]

In [28]:
eval_accuracy(y_test, predictions)

0.3914665211062591

In [24]:
dt_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df = 5)), #tokenization
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', DecisionTreeClassifier())  # learning algorithm
])

dt_pipeline.fit(X_train,y_train)
predictions = dt_pipeline.predict(X_test)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.27      0.23      0.25       477
        Fantasy       0.39      0.42      0.40      1891
        Fiction       0.37      0.38      0.37      2134
     Historical       0.19      0.17      0.18       643
        Mystery       0.26      0.22      0.24       612
     Nonfiction       0.56      0.59      0.58      1951
        Romance       0.41      0.42      0.42      1362
Science Fiction       0.23      0.22      0.23       525
 Sequential Art       0.27      0.22      0.24       399
    Young Adult       0.28      0.27      0.27       998

       accuracy                           0.38     10992
      macro avg       0.32      0.31      0.32     10992
   weighted avg       0.37      0.38      0.37     10992

Confusion matrix:
[[ 110   51  119   34   10   92   26   12    6   17]
 [  46  787  271   74   66  164  192   72   54  165]
 [  77  276  818  133   79  282  170   95   46  158]

In [26]:
eval_accuracy(y_test, predictions)

0.37791120815138285