In [2]:
# preprocessing & utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# text handling 
import nltk
import re
import spacy
import string
from nltk.corpus import stopwords

# models for classification
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
def plot_cm(pipeline, X_test, y_test):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(pipeline, X_test, y_test)  
    plt.xticks(rotation=45, fontsize = 10)
    plt.yticks(rotation=0, fontsize = 8)
    plt.title("Confusion matrix")
    plt.show()
    
def eval_accuracy (pred, test):
    correct = 0
    for prediction,true_label in zip(predictions, y_test):
        if prediction==true_label:
            correct += 1
    return (correct/len(predictions))

In [5]:
df = pd.read_csv("data/books_def.csv", index_col=0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36638 entries, Suzanne Collins to Mimi Baird|Eve Claxton
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   book_desc    36638 non-null  object 
 1   book_rating  36638 non-null  float64
 2   book_title   36638 non-null  object 
 3   genres       36638 non-null  object 
dtypes: float64(1), object(3)
memory usage: 1.4+ MB


In [7]:
df.head()

Unnamed: 0_level_0,book_desc,book_rating,book_title,genres
book_authors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Suzanne Collins,Winning will make you famous. Losing means cer...,4.33,The Hunger Games,Young Adult
J.K. Rowling|Mary GrandPré,There is a door at the end of a silent corrido...,4.48,Harry Potter and the Order of the Phoenix,Fantasy
Harper Lee,The unforgettable novel of a childhood in a sl...,4.27,To Kill a Mockingbird,Classics
Stephenie Meyer,About three things I was absolutely positive.F...,3.58,Twilight,Young Adult
Markus Zusak,Trying to make sense of the horrors of World W...,4.36,The Book Thief,Historical


In [8]:
df = df.reset_index()

In [9]:
df["book_desc"]

0        Winning will make you famous. Losing means cer...
1        There is a door at the end of a silent corrido...
2        The unforgettable novel of a childhood in a sl...
3        About three things I was absolutely positive.F...
4        Trying to make sense of the horrors of World W...
                               ...                        
36633    A brilliant, provocative novel about an artist...
36634    Avi Steinberg is stumped. After defecting from...
36635    In this fearless and half-crazy story, Howard ...
36636    From the icons of the game to the players who ...
36637    Soon to be a major motion picture, from Brad P...
Name: book_desc, Length: 36638, dtype: object

In [10]:
nlp = spacy.load("en_core_web_sm")

In [11]:
# leggo testi & etichette
X = df["book_desc"]
y = df["genres"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=1)

In [12]:
sample_idx = 50
X_train[sample_idx]

'Winner of the 2007 BookBrowse Award for Most Popular Book.An atmospheric, gritty, and compelling novel of star-crossed lovers, set in the circus world circa 1932, by the bestselling author of Riding Lessons. When Jacob Jankowski, recently orphaned and suddenly adrift, jumps onto a passing train, he enters a world of freaks, drifters, and misfits, a second-rate circus struggling to survive during the Great Depression, making one-night stands in town after endless town. A veterinary student who almost earned his degree, Jacob is put in charge of caring for the circus menagerie. It is there that he meets Marlena, the beautiful young star of the equestrian act, who is married to August, the charismatic but twisted animal trainer. He also meets Rosie, an elephant who seems untrainable until he discovers a way to reach her. Beautifully written, Water for Elephants is illuminated by a wonderful sense of time and place. It tells a story of a love between two people that overcomes incredible o

In [13]:
y_train[sample_idx]


'Fiction'

In [14]:
stopword_list = stopwords.words('english')
punctuations = string.punctuation

def spacy_nlp_tokenizer(text):
    # substituting all space characters with a single space
    text = re.sub('\s+', ' ', text)
    text = re.sub('\r+', ' ', text)
    # removing Url
    text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    #removing mention
    text = re.sub(r'@[\w]+', "", text, flags=re.MULTILINE)
    doc = nlp(text)
    # lemmatizing tokens and lowering case
    lemmas = [token.lemma_.lower() for token in doc]
    
    # removing stopwords and punctuations
    lemmas_nostop = [token for token in lemmas if token not in stopword_list and token not in punctuations]

    # creating ngrams
    lemma_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(lemmas_nostop,2)]
    lemma_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(lemmas_nostop,3)]
    
    all_tokens = list()
    all_tokens.extend(lemmas_nostop)
    all_tokens.extend(lemma_bigrams)
    all_tokens.extend(lemma_trigrams)
    return all_tokens

In [15]:
with open('data/pickle/svm_train_tok.pkl',mode='br') as inputfile:
    X_train_tok = pickle.load(inputfile)
with open('data/pickle/svm_test_tok.pkl',mode='br') as inputfile:
    X_test_tok = pickle.load(inputfile)
with open('data/pickle/vect.pkl',mode='br') as inputfile:
    vect = pickle.load(inputfile)

In [15]:
print("Summary: ", len(X_train), "Genres: ", len(y_train))

Summary:  25646 Genres:  25646


In [16]:
print("Summary: ", len(X_test), "Genres: ", len(y_test))

Summary:  10992 Genres:  10992


In [17]:
print(set(y_test), "\n", set(y_train))

{'Fiction', 'Fantasy', 'Nonfiction', 'Romance', 'Science Fiction', 'Historical', 'Mystery', 'Classics', 'Sequential Art', 'Young Adult'} 
 {'Fiction', 'Romance', 'Nonfiction', 'Fantasy', 'Science Fiction', 'Historical', 'Mystery', 'Classics', 'Sequential Art', 'Young Adult'}


In [18]:
len(vect.vocabulary_)


10000

## Decision tree

In [19]:
dt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=1000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', DecisionTreeClassifier())  # learning algorithm
])

dt_pipeline.fit(X_train_tok,y_train)
predictions = dt_pipeline.predict(X_test_tok)
predictions_train = dt_pipeline.predict(X_train_tok)


In [20]:
eval_accuracy(y_test, predictions)

0.3913755458515284

In [21]:
print('Classification report training :')
print(classification_report(y_train, predictions_train))

print('Confusion matrix training :')
cm = confusion_matrix(y_train, predictions_train)
print(cm)

Classification report training :
                 precision    recall  f1-score   support

       Classics       0.99      1.00      0.99      1111
        Fantasy       1.00      1.00      1.00      4412
        Fiction       1.00      1.00      1.00      4980
     Historical       1.00      1.00      1.00      1501
        Mystery       1.00      1.00      1.00      1429
     Nonfiction       1.00      1.00      1.00      4553
        Romance       1.00      1.00      1.00      3177
Science Fiction       1.00      1.00      1.00      1225
 Sequential Art       1.00      1.00      1.00       931
    Young Adult       1.00      1.00      1.00      2327

       accuracy                           1.00     25646
      macro avg       1.00      1.00      1.00     25646
   weighted avg       1.00      1.00      1.00     25646

Confusion matrix training :
[[1109    0    0    1    0    1    0    0    0    0]
 [   2 4404    0    1    0    5    0    0    0    0]
 [   6    1 4964    1    0    7 

In [22]:
print('Classification report:')
print(classification_report(y_test, predictions))

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.28      0.25      0.27       477
        Fantasy       0.42      0.43      0.43      1891
        Fiction       0.35      0.37      0.36      2134
     Historical       0.21      0.20      0.20       643
        Mystery       0.32      0.27      0.29       612
     Nonfiction       0.57      0.63      0.60      1951
        Romance       0.41      0.40      0.40      1362
Science Fiction       0.31      0.27      0.29       525
 Sequential Art       0.28      0.23      0.26       399
    Young Adult       0.28      0.28      0.28       998

       accuracy                           0.39     10992
      macro avg       0.34      0.33      0.34     10992
   weighted avg       0.39      0.39      0.39     10992

Confusion matrix:
[[ 121   39  121   24   10  102   22   13   13   12]
 [  46  813  266   73   67  153  207   66   51  149]
 [  84  263  796  136   84  297  199   71   42  162]

In [23]:
dt_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df = 5)), #tokenization
    ('sel', SelectKBest(chi2, k="all")),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', DecisionTreeClassifier())  # learning algorithm
])

dt_pipeline.fit(X_train,y_train)
predictions_train = dt_pipeline.predict(X_train)
predictions = dt_pipeline.predict(X_test)


In [24]:
eval_accuracy(y_test, predictions)

0.36872270742358076

In [25]:
print('Classification report training :')
print(classification_report(y_train, predictions_train))

print('Confusion matrix training :')
cm = confusion_matrix(y_train, predictions_train)
print(cm)

Classification report training :
                 precision    recall  f1-score   support

       Classics       0.99      1.00      0.99      1111
        Fantasy       1.00      1.00      1.00      4412
        Fiction       1.00      1.00      1.00      4980
     Historical       0.99      1.00      1.00      1501
        Mystery       1.00      1.00      1.00      1429
     Nonfiction       1.00      1.00      1.00      4553
        Romance       1.00      1.00      1.00      3177
Science Fiction       1.00      1.00      1.00      1225
 Sequential Art       1.00      1.00      1.00       931
    Young Adult       1.00      1.00      1.00      2327

       accuracy                           1.00     25646
      macro avg       1.00      1.00      1.00     25646
   weighted avg       1.00      1.00      1.00     25646

Confusion matrix training :
[[1110    0    0    1    0    0    0    0    0    0]
 [   2 4409    0    1    0    0    0    0    0    0]
 [   6    1 4972    1    0    0 

In [26]:

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.25      0.25      0.25       477
        Fantasy       0.39      0.39      0.39      1891
        Fiction       0.35      0.37      0.36      2134
     Historical       0.16      0.15      0.16       643
        Mystery       0.26      0.24      0.25       612
     Nonfiction       0.56      0.59      0.58      1951
        Romance       0.41      0.39      0.40      1362
Science Fiction       0.24      0.25      0.24       525
 Sequential Art       0.24      0.22      0.23       399
    Young Adult       0.26      0.25      0.25       998

       accuracy                           0.37     10992
      macro avg       0.31      0.31      0.31     10992
   weighted avg       0.36      0.37      0.37     10992

Confusion matrix:
[[ 118   41  117   23   18   96   15   15   15   19]
 [  53  746  305   84   80  166  165   87   49  156]
 [ 111  260  785  140   89  271  178   97   55  148]

In [27]:
tokenizer = vect
selector = dt_pipeline.named_steps['sel']
classifier = dt_pipeline.named_steps['learner']

In [28]:
feature_names = tokenizer.get_feature_names()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(selector.get_support(),selector.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score)
len(feats_w_score)

IndexError: list index out of range

In [None]:
feats_w_score[-100:]



In [None]:
feats_w_score[-100]

In [17]:
fic_nonfic = ['Fiction', 'Nonfiction']
df_binary = df.loc[df['genres'].isin(fic_nonfic)]


In [18]:
X_binary = df_binary["book_desc"]
y_binary = df_binary["genres"]


X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, y_binary,  test_size=0.30, random_state=1)

In [19]:
vect= TfidfVectorizer(max_df=0.8, max_features=10000)


In [20]:
print('fit')
# Just creating the features space. It define the dimensions.
vect.fit(X_train_binary) 
print('transform')
#Creating the vectors
X_train_tok_binary = vect.transform(X_train_binary)
print('done')
X_test_tok_binary = vect.transform(X_test_binary)

fit
transform
done


In [21]:
RF_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df = 5)), #tokenization
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_binary, y_train_binary)
predictions_test_binary = RF_pipeline.predict(X_test_binary)
predictions_train_binary = RF_pipeline.predict(X_train_binary)
print('Classification report:')
print(classification_report(y_train_binary, predictions_train_binary))
print('Confusion matrix:')
cm = confusion_matrix(y_train_binary, predictions_train_binary)
print(cm)

Classification report:
              precision    recall  f1-score   support

     Fiction       1.00      1.00      1.00      4990
  Nonfiction       1.00      1.00      1.00      4542

    accuracy                           1.00      9532
   macro avg       1.00      1.00      1.00      9532
weighted avg       1.00      1.00      1.00      9532

Confusion matrix:
[[4988    2]
 [   2 4540]]


In [22]:
predictions_test_binary = RF_pipeline.predict(X_test_binary)
print('Classification report:')
print(classification_report(y_test_binary, predictions_test_binary))
print('Confusion matrix:')
cm = confusion_matrix(y_test_binary, predictions_test_binary)
print(cm)

Classification report:
              precision    recall  f1-score   support

     Fiction       0.85      0.92      0.88      2124
  Nonfiction       0.90      0.82      0.86      1962

    accuracy                           0.87      4086
   macro avg       0.88      0.87      0.87      4086
weighted avg       0.88      0.87      0.87      4086

Confusion matrix:
[[1952  172]
 [ 348 1614]]
