In [3]:
# preprocessing & visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import spacy
import pickle
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# models for classification
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier


In [4]:
def plot_cm(pipeline, X_test, y_test):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(pipeline, X_test, y_test)  
    plt.xticks(rotation=45, fontsize = 10)
    plt.yticks(rotation=0, fontsize = 10)
    plt.show()

In [5]:
df = pd.read_csv("data/books_def.csv", index_col=0)

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
# leggo testi & etichette
X = df["book_desc"]
y = df["genres"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=1)

In [8]:
with open('data/pickle/svm_train_tok.pkl',mode='br') as inputfile:
    X_train_tok = pickle.load(inputfile)
with open('data/pickle/svm_test_tok.pkl',mode='br') as inputfile:
    X_test_tok = pickle.load(inputfile)

In [9]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.81      0.16      0.26       477
        Fantasy       0.55      0.75      0.63      1891
        Fiction       0.44      0.63      0.52      2134
     Historical       0.83      0.07      0.13       643
        Mystery       0.65      0.44      0.52       612
     Nonfiction       0.68      0.84      0.75      1951
        Romance       0.59      0.65      0.62      1362
Science Fiction       0.71      0.31      0.44       525
 Sequential Art       0.88      0.18      0.30       399
    Young Adult       0.59      0.32      0.41       998

       accuracy                           0.57     10992
      macro avg       0.67      0.43      0.46     10992
   weighted avg       0.61      0.57      0.54     10992

Confusion matrix:
[[  75   36  212    1    4  137    6    2    1    3]
 [   2 1420  203    0   16   73  120   13    5   39]
 [  11  240 1349    5   45  256  150   21    0   57]

In [10]:
param_grid = [{'sel__k': [3000, 5000, 7000],'learner__criterion': ["entropy", "gini"], 'learner__n_estimators': [100, 300, 500]}]

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(bootstrap = True))  # learning algorithm
])

n_jobs = 5  #Number of jobs to run in parallel
opt_search = GridSearchCV(opt_pipeline, param_grid, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train_tok,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 25.1min
[Parallel(n_jobs=5)]: Done  90 out of  90 | elapsed: 56.5min finished


In [11]:
opt_search.best_params_


{'learner__criterion': 'gini', 'learner__n_estimators': 500, 'sel__k': 7000}

In [12]:
opt_search.best_estimator_


Pipeline(steps=[('sel',
                 SelectKBest(k=7000,
                             score_func=<function chi2 at 0x0000019A15962A60>)),
                ('tfidf', TfidfTransformer()),
                ('learner', RandomForestClassifier(n_estimators=500))])

In [13]:
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, opt_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.80      0.16      0.27       477
        Fantasy       0.55      0.76      0.64      1891
        Fiction       0.44      0.63      0.52      2134
     Historical       0.92      0.05      0.10       643
        Mystery       0.67      0.43      0.52       612
     Nonfiction       0.68      0.87      0.76      1951
        Romance       0.61      0.67      0.64      1362
Science Fiction       0.74      0.33      0.45       525
 Sequential Art       0.87      0.17      0.28       399
    Young Adult       0.60      0.34      0.43       998

       accuracy                           0.58     10992
      macro avg       0.69      0.44      0.46     10992
   weighted avg       0.63      0.58      0.54     10992

Confusion matrix:
[[  78   33  209    1    5  141    5    2    1    2]
 [   4 1431  192    0   15   80  115   13    4   37]
 [  10  220 1345    2   45  272  153   20    0   67]

In [None]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(n_estimators=500, criterion = "gini"))  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



In [None]:
plot_cm(dt_pipeline, X_test, y_test)