In [2]:
# preprocessing & visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import spacy
import pickle
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# models for classification
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier


In [3]:
def plot_cm(pipeline, X_test, y_test):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(pipeline, X_test, y_test)  
    plt.xticks(rotation=45, fontsize = 10)
    plt.yticks(rotation=0, fontsize = 10)
    plt.show()

In [8]:
df = pd.read_csv("data/books_def_small.csv", index_col=0)

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
# leggo testi & etichette
X = df["book_desc"]
y = df["genres"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=1)

In [11]:
with open('data/pickle/svm_train_tok.pkl',mode='br') as inputfile:
    X_train_tok = pickle.load(inputfile)
with open('data/pickle/svm_test_tok.pkl',mode='br') as inputfile:
    X_test_tok = pickle.load(inputfile)

In [12]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.92      0.11      0.20       302
        Fantasy       0.51      0.81      0.62      1693
        Fiction       0.43      0.73      0.54      1464
     Historical       0.69      0.09      0.16       573
        Mystery       0.73      0.42      0.53       538
     Nonfiction       0.79      0.52      0.63       667
        Romance       0.59      0.66      0.62      1049
Science Fiction       0.82      0.31      0.45       444
 Sequential Art       0.96      0.27      0.42       386
    Young Adult       0.57      0.33      0.42       830

       accuracy                           0.54      7946
      macro avg       0.70      0.42      0.46      7946
   weighted avg       0.62      0.54      0.51      7946

Confusion matrix:
[[  33   46  192    3    2   12   10    2    0    2]
 [   0 1366  148    4   14   17  105    5    1   33]
 [   2  198 1068    6   27   23   99    4    1   36]

In [22]:
param_grid = [{'sel__k': [3000, 5000, 7000],'learner__criterion': ["entropy", "gini"], 'learner__n_estimators': [100, 300, 500]}]

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(bootstrap = True))  # learning algorithm
])

n_jobs = 5  #Number of jobs to run in parallel
opt_search = GridSearchCV(opt_pipeline, param_grid, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train_tok,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 28.2min
[Parallel(n_jobs=5)]: Done  90 out of  90 | elapsed: 64.5min finished


In [23]:
opt_search.best_params_


{'learner__criterion': 'gini', 'learner__n_estimators': 500, 'sel__k': 5000}

In [24]:
opt_search.best_estimator_


Pipeline(steps=[('sel',
                 SelectKBest(k=5000,
                             score_func=<function chi2 at 0x0000022897C189D0>)),
                ('tfidf', TfidfTransformer()),
                ('learner', RandomForestClassifier(n_estimators=500))])

In [25]:
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, opt_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.79      0.12      0.20       477
        Fantasy       0.56      0.72      0.63      1891
        Fiction       0.44      0.67      0.53      2134
     Historical       0.86      0.03      0.05       643
        Mystery       0.74      0.29      0.42       612
     Nonfiction       0.66      0.88      0.75      1951
        Romance       0.59      0.71      0.64      1362
Science Fiction       0.77      0.26      0.39       525
 Sequential Art       0.92      0.14      0.25       399
    Young Adult       0.63      0.27      0.38       998

       accuracy                           0.56     10992
      macro avg       0.69      0.41      0.42     10992
   weighted avg       0.62      0.56      0.52     10992

Confusion matrix:
[[  55   25  215    0    3  166    8    2    0    3]
 [   2 1364  237    0   10   98  136    7    4   33]
 [   7  197 1425    3   15  286  152   12    0   37]

In [13]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(n_estimators=500, criterion = "gini"))  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.97      0.12      0.21       302
        Fantasy       0.51      0.83      0.63      1693
        Fiction       0.45      0.76      0.57      1464
     Historical       0.67      0.06      0.11       573
        Mystery       0.77      0.42      0.54       538
     Nonfiction       0.82      0.56      0.66       667
        Romance       0.60      0.68      0.64      1049
Science Fiction       0.82      0.28      0.42       444
 Sequential Art       0.90      0.26      0.40       386
    Young Adult       0.59      0.34      0.43       830

       accuracy                           0.55      7946
      macro avg       0.71      0.43      0.46      7946
   weighted avg       0.63      0.55      0.52      7946

Confusion matrix:
[[  35   43  192    3    2   12   10    2    0    3]
 [   0 1407  126    3   11   13  103    2    3   25]
 [   1  173 1109    3   20   22  101    3    1   31]

In [14]:
plot_cm(dt_pipeline, X_test, y_test)

NameError: name 'dt_pipeline' is not defined