In [15]:
# preprocessing & visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import spacy
import pickle
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# models for classification
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier


In [16]:
def plot_cm(pipeline, X_test, y_test):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(pipeline, X_test, y_test)  
    plt.xticks(rotation=45, fontsize = 10)
    plt.yticks(rotation=0, fontsize = 10)
    plt.show()

In [17]:
df = pd.read_csv("data/books_def.csv", index_col=0)
fic_nonfic = ['Fiction', 'Nonfiction']
df_binary = df.loc[df['genres'].isin(fic_nonfic)]


In [18]:
nlp = spacy.load("en_core_web_sm")

In [19]:
# leggo testi & etichette
X = df["book_desc"]
y = df["genres"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=1)

In [6]:
with open('data/pickle/svm_train_tok.pkl',mode='br') as inputfile:
    X_train_tok = pickle.load(inputfile)
with open('data/pickle/svm_test_tok.pkl',mode='br') as inputfile:
    X_test_tok = pickle.load(inputfile)

In [7]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.78      0.12      0.21       477
        Fantasy       0.55      0.70      0.61      1891
        Fiction       0.42      0.64      0.51      2134
     Historical       0.81      0.03      0.06       643
        Mystery       0.69      0.29      0.41       612
     Nonfiction       0.65      0.87      0.74      1951
        Romance       0.56      0.68      0.62      1362
Science Fiction       0.73      0.23      0.35       525
 Sequential Art       0.95      0.15      0.26       399
    Young Adult       0.56      0.26      0.36       998

       accuracy                           0.55     10992
      macro avg       0.67      0.40      0.41     10992
   weighted avg       0.60      0.55      0.51     10992

Confusion matrix:
[[  58   26  210    0    6  166    5    2    0    4]
 [   3 1325  248    0   12  107  136    9    2   49]
 [   7  218 1368    4   19  301  162   11    0   44]

In [8]:
param_grid = [{'sel__k': [3000, 5000, 7000],'learner__criterion': ["entropy", "gini"], 'learner__n_estimators': [100, 300, 500]}]

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(bootstrap = True))  # learning algorithm
])

n_jobs = 5  #Number of jobs to run in parallel
opt_search = GridSearchCV(opt_pipeline, param_grid, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train_tok,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 25.7min
[Parallel(n_jobs=5)]: Done  90 out of  90 | elapsed: 57.0min finished


In [9]:
opt_search.best_params_


{'learner__criterion': 'gini', 'learner__n_estimators': 500, 'sel__k': 3000}

In [10]:
opt_search.best_estimator_


Pipeline(steps=[('sel',
                 SelectKBest(k=3000,
                             score_func=<function chi2 at 0x0000025242C22EE0>)),
                ('tfidf', TfidfTransformer()),
                ('learner', RandomForestClassifier(n_estimators=500))])

In [11]:
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, opt_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.80      0.12      0.20       477
        Fantasy       0.56      0.71      0.62      1891
        Fiction       0.44      0.66      0.53      2134
     Historical       0.88      0.03      0.06       643
        Mystery       0.73      0.31      0.44       612
     Nonfiction       0.65      0.88      0.75      1951
        Romance       0.58      0.70      0.64      1362
Science Fiction       0.74      0.26      0.39       525
 Sequential Art       0.91      0.15      0.26       399
    Young Adult       0.59      0.28      0.38       998

       accuracy                           0.56     10992
      macro avg       0.69      0.41      0.43     10992
   weighted avg       0.62      0.56      0.52     10992

Confusion matrix:
[[  55   23  215    0    4  170    6    1    0    3]
 [   2 1337  231    0   11  110  145    9    4   42]
 [   6  204 1415    2   18  291  142   15    0   41]

In [12]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(n_estimators=500, criterion = "gini"))  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.79      0.11      0.19       477
        Fantasy       0.55      0.72      0.63      1891
        Fiction       0.43      0.66      0.52      2134
     Historical       0.94      0.03      0.05       643
        Mystery       0.74      0.29      0.41       612
     Nonfiction       0.66      0.89      0.76      1951
        Romance       0.59      0.70      0.64      1362
Science Fiction       0.78      0.22      0.34       525
 Sequential Art       0.60      0.13      0.22       399
    Young Adult       0.61      0.26      0.37       998

       accuracy                           0.56     10992
      macro avg       0.67      0.40      0.41     10992
   weighted avg       0.61      0.56      0.52     10992

Confusion matrix:
[[  53   28  218    0    3  162    7    1    2    3]
 [   2 1367  228    0   11  103  131    6    9   34]
 [   7  188 1413    1   18  299  146    9    5   48]

In [13]:
plot_cm(dt_pipeline, X_test, y_test)

NameError: name 'dt_pipeline' is not defined

In [7]:
X_binary = df_binary["book_desc"]
y_binary = df_binary["genres"]


X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, y_binary,  test_size=0.30, random_state=1)

In [11]:
vect= TfidfVectorizer(max_df=0.8, max_features=10000)


In [12]:
print('fit')
# Just creating the features space. It define the dimensions.
vect.fit(X_train_binary) 
print('transform')
#Creating the vectors
X_train_tok_binary = vect.transform(X_train_binary)
print('done')
X_test_tok_binary = vect.transform(X_test_binary)

fit
transform
done


In [13]:
RF_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df = 5)), #tokenization
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_binary, y_train_binary)
predictions_test_binary = RF_pipeline.predict(X_test_binary)
predictions_train_binary = RF_pipeline.predict(X_train_binary)
print('Classification report:')
print(classification_report(y_train_binary, predictions_train_binary))
print('Confusion matrix:')
cm = confusion_matrix(y_train_binary, predictions_train_binary)
print(cm)

Classification report:
              precision    recall  f1-score   support

     Fiction       1.00      1.00      1.00      4990
  Nonfiction       1.00      1.00      1.00      4542

    accuracy                           1.00      9532
   macro avg       1.00      1.00      1.00      9532
weighted avg       1.00      1.00      1.00      9532

Confusion matrix:
[[4987    3]
 [   1 4541]]


In [14]:
predictions_test_binary = RF_pipeline.predict(X_test_binary)
print('Classification report:')
print(classification_report(y_test_binary, predictions_test_binary))
print('Confusion matrix:')
cm = confusion_matrix(y_test_binary, predictions_test_binary)
print(cm)

Classification report:
              precision    recall  f1-score   support

     Fiction       0.85      0.92      0.89      2124
  Nonfiction       0.91      0.83      0.87      1962

    accuracy                           0.88      4086
   macro avg       0.88      0.88      0.88      4086
weighted avg       0.88      0.88      0.88      4086

Confusion matrix:
[[1960  164]
 [ 337 1625]]
