In [1]:
# preprocessing & visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import spacy
import pickle
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# models for classification
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier


In [2]:
def plot_cm(pipeline, X_test, y_test):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(pipeline, X_test, y_test)  
    plt.xticks(rotation=45, fontsize = 10)
    plt.yticks(rotation=0, fontsize = 10)
    plt.show()

In [3]:
df = pd.read_csv("data/books_def.csv", index_col=0)
fic_nonfic = ['Fiction', 'Nonfiction']
df_binary = df.loc[df['genres'].isin(fic_nonfic)]


In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
# leggo testi & etichette
X = df["book_desc"]
y = df["genres"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=1)

In [6]:
with open('data/pickle/svm_train_tok.pkl',mode='br') as inputfile:
    X_train_tok = pickle.load(inputfile)
with open('data/pickle/svm_test_tok.pkl',mode='br') as inputfile:
    X_test_tok = pickle.load(inputfile)

In [7]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.80      0.13      0.22       477
        Fantasy       0.54      0.71      0.61      1891
        Fiction       0.43      0.63      0.51      2134
     Historical       0.87      0.05      0.10       643
        Mystery       0.70      0.31      0.43       612
     Nonfiction       0.65      0.88      0.74      1951
        Romance       0.57      0.68      0.62      1362
Science Fiction       0.73      0.24      0.36       525
 Sequential Art       0.90      0.13      0.23       399
    Young Adult       0.57      0.28      0.38       998

       accuracy                           0.55     10992
      macro avg       0.68      0.40      0.42     10992
   weighted avg       0.61      0.55      0.51     10992

Confusion matrix:
[[  61   35  201    0    3  167    5    1    0    4]
 [   2 1337  214    1   13  101  161   10    3   49]
 [   8  225 1343    3   26  305  156   14    1   53]

In [8]:
param_grid = [{'sel__k': [3000, 5000, 7000],'learner__criterion': ["entropy", "gini"], 'learner__n_estimators': [100, 300, 500]}]

opt_pipeline = Pipeline([
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(bootstrap = True))  # learning algorithm
])

n_jobs = 5  #Number of jobs to run in parallel
opt_search = GridSearchCV(opt_pipeline, param_grid, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train_tok,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed: 18.9min
[Parallel(n_jobs=5)]: Done  90 out of  90 | elapsed: 43.7min finished


In [9]:
opt_search.best_params_


{'learner__criterion': 'gini', 'learner__n_estimators': 500, 'sel__k': 3000}

In [10]:
opt_search.best_estimator_


Pipeline(steps=[('sel',
                 SelectKBest(k=3000,
                             score_func=<function chi2 at 0x000001869C257F70>)),
                ('tfidf', TfidfTransformer()),
                ('learner', RandomForestClassifier(n_estimators=500))])

In [11]:
opt_predictions = opt_search.best_estimator_.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, opt_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
                 precision    recall  f1-score   support

       Classics       0.80      0.15      0.25       477
        Fantasy       0.56      0.70      0.62      1891
        Fiction       0.44      0.64      0.52      2134
     Historical       0.89      0.06      0.12       643
        Mystery       0.72      0.37      0.49       612
     Nonfiction       0.65      0.87      0.74      1951
        Romance       0.58      0.70      0.63      1362
Science Fiction       0.72      0.30      0.43       525
 Sequential Art       0.90      0.18      0.30       399
    Young Adult       0.57      0.30      0.40       998

       accuracy                           0.56     10992
      macro avg       0.68      0.43      0.45     10992
   weighted avg       0.61      0.56      0.53     10992

Confusion matrix:
[[  72   25  208    0    5  152    8    3    0    4]
 [   4 1326  230    1   15  103  147   13    4   48]
 [   8  201 1356    2   26  299  167   20    1   54]

In [12]:
RF_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(n_estimators=500, criterion = "gini"))  # learning algorithm
])

RF_pipeline.fit(X_train_tok,y_train)
predictions = RF_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)



Classification report:
                 precision    recall  f1-score   support

       Classics       0.78      0.10      0.18       477
        Fantasy       0.56      0.71      0.63      1891
        Fiction       0.42      0.65      0.51      2134
     Historical       1.00      0.02      0.05       643
        Mystery       0.75      0.30      0.42       612
     Nonfiction       0.65      0.89      0.75      1951
        Romance       0.58      0.71      0.64      1362
Science Fiction       0.77      0.24      0.37       525
 Sequential Art       0.93      0.13      0.23       399
    Young Adult       0.61      0.26      0.37       998

       accuracy                           0.56     10992
      macro avg       0.71      0.40      0.42     10992
   weighted avg       0.63      0.56      0.51     10992

Confusion matrix:
[[  50   28  218    0    3  166    7    1    0    4]
 [   3 1347  230    0   11  112  144    7    2   35]
 [   7  204 1390    0   19  305  156   11    0   42]

In [14]:
X_binary = df_binary["book_desc"]
y_binary = df_binary["genres"]


X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_binary, y_binary,  test_size=0.30, random_state=1)

In [15]:
vect= TfidfVectorizer(max_df=0.8, max_features=10000)


In [16]:
print('fit')
# Just creating the features space. It define the dimensions.
vect.fit(X_train_binary) 
print('transform')
#Creating the vectors
X_train_tok_binary = vect.transform(X_train_binary)
print('done')
X_test_tok_binary = vect.transform(X_test_binary)

fit
transform
done


In [17]:
RF_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df = 5)), #tokenization
    ('sel', SelectKBest(chi2, k=7000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier())  # learning algorithm
])

RF_pipeline.fit(X_train_binary, y_train_binary)
predictions_test_binary = RF_pipeline.predict(X_test_binary)
predictions_train_binary = RF_pipeline.predict(X_train_binary)
print('Classification report:')
print(classification_report(y_train_binary, predictions_train_binary))
print('Confusion matrix:')
cm = confusion_matrix(y_train_binary, predictions_train_binary)
print(cm)

Classification report:
              precision    recall  f1-score   support

     Fiction       1.00      1.00      1.00      4990
  Nonfiction       1.00      1.00      1.00      4542

    accuracy                           1.00      9532
   macro avg       1.00      1.00      1.00      9532
weighted avg       1.00      1.00      1.00      9532

Confusion matrix:
[[4989    1]
 [   3 4539]]


In [18]:
predictions_test_binary = RF_pipeline.predict(X_test_binary)
print('Classification report:')
print(classification_report(y_test_binary, predictions_test_binary))
print('Confusion matrix:')
cm = confusion_matrix(y_test_binary, predictions_test_binary)
print(cm)

Classification report:
              precision    recall  f1-score   support

     Fiction       0.85      0.92      0.88      2124
  Nonfiction       0.90      0.82      0.86      1962

    accuracy                           0.87      4086
   macro avg       0.88      0.87      0.87      4086
weighted avg       0.87      0.87      0.87      4086

Confusion matrix:
[[1945  179]
 [ 344 1618]]
