In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# text handling 
import nltk
import re
import spacy
import string
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the book dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/books_def.csv')

In [3]:
# Fetch the NLTK resources, in the same order as before (NLTK reports
# "already up-to-date" for packages that are already installed).
for nltk_package in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Erica\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Move the current index into a regular column and install a fresh 0..n-1 index.
# NOTE(review): this cell is not idempotent -- every re-run moves the index into
# yet another column; run it once per kernel session.
df = df.reset_index(drop=False)

In [5]:
# spaCy pipeline used by `spacy_nlp_tokenizer` to tokenize and lemmatize text.
nlp = spacy.load(name="en_core_web_sm")

In [6]:
# Read the texts (features) and the labels (targets).
X, y = df["book_desc"], df["genres"]

# Hold out 30% of the rows for testing, stratified on the genre label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=1,
)

In [7]:
stopword_list = stopwords.words('english')
punctuations = string.punctuation

# Frozen snapshot of `stopword_list` so the per-token membership test is O(1)
# instead of a linear scan of the list. Rebuild it if `stopword_list` is edited.
_STOPWORD_SET = frozenset(stopword_list)


def spacy_nlp_tokenizer(text):
    """Turn one raw text into a list of unigram, bigram and trigram features.

    Steps:
      1. collapse every whitespace run to a single space;
      2. strip URLs and @mentions;
      3. lemmatize with the module-level spaCy pipeline `nlp` and lower-case;
      4. drop stopwords and punctuation;
      5. append ``BI_a_b`` and ``TRI_a_b_c`` n-gram features built from the
         remaining lemmas.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving lemmas, followed by the bigram features, followed by the
        trigram features.
    """
    # substituting all space characters with a single space
    # (raw string: '\s' in a plain literal is an invalid escape sequence; `\s`
    # already matches '\r', so no separate carriage-return pass is needed)
    text = re.sub(r'\s+', ' ', text)
    # removing Url
    text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    # removing mention
    text = re.sub(r'@[\w]+', "", text, flags=re.MULTILINE)
    doc = nlp(text)

    # lemmatizing tokens and lowering case
    lemmas = [token.lemma_.lower() for token in doc]

    # removing stopwords and punctuations
    # NOTE: `punctuations` is a string, so this is a substring test -- it drops
    # single punctuation characters and any lemma that is a contiguous slice of
    # string.punctuation. Kept as-is so the produced features do not change.
    lemmas_nostop = [
        lemma for lemma in lemmas
        if lemma not in _STOPWORD_SET and lemma not in punctuations
    ]

    # creating ngrams
    lemma_bigrams = ['BI_' + '_'.join(gram) for gram in nltk.ngrams(lemmas_nostop, 2)]
    lemma_trigrams = ['TRI_' + '_'.join(gram) for gram in nltk.ngrams(lemmas_nostop, 3)]

    return lemmas_nostop + lemma_bigrams + lemma_trigrams

In [None]:
# vect=TfidfVectorizer(analyzer=spacy_nlp_tokenizer, min_df = 5)
# print('fit')
# # Just creating the feature space. It defines the dimensions.
# vect.fit(X_train) 
# print('transform')
# #Creating the vectors
# X_train_tok = vect.transform(X_train)
# print('done')
# X_test_tok = vect.transform(X_test)

In [10]:
# Restore the pre-computed train/test matrices and the vectorizer from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickles that were produced by this project.
with open('data/pickle/svm_train_tok.pkl', 'br') as pkl_file:
    X_train_tok = pickle.load(pkl_file)

with open('data/pickle/svm_test_tok.pkl', 'br') as pkl_file:
    X_test_tok = pickle.load(pkl_file)

with open('data/pickle/vect.pkl', 'br') as pkl_file:
    vect = pickle.load(pkl_file)

In [11]:
# Train the multiclass genre classifier; fit() returns the estimator itself,
# so the trailing bare name shows the same repr as before.
lr = LogisticRegression(max_iter=5000).fit(X_train_tok, y_train)
lr

LogisticRegression(max_iter=5000)

In [12]:
# Predict genres for the training matrix first, then for the test matrix.
train_pred, test_pred = (
    lr.predict(features) for features in (X_train_tok, X_test_tok)
)

In [13]:
# Per-genre precision / recall / F1 on the training split.
print(classification_report(y_true=y_train, y_pred=train_pred))

                 precision    recall  f1-score   support

       Classics       0.99      1.00      0.99      1111
        Fantasy       1.00      1.00      1.00      4412
        Fiction       1.00      1.00      1.00      4980
     Historical       1.00      1.00      1.00      1501
        Mystery       1.00      1.00      1.00      1429
     Nonfiction       1.00      1.00      1.00      4553
        Romance       1.00      1.00      1.00      3177
Science Fiction       1.00      1.00      1.00      1225
 Sequential Art       1.00      1.00      1.00       931
    Young Adult       1.00      1.00      1.00      2327

       accuracy                           1.00     25646
      macro avg       1.00      1.00      1.00     25646
   weighted avg       1.00      1.00      1.00     25646



In [14]:
# Per-genre precision / recall / F1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=test_pred))

                 precision    recall  f1-score   support

       Classics       0.53      0.47      0.50       477
        Fantasy       0.64      0.67      0.65      1891
        Fiction       0.51      0.54      0.53      2134
     Historical       0.49      0.44      0.47       643
        Mystery       0.61      0.54      0.58       612
     Nonfiction       0.78      0.80      0.79      1951
        Romance       0.65      0.67      0.66      1362
Science Fiction       0.61      0.52      0.56       525
 Sequential Art       0.66      0.53      0.59       399
    Young Adult       0.48      0.52      0.50       998

       accuracy                           0.61     10992
      macro avg       0.60      0.57      0.58     10992
   weighted avg       0.61      0.61      0.61     10992



In [15]:
# Keep only the rows whose genre is one of the two labels below.
fic_nonfic = ['Fiction', 'Nonfiction']
df_binary = df[df['genres'].isin(fic_nonfic)]


In [16]:
# Texts and labels for the Fiction-vs-Nonfiction task.
X_binary, y_binary = df_binary["book_desc"], df_binary["genres"]

# Hold out 30% of the rows for testing, fixed seed.
# NOTE(review): unlike the multiclass split, no `stratify=` is passed here.
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_binary,
    y_binary,
    test_size=0.30,
    random_state=1,
)

In [17]:
# NOTE: this refits the unpickled `vect` in place, so whatever it learned before
# is replaced by a feature space learned from the binary training texts only.
print('fit')
# Learn the feature space and vectorize the training texts in a single pass.
# fit() followed by transform() on the same data gives the same matrix but runs
# the analyzer over every training document twice.
X_train_tok_binary = vect.fit_transform(X_train_binary)
print('transform')
# Project the held-out texts onto the feature space learned from the training set.
X_test_tok_binary = vect.transform(X_test_binary)
print('done')

fit
transform
done


In [18]:
# Train the Fiction-vs-Nonfiction classifier (this rebinds `lr`); fit() returns
# the estimator itself, so the trailing bare name shows the same repr as before.
lr = LogisticRegression(max_iter=5000).fit(X_train_tok_binary, y_train_binary)
lr

LogisticRegression(max_iter=5000)

In [28]:
# Predict for the binary training matrix first, then for the binary test matrix
# (this rebinds `train_pred` / `test_pred`).
train_pred, test_pred = (
    lr.predict(features) for features in (X_train_tok_binary, X_test_tok_binary)
)

In [29]:
# Precision / recall / F1 for the binary task on the training split.
print(classification_report(y_true=y_train_binary, y_pred=train_pred))

              precision    recall  f1-score   support

     Fiction       0.95      0.96      0.95      4990
  Nonfiction       0.95      0.94      0.95      4542

    accuracy                           0.95      9532
   macro avg       0.95      0.95      0.95      9532
weighted avg       0.95      0.95      0.95      9532



In [21]:
# Precision / recall / F1 for the binary task on the held-out split.
print(classification_report(y_true=y_test_binary, y_pred=test_pred))

              precision    recall  f1-score   support

     Fiction       0.89      0.92      0.90      2124
  Nonfiction       0.91      0.88      0.89      1962

    accuracy                           0.90      4086
   macro avg       0.90      0.90      0.90      4086
weighted avg       0.90      0.90      0.90      4086

