In [25]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
import multiprocessing
from sklearn import model_selection, svm
cores = multiprocessing.cpu_count()


In [73]:
df = pd.read_csv("books_def.csv", index_col=0)
df = df.reset_index()
for col in ['book_authors', 'book_rating', 'book_title']:
    del df[col]

In [74]:
df = df[(df['genres'] == 'Fiction') | (df['genres'] == 'Nonfiction')]

In [75]:
df = df.reset_index()
del df['index']

In [76]:
df

Unnamed: 0,book_desc,genres
0,An ingenious code hidden in the works of Leona...,Fiction
1,"A literary sensation and runaway bestseller, t...",Fiction
2,"﻿Written in his distinctively dazzling manner,...",Fiction
3,Paulo Coelho's masterpiece tells the mystical ...,Fiction
4,Be prepared to meet three unforgettable women:...,Fiction
...,...,...
13613,"A brilliant, provocative novel about an artist...",Fiction
13614,Avi Steinberg is stumped. After defecting from...,Nonfiction
13615,"In this fearless and half-crazy story, Howard ...",Nonfiction
13616,From the icons of the game to the players who ...,Nonfiction


In [77]:
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
df['book_desc'] = df['book_desc'].apply(cleanText)

In [78]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens


In [79]:
# tags=[r.genres] per motivi ignoti SI PRENDE L'INDICE
df['to_ind'] = df['genres']
df = df.set_index('to_ind')

In [80]:
df

Unnamed: 0_level_0,book_desc,genres
to_ind,Unnamed: 1_level_1,Unnamed: 2_level_1
Fiction,an ingenious code hidden in the works of leona...,Fiction
Fiction,"a literary sensation and runaway bestseller, t...",Fiction
Fiction,"﻿written in his distinctively dazzling manner,...",Fiction
Fiction,paulo coelho's masterpiece tells the mystical ...,Fiction
Fiction,be prepared to meet three unforgettable women:...,Fiction
...,...,...
Fiction,"a brilliant, provocative novel about an artist...",Fiction
Nonfiction,avi steinberg is stumped. after defecting from...,Nonfiction
Nonfiction,"in this fearless and half-crazy story, howard ...",Nonfiction
Nonfiction,from the icons of the game to the players who ...,Nonfiction


In [81]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

train_tagged = train.apply(lambda r: TaggedDocument(words=tokenize_text(r['book_desc']), tags=[r.genres]), axis=1)

test_tagged = test.apply(lambda r: TaggedDocument(words=tokenize_text(r['book_desc']), tags=[r.genres]), axis=1)

In [82]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, epochs=200)) for doc in sents])
    return targets, regressors

Distributed Bag of Words

In [83]:
#model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
#model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

In [84]:
#%%time
#for epoch in range(30):
 #   model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
  #  model_dbow.alpha -= 0.002
   # model_dbow.min_alpha = model_dbow.alpha

In [85]:
#y_train, X_train = vec_for_learning(model_dbow, train_tagged)
#y_test, X_test = vec_for_learning(model_dbow, test_tagged)


In [86]:

#logreg = LogisticRegression(n_jobs=1, C=1e5,  max_iter=10000))
#logreg.fit(X_train, y_train)

#y_pred = logreg.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


In [87]:

#SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
#SVM.fit(X_train, y_train)

#y_pred = SVM.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


In [88]:
#clf = RandomForestClassifier(bootstrap=True)

#clf = rand_search.best_estimator_

#clf = clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Distributed Memory (DM)

In [89]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=5, workers=15, alpha=0.065, min_alpha=0.065)
#model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=5, workers=15, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 9532/9532 [00:00<00:00, 6409122.43it/s]


In [90]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 9532/9532 [00:00<00:00, 6153625.63it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7077377.54it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7311650.65it/s]
100%|██████████| 9532/9532 [00:00<00:00, 6560568.71it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7303636.41it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7505182.23it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7333108.17it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7422968.01it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7457583.61it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7510822.04it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7517883.74it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7481307.21it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7523542.67it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7499550.88it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7416083.42it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7372322.65it/s]
100%|██████████| 9532/9532 [00:00<00:00, 7551965.57it/s]
100%|██████████| 9532/9532 [00:

CPU times: user 1min 35s, sys: 6.22 s, total: 1min 41s
Wall time: 29.4 s


In [91]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

In [92]:

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train, y_train)

y_pred = SVM.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.8409202153695545
Testing F1 score: 0.8410255395701087


In [94]:
logreg = LogisticRegression(n_jobs=1, C=1e5,  max_iter=10000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.8470386686245717
Testing F1 score: 0.8471158575508124


In [95]:
clf = RandomForestClassifier(bootstrap=True)

clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.7440039158100832
Testing F1 score: 0.7426584809804719


Combined!

In [22]:
#model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
#model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [23]:
#new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [24]:
#def get_vectors(model, tagged_docs):
#    sents = tagged_docs.values
 #   targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
 #   return targets, regressors

In [25]:
#y_train, X_train = get_vectors(new_model, train_tagged)
#y_test, X_test = get_vectors(new_model, test_tagged)

In [26]:

#SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
#SVM.fit(X_train, y_train)

#y_pred = SVM.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.1826783114992722
Testing F1 score: 0.13060293283431898


In [27]:

#logreg.fit(X_train, y_train)
#y_pred = logreg.predict(X_test)
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.12527292576419213
Testing F1 score: 0.11406939588161567


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
#clf = RandomForestClassifier(bootstrap=True)

#clf = clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.18340611353711792
Testing F1 score: 0.1421811719743764
