In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
import multiprocessing
from sklearn import model_selection, svm
cores = multiprocessing.cpu_count()


In [2]:
df = pd.read_csv("books_def.csv", index_col=0)
df = df.reset_index()
for col in ['book_authors', 'book_rating', 'book_title']:
    del df[col]

In [3]:
df

Unnamed: 0,book_desc,genres
0,Winning will make you famous. Losing means cer...,Young Adult
1,There is a door at the end of a silent corrido...,Fantasy
2,The unforgettable novel of a childhood in a sl...,Classics
3,About three things I was absolutely positive.F...,Young Adult
4,Trying to make sense of the horrors of World W...,Historical
...,...,...
36633,"A brilliant, provocative novel about an artist...",Fiction
36634,Avi Steinberg is stumped. After defecting from...,Nonfiction
36635,"In this fearless and half-crazy story, Howard ...",Nonfiction
36636,From the icons of the game to the players who ...,Nonfiction


In [4]:
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
df['book_desc'] = df['book_desc'].apply(cleanText)

In [5]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens


In [6]:
# tags=[r.genres] per motivi ignoti SI PRENDE L'INDICE
df['to_ind'] = df['genres']
df = df.set_index('to_ind')

In [7]:
df

Unnamed: 0_level_0,book_desc,genres
to_ind,Unnamed: 1_level_1,Unnamed: 2_level_1
Young Adult,winning will make you famous. losing means cer...,Young Adult
Fantasy,there is a door at the end of a silent corrido...,Fantasy
Classics,the unforgettable novel of a childhood in a sl...,Classics
Young Adult,about three things i was absolutely positive.f...,Young Adult
Historical,trying to make sense of the horrors of world w...,Historical
...,...,...
Fiction,"a brilliant, provocative novel about an artist...",Fiction
Nonfiction,avi steinberg is stumped. after defecting from...,Nonfiction
Nonfiction,"in this fearless and half-crazy story, howard ...",Nonfiction
Nonfiction,from the icons of the game to the players who ...,Nonfiction


In [8]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

train_tagged = train.apply(lambda r: TaggedDocument(words=tokenize_text(r['book_desc']), tags=[r.genres]), axis=1)

test_tagged = test.apply(lambda r: TaggedDocument(words=tokenize_text(r['book_desc']), tags=[r.genres]), axis=1)

In [9]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, epochs=200)) for doc in sents])
    return targets, regressors

Distributed Bag of Words

In [10]:
#model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
#model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

In [11]:
#%%time
#for epoch in range(30):
 #   model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
  #  model_dbow.alpha -= 0.002
   # model_dbow.min_alpha = model_dbow.alpha

In [12]:
#y_train, X_train = vec_for_learning(model_dbow, train_tagged)
#y_test, X_test = vec_for_learning(model_dbow, test_tagged)


In [13]:

#logreg = LogisticRegression(n_jobs=1, C=1e5,  max_iter=10000))
#logreg.fit(X_train, y_train)

#y_pred = logreg.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


In [14]:

#SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
#SVM.fit(X_train, y_train)

#y_pred = SVM.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


In [15]:
#clf = RandomForestClassifier(bootstrap=True)

#clf = rand_search.best_estimator_

#clf = clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Distributed Memory (DM)

In [16]:
model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=5, workers=15, alpha=0.065, min_alpha=0.065)
#model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=5, workers=15, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 25646/25646 [00:00<00:00, 2742030.65it/s]


In [17]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 25646/25646 [00:00<00:00, 7123650.36it/s]
100%|██████████| 25646/25646 [00:00<00:00, 7134044.33it/s]
100%|██████████| 25646/25646 [00:00<00:00, 7626169.47it/s]
100%|██████████| 25646/25646 [00:00<00:00, 7505904.71it/s]
100%|██████████| 25646/25646 [00:00<00:00, 7115639.37it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6727991.02it/s]
100%|██████████| 25646/25646 [00:00<00:00, 7254813.54it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6894002.46it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6313736.01it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6638716.31it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6657617.16it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6277992.32it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6225309.36it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6020098.52it/s]
100%|██████████| 25646/25646 [00:00<00:00, 5637985.24it/s]
100%|██████████| 25646/25646 [00:00<00:00, 6128133.10it/s]
100%|██████████| 25646/25646 [00:00<00:00, 5868364.45it/

CPU times: user 4min 45s, sys: 20.7 s, total: 5min 5s
Wall time: 1min 17s


In [18]:
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

In [19]:

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train, y_train)

y_pred = SVM.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.5385735080058224
Testing F1 score: 0.5439093621982938


In [20]:
#seeee ciao

#params =param_grid = {'C': [0.1, 1, 10, 100, 1000],
 #             'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              #'kernel': ['rbf']}
#SVM = svm.SVC()
#rand_search = RandomizedSearchCV(SVM, params, verbose=0, random_state=42)
#rand_search.fit(X_train, y_train)
#clf = rand_search.best_estimator_
#clf.fit(X_train, y_train)

#y_pred = SVM.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#p#rint('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


In [21]:
logreg = LogisticRegression(n_jobs=1, C=1e5,  max_iter=10000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.5376637554585153
Testing F1 score: 0.5472231879510425


In [22]:
clf = RandomForestClassifier(bootstrap=True)

clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.4666120815138282
Testing F1 score: 0.4152750611251759


In [24]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras import Model
from tensorflow.keras.utils import to_categorical
from keras.regularizers import l2

ImportError: dlopen(/Users/federico/miniforge3/envs/tf/lib/python3.8/site-packages/tensorflow/python/_pywrap_tfe.so, 2): Library not loaded: @rpath/_pywrap_tensorflow_internal.so
  Referenced from: /Users/federico/miniforge3/envs/tf/lib/python3.8/site-packages/tensorflow/python/_pywrap_tfe.so
  Reason: image not found

In [23]:

#default batch size 32
#Optional Int, maximum length of all sequences. 
#If not provided, sequences will be padded to the length of the longest individual sequence.
maxlen = 100 
embed_size = 64 
#Pads sequences to the same length.
X_train_final = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test_final = pad_sequences(list_tokenized_test, maxlen=maxlen)


input = Input(shape=(maxlen, )) 
x   =  Embedding(max_features, embed_size)(input)
x   =  Dropout(0.2)(x)
x   =  Conv1D(10, 3, padding='valid',activation='relu', strides=1)(x)
x   =  GlobalMaxPooling1D()(x)
x   =  Dense(264, activation="relu", kernel_regularizer=l2(0.01), bias_regularizer=l2(0.02))(x)
x   =  Dropout(0.2)(x)
x   =  Dense(128, activation="relu", kernel_regularizer=l2(0.01), bias_regularizer=l2(0.02))(x)
x   =  Dropout(0.2)(x)
x   =  Dense(64, activation="relu", kernel_regularizer=l2(0.01), bias_regularizer=l2(0.02))(x)
x   =  Dropout(0.2)(x)
x   =  Dense(32, activation="softmax", kernel_regularizer=l2(0.01), bias_regularizer=l2(0.02))(x)

model = Model(inputs=input, outputs=x)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


cnn5 = model.fit(X_train_final, y_train, epochs=20,validation_split=0.2)

NameError: name 'pad_sequences' is not defined

Combined!

In [22]:
#model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
#model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [23]:
#new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [24]:
#def get_vectors(model, tagged_docs):
#    sents = tagged_docs.values
 #   targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words)) for doc in sents])
 #   return targets, regressors

In [25]:
#y_train, X_train = get_vectors(new_model, train_tagged)
#y_test, X_test = get_vectors(new_model, test_tagged)

In [26]:

#SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
#SVM.fit(X_train, y_train)

#y_pred = SVM.predict(X_test)

#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.1826783114992722
Testing F1 score: 0.13060293283431898


In [27]:

#logreg.fit(X_train, y_train)
#y_pred = logreg.predict(X_test)
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.12527292576419213
Testing F1 score: 0.11406939588161567


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
#clf = RandomForestClassifier(bootstrap=True)

#clf = clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)
#print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
#print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.18340611353711792
Testing F1 score: 0.1421811719743764
