In [1]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn import model_selection, svm
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC  

In [4]:
df = pd.read_csv("books_def.csv", index_col=0).reset_index()
df = df[(df['genres'] == 'Fiction') | (df['genres'] == 'Nonfiction')]
df = df.reset_index()
del df['index']

In [5]:
df

Unnamed: 0,book_authors,book_desc,book_rating,book_title,genres
0,Dan Brown,An ingenious code hidden in the works of Leona...,3.81,The Da Vinci Code,Fiction
1,Arthur Golden,"A literary sensation and runaway bestseller, t...",4.09,Memoirs of a Geisha,Fiction
2,Oscar Wilde|Jeffrey Eugenides,"﻿Written in his distinctively dazzling manner,...",4.06,The Picture of Dorian Gray,Fiction
3,Paulo Coelho|Alan R. Clarke|Özdemir İnce,Paulo Coelho's masterpiece tells the mystical ...,3.84,The Alchemist,Fiction
4,Kathryn Stockett,Be prepared to meet three unforgettable women:...,4.46,The Help,Fiction
...,...,...,...,...,...
13613,Siri Hustvedt,"A brilliant, provocative novel about an artist...",3.67,The Blazing World,Fiction
13614,Avi Steinberg,Avi Steinberg is stumped. After defecting from...,3.51,Running the Books: The Adventures of an Accide...,Nonfiction
13615,Howard Megdal,"In this fearless and half-crazy story, Howard ...",3.37,Taking the Field: A Fan's Quest to Run the Tea...,Nonfiction
13616,Howard Megdal,From the icons of the game to the players who ...,3.97,"The Baseball Talmud: Koufax, Greenberg, and th...",Nonfiction


In [6]:
# Clean data using  built in cleaner in gensim
df['text_clean'] = df['book_desc'].apply(lambda x: gensim.utils.simple_preprocess(x))
df

Unnamed: 0,book_authors,book_desc,book_rating,book_title,genres,text_clean
0,Dan Brown,An ingenious code hidden in the works of Leona...,3.81,The Da Vinci Code,Fiction,"[an, ingenious, code, hidden, in, the, works, ..."
1,Arthur Golden,"A literary sensation and runaway bestseller, t...",4.09,Memoirs of a Geisha,Fiction,"[literary, sensation, and, runaway, bestseller..."
2,Oscar Wilde|Jeffrey Eugenides,"﻿Written in his distinctively dazzling manner,...",4.06,The Picture of Dorian Gray,Fiction,"[written, in, his, distinctively, dazzling, ma..."
3,Paulo Coelho|Alan R. Clarke|Özdemir İnce,Paulo Coelho's masterpiece tells the mystical ...,3.84,The Alchemist,Fiction,"[paulo, coelho, masterpiece, tells, the, mysti..."
4,Kathryn Stockett,Be prepared to meet three unforgettable women:...,4.46,The Help,Fiction,"[be, prepared, to, meet, three, unforgettable,..."
...,...,...,...,...,...,...
13613,Siri Hustvedt,"A brilliant, provocative novel about an artist...",3.67,The Blazing World,Fiction,"[brilliant, provocative, novel, about, an, art..."
13614,Avi Steinberg,Avi Steinberg is stumped. After defecting from...,3.51,Running the Books: The Adventures of an Accide...,Nonfiction,"[avi, steinberg, is, stumped, after, defecting..."
13615,Howard Megdal,"In this fearless and half-crazy story, Howard ...",3.37,Taking the Field: A Fan's Quest to Run the Tea...,Nonfiction,"[in, this, fearless, and, half, crazy, story, ..."
13616,Howard Megdal,From the icons of the game to the players who ...,3.97,"The Baseball Talmud: Koufax, Greenberg, and th...",Nonfiction,"[from, the, icons, of, the, game, to, the, pla..."


In [7]:
i = 0
g_dict = {}
for g in df.genres.unique():
    g_dict[g] = i
    i=i+1
    

In [8]:
# Encoding the label column
df['genres']=df['genres'].map(g_dict)
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (df['text_clean'], df['genres'] , test_size=0.3)

In [9]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=300,
                                   window=10,
                                   min_count=2)

In [10]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [11]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [12]:
# Instantiate and fit a basic Random Forest model on top of the vectors
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())
y_pred = rf_model.predict(X_test_vect_avg)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.8225648556045032
Testing F1 score: 0.8221651129846815


In [13]:

logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000000)
logreg.fit(X_train_vect_avg, y_train.values.ravel())

y_pred = logreg.predict(X_test_vect_avg)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.8749388154674498
Testing F1 score: 0.8749205520045532


STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:

SVM = svm.SVC()
SVM.fit(X_train_vect_avg, y_train.values.ravel())

y_pred = SVM.predict(X_test_vect_avg)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))


Testing accuracy 0.8414096916299559
Testing F1 score: 0.8413306055353243
