In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import classification_report

In [8]:
# Load the book dataset; later cells read its 'genres' and 'book_desc' columns.
df = pd.read_csv(filepath_or_buffer='data/books_def.csv')

In [9]:
# Replace the genre names with integer ids. The fitted encoder is kept so that
# enc.classes_ can translate ids back to genre names when reporting results.
# NOTE: this overwrites df['genres'] in place, so re-running the cell without
# reloading df would fit the encoder on the integer ids instead of the names.
enc = LabelEncoder()
df['genres'] = enc.fit_transform(df['genres'])

In [10]:
# Show the genre names learned by the encoder; position i is the name for id i.
enc.classes_

array(['Classics', 'Fantasy', 'Fiction', 'Historical', 'Mystery',
       'Nonfiction', 'Romance', 'Science Fiction', 'Sequential Art',
       'Young Adult'], dtype=object)

In [11]:
# Pull the encoded target out of the frame. pop() also removes the 'genres'
# column from df, so this cell cannot be re-run without rebuilding df first.
label = df.pop('genres')

# 70/30 split of the book descriptions, stratified on the genre id so both
# partitions keep the same class proportions; the seed makes it reproducible.
train_set, test_set, train_label, test_label = train_test_split(
    df['book_desc'],
    label,
    test_size=0.30,
    random_state=42,
    stratify=label,
)

In [12]:
# TF-IDF features: terms present in more than 80% of documents are ignored and
# the vocabulary is capped at 10,000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8)

# The vocabulary and IDF weights are learned from the training descriptions
# only; the test descriptions are merely projected onto that fitted vocabulary.
xtrain = tfidf_vectorizer.fit_transform(train_set)
xtest = tfidf_vectorizer.transform(test_set)

In [13]:
# Base binary learner; max_iter is raised to 5000 iterations for the solver.
lr = LogisticRegression(max_iter=5000)

# One-vs-one wrapper around the base learner, fitted on the TF-IDF training
# matrix. The fit call is left as the last expression so the cell displays the
# fitted estimator.
clf = OneVsOneClassifier(estimator=lr)
clf.fit(X=xtrain, y=train_label)

OneVsOneClassifier(estimator=LogisticRegression(max_iter=5000))

In [14]:
# Predict genre ids for both partitions so train and test reports can be
# compared side by side in the following cells.
train_pred, test_pred = (clf.predict(features) for features in (xtrain, xtest))

In [15]:
# Per-genre precision/recall/F1 on the training partition.
# `labels` is pinned to the encoder's full id range (LabelEncoder assigns ids
# 0..n-1 in the order of enc.classes_), so each row is always paired with the
# right genre name. Without it, the names are matched against whichever ids
# happen to occur in the data, which fails or misaligns if a genre is absent.
print(classification_report(
    train_label,
    train_pred,
    labels=list(range(len(enc.classes_))),
    target_names=enc.classes_,
))

                 precision    recall  f1-score   support

       Classics       0.88      0.39      0.54      1111
        Fantasy       0.75      0.85      0.80      4412
        Fiction       0.60      0.85      0.70      4980
     Historical       0.89      0.47      0.62      1501
        Mystery       0.86      0.60      0.71      1429
     Nonfiction       0.85      0.93      0.89      4553
        Romance       0.76      0.80      0.78      3177
Science Fiction       0.91      0.50      0.64      1225
 Sequential Art       0.99      0.37      0.54       931
    Young Adult       0.75      0.64      0.69      2327

       accuracy                           0.75     25646
      macro avg       0.82      0.64      0.69     25646
   weighted avg       0.78      0.75      0.74     25646



In [16]:
# Per-genre precision/recall/F1 on the held-out test partition.
# `labels` is pinned to the encoder's full id range (LabelEncoder assigns ids
# 0..n-1 in the order of enc.classes_), so each row is always paired with the
# right genre name. Without it, the names are matched against whichever ids
# happen to occur in the data, which fails or misaligns if a genre is absent.
print(classification_report(
    test_label,
    test_pred,
    labels=list(range(len(enc.classes_))),
    target_names=enc.classes_,
))

                 precision    recall  f1-score   support

       Classics       0.73      0.33      0.46       477
        Fantasy       0.66      0.75      0.70      1891
        Fiction       0.50      0.75      0.60      2134
     Historical       0.71      0.28      0.40       643
        Mystery       0.81      0.45      0.58       612
     Nonfiction       0.79      0.87      0.83      1951
        Romance       0.67      0.71      0.69      1362
Science Fiction       0.84      0.39      0.53       525
 Sequential Art       0.93      0.25      0.40       399
    Young Adult       0.60      0.51      0.55       998

       accuracy                           0.65     10992
      macro avg       0.72      0.53      0.57     10992
   weighted avg       0.68      0.65      0.63     10992

