In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the raw table; later cells read its `genres` and `book_desc` columns.
books_csv_path = 'data/books_fgo.csv'
df = pd.read_csv(books_csv_path)

In [3]:
# Rank genres by how often they occur (value_counts sorts most-frequent first)
# and keep the names of the 11 most common ones.
genre_counts = df['genres'].value_counts()
top_genres = genre_counts.iloc[:11].index.tolist()

In [4]:
# Restrict the dataset to books whose genre is one of the selected top genres.
# `.copy()` makes the result an independent frame, so the later
# `df['genres'] = ...` assignment writes to this frame directly instead of to
# a slice of the original (which triggers pandas' SettingWithCopyWarning).
df = df[df['genres'].isin(top_genres)].copy()

In [34]:
# Replace each genre name with an integer class id. The fitted encoder stays
# available as `enc`, so ids can be mapped back to names via `enc.classes_`.
enc = LabelEncoder().fit(df['genres'])
df['genres'] = enc.transform(df['genres'])

In [47]:
# Show the genre names learned by the encoder; the name at position i is the
# one that was encoded as integer class id i.
enc.classes_

array(['Childrens', 'Classics', 'Fantasy', 'Fiction', 'Historical',
       'Mystery', 'Nonfiction', 'Romance', 'Science Fiction',
       'Sequential Art', 'Young Adult'], dtype=object)

In [5]:
# Split book descriptions (features) and genre labels (targets) into train and
# test partitions.
# The label column is read without `df.pop`, so `df` is not mutated and this
# cell can be re-run without a KeyError for the already-removed column.
label = df['genres']
train_set, test_set, train_label, test_label = train_test_split(
    df['book_desc'],
    label,
    stratify=label,      # keep genre proportions the same in both partitions
    test_size=0.30,      # hold out 30% of the rows for evaluation
    random_state=42,     # fixed seed so the split is reproducible
)

In [6]:
# Learn the TF-IDF vocabulary and weights from the training descriptions only,
# then reuse the fitted vectorizer to transform the held-out descriptions.
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,  # cap the vocabulary at 10,000 terms
    max_df=0.8,          # ignore terms present in more than 80% of documents
)
xtrain = tfidf_vectorizer.fit_transform(train_set)
xtest = tfidf_vectorizer.transform(test_set)

In [7]:
# One-vs-rest genre classifier: `lr` is the base estimator that
# OneVsRestClassifier fits once per genre.
# max_iter=5000 sets the solver's iteration limit explicitly.
lr = LogisticRegression(max_iter=5000)
clf = OneVsRestClassifier(estimator=lr)
clf.fit(xtrain, train_label)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=5000))

In [8]:
# Predict genres for both partitions with the fitted classifier
# (training matrix first, then the held-out matrix).
train_pred, test_pred = (clf.predict(features) for features in (xtrain, xtest))

In [9]:
# Per-genre precision / recall / F1 on the training partition.
# The labels were integer-encoded by `enc`; passing `enc.classes_` as
# `target_names` makes the report rows show genre names instead of bare ids.
print(classification_report(train_label, train_pred, target_names=enc.classes_))

                 precision    recall  f1-score   support

      Childrens       0.91      0.61      0.73       715
       Classics       0.89      0.59      0.71      1035
        Fantasy       0.77      0.92      0.84      4731
        Fiction       0.71      0.87      0.78      3945
     Historical       0.87      0.64      0.74      1491
        Mystery       0.88      0.76      0.81      1385
     Nonfiction       0.89      0.86      0.88      1644
        Romance       0.80      0.84      0.82      2810
Science Fiction       0.89      0.66      0.76      1201
 Sequential Art       0.99      0.56      0.72       934
    Young Adult       0.75      0.77      0.76      2390

       accuracy                           0.80     22281
      macro avg       0.85      0.73      0.78     22281
   weighted avg       0.81      0.80      0.79     22281



In [10]:
# Per-genre precision / recall / F1 on the held-out test partition.
# The labels were integer-encoded by `enc`; passing `enc.classes_` as
# `target_names` makes the report rows show genre names instead of bare ids.
print(classification_report(test_label, test_pred, target_names=enc.classes_))

                 precision    recall  f1-score   support

      Childrens       0.79      0.38      0.52       306
       Classics       0.75      0.39      0.52       444
        Fantasy       0.68      0.84      0.76      2028
        Fiction       0.57      0.78      0.66      1691
     Historical       0.71      0.43      0.53       639
        Mystery       0.75      0.62      0.68       594
     Nonfiction       0.80      0.74      0.77       705
        Romance       0.69      0.76      0.72      1204
Science Fiction       0.79      0.48      0.60       514
 Sequential Art       0.88      0.38      0.53       400
    Young Adult       0.60      0.60      0.60      1025

       accuracy                           0.67      9550
      macro avg       0.73      0.58      0.63      9550
   weighted avg       0.69      0.67      0.66      9550

