In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import classification_report

In [31]:
df = pd.read_csv('data/books_fgo.csv')

In [32]:
top_genres = list(df.genres.value_counts()[:11].index)

In [33]:
df = df[df.genres.isin(top_genres)]

In [34]:
enc = LabelEncoder()
df['genres'] = enc.fit_transform(df.genres)

In [47]:
enc.classes_

array(['Childrens', 'Classics', 'Fantasy', 'Fiction', 'Historical',
       'Mystery', 'Nonfiction', 'Romance', 'Science Fiction',
       'Sequential Art', 'Young Adult'], dtype=object)

In [35]:
label = df.pop('genres')
train_set, test_set, train_label, test_label = train_test_split(df['book_desc'], 
                                           label, stratify=label, test_size=0.30, random_state=42)

In [36]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
xtrain = tfidf_vectorizer.fit_transform(train_set)
xtest = tfidf_vectorizer.transform(test_set)

In [39]:
lr = LogisticRegression(max_iter=5000)
clf = OneVsRestClassifier(lr)
clf.fit(xtrain, train_label)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=5000))

In [49]:
train_pred = clf.predict(xtrain)
test_pred = clf.predict(xtest)

In [50]:
print(classification_report(train_label, train_pred))

              precision    recall  f1-score   support

           0       0.92      0.45      0.61       715
           1       0.89      0.50      0.64      1035
           2       0.73      0.91      0.81      4731
           3       0.63      0.88      0.73      3945
           4       0.87      0.54      0.67      1491
           5       0.89      0.68      0.77      1385
           6       0.90      0.80      0.85      1644
           7       0.79      0.82      0.81      2810
           8       0.90      0.54      0.68      1201
           9       0.99      0.45      0.62       934
          10       0.73      0.74      0.73      2390

    accuracy                           0.75     22281
   macro avg       0.84      0.66      0.72     22281
weighted avg       0.78      0.75      0.75     22281



In [51]:
print(classification_report(test_label, test_pred))

              precision    recall  f1-score   support

           0       0.86      0.27      0.42       306
           1       0.80      0.35      0.49       444
           2       0.65      0.84      0.73      2028
           3       0.50      0.81      0.62      1691
           4       0.78      0.36      0.49       639
           5       0.79      0.54      0.64       594
           6       0.84      0.68      0.75       705
           7       0.70      0.73      0.71      1204
           8       0.79      0.39      0.53       514
           9       0.94      0.29      0.44       400
          10       0.58      0.58      0.58      1025

    accuracy                           0.64      9550
   macro avg       0.75      0.53      0.58      9550
weighted avg       0.69      0.64      0.63      9550

