In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data/books_fgo.csv')

In [3]:
# value_counts() sorts by descending frequency, so the first 11 index
# entries are the 11 most frequent values of the `genres` column.
genre_counts = df['genres'].value_counts()
top_genres = genre_counts.head(11).index.tolist()

In [4]:
# Keep only the rows whose genre is one of the most frequent genres.
# The explicit .copy() makes `df` an independent frame: a later cell assigns
# to df['genres'], and writing into a boolean-mask selection would otherwise
# raise pandas' SettingWithCopyWarning.
df = df[df['genres'].isin(top_genres)].copy()

In [34]:
# Fit the encoder on the genre strings, then overwrite the column with the
# corresponding integer codes. `enc` is kept around so the codes can be
# mapped back to genre names via enc.classes_.
enc = LabelEncoder().fit(df['genres'])
df['genres'] = enc.transform(df['genres'])

In [47]:
# Show the classes learned by the encoder; the class at position i is the
# genre that was encoded as integer i.
enc.classes_

array(['Childrens', 'Classics', 'Fantasy', 'Fiction', 'Historical',
       'Mystery', 'Nonfiction', 'Romance', 'Science Fiction',
       'Sequential Art', 'Young Adult'], dtype=object)

In [5]:
# Select the target by indexing instead of df.pop('genres'): pop removes the
# column from `df`, so re-running this cell would fail with a KeyError.
label = df['genres']

# 70/30 split of the description text, stratified on the label so both
# partitions keep the same genre proportions; random_state pins the shuffle.
train_set, test_set, train_label, test_label = train_test_split(
    df['book_desc'],
    label,
    test_size=0.30,
    stratify=label,
    random_state=42,
)

In [6]:
# TF-IDF features: terms appearing in more than 80% of documents are dropped
# (max_df) and the vocabulary is capped at 10,000 terms (max_features).
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    max_df=0.8,
)

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto that fitted vocabulary.
xtrain = tfidf_vectorizer.fit_transform(raw_documents=train_set)
xtest = tfidf_vectorizer.transform(raw_documents=test_set)

In [None]:
# Base binary learner; max_iter is raised to 5000 iterations.
lr = LogisticRegression(max_iter=5000)

# One-vs-rest wrapper: one copy of `lr` is trained per genre.
clf = OneVsRestClassifier(estimator=lr)
clf.fit(X=xtrain, y=train_label)

In [None]:
# Predict labels for both partitions with the fitted classifier
# (train first, then test).
train_pred, test_pred = map(clf.predict, (xtrain, xtest))

In [None]:
# Training-set report. The labels are integer codes produced by `enc`, so
# pass the encoder's classes as target_names to print genre names instead of
# bare integers. `labels` is given explicitly so the rows always line up
# with target_names.
print(classification_report(
    train_label,
    train_pred,
    labels=list(range(len(enc.classes_))),
    target_names=enc.classes_,
))

In [None]:
# Held-out test-set report. The labels are integer codes produced by `enc`,
# so pass the encoder's classes as target_names to print genre names instead
# of bare integers. `labels` is given explicitly so the rows always line up
# with target_names.
print(classification_report(
    test_label,
    test_pred,
    labels=list(range(len(enc.classes_))),
    target_names=enc.classes_,
))