In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import TransformerMixin
    
from joblib import dump

In [2]:
desc_df = pd.read_csv("book_data_cleaned.csv", encoding="L1")
desc_df.head()

Unnamed: 0.1,Unnamed: 0,book_authors,book_desc,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,pages
0,0,Suzanne Collins,Winning will make you famous. Losing means cer...,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,374
1,1,J.K. Rowling|Mary GrandPrÃ©,There is a door at the end of a silent corrido...,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,870
2,2,Harper Lee,The unforgettable novel of a childhood in a sl...,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,324
3,3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,Â«Ã cosa ormai risaputa che a uno scapolo in ...,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,279
4,4,Stephenie Meyer,About three things I was absolutely positive.F...,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,498


In [3]:
desc_df['good_books'] = np.where(desc_df['book_rating']>=4, 'good', 'bad')

In [4]:
del desc_df['Unnamed: 0']

In [5]:
desc_df.head()

Unnamed: 0,book_authors,book_desc,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,pages,good_books
0,Suzanne Collins,Winning will make you famous. Losing means cer...,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,374,good
1,J.K. Rowling|Mary GrandPrÃ©,There is a door at the end of a silent corrido...,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,870,good
2,Harper Lee,The unforgettable novel of a childhood in a sl...,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,324,good
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,Â«Ã cosa ormai risaputa che a uno scapolo in ...,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,279,good
4,Stephenie Meyer,About three things I was absolutely positive.F...,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,498,bad


In [6]:
desc_df.describe()

Unnamed: 0,book_rating,book_rating_count,book_review_count,pages
count,51779.0,51779.0,51779.0,51779.0
mean,4.01472,45073.58,2083.2968,337.662836
std,0.343791,216649.5,7752.234528,259.533005
min,0.0,0.0,0.0,0.0
25%,3.82,492.0,41.0,216.0
50%,4.02,3095.0,207.0,310.0
75%,4.21,13675.0,873.0,400.0
max,5.0,5588580.0,160776.0,14777.0


In [7]:
X = desc_df['book_desc'].fillna(' ').values
y = desc_df['good_books'].values

In [None]:
# v = TfidfVectorizer(decode_error='replace', encoding='utf-8')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=345)

In [9]:
y_train

array(['good', 'bad', 'good', ..., 'bad', 'bad', 'good'], dtype=object)

In [15]:
nb = make_pipeline(
    CountVectorizer(binary=True),
    TfidfTransformer(),
#     LogisticRegression()
#     RandomForestClassifier(),
    MultinomialNB()
)

In [16]:
# desc_df.dropna()

In [17]:
# desc_df.fillna(' ')

In [18]:
nb.fit(X_train, y_train);

In [19]:
y_pred = nb.predict(X_test)

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.69      0.22      0.33      5932
        good       0.58      0.92      0.71      7013

    accuracy                           0.60     12945
   macro avg       0.64      0.57      0.52     12945
weighted avg       0.63      0.60      0.54     12945



In [100]:
nb.fit(X_train, y_train);

In [None]:
lr_w = make_pipeline(
    CountVectorizer(),
    LogisticRegression()
)

In [None]:
lr_w.fit(X_train, y_train);

In [None]:
y_pred = lr_w.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
lr_c = make_pipeline(
    CountVectorizer(
        analyzer="char",
        ngram_range=(3, 7)
    ),
    TfidfTransformer(),
    LogisticRegression()
)

In [None]:
lr_c.fit(X_train, y_train);

In [None]:
y_pred = lr_c.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
rf = make_pipeline(
    CountVectorizer(),
    RandomForestClassifier()
)

In [None]:
rf.fit(X_train, y_train);

In [None]:
y_pred = rf.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
nb.fit(X, y)

In [None]:
dump(nb, "clf.joblib")