In [57]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import TransformerMixin
    
from joblib import dump

In [58]:
desc_df = pd.read_csv("book_data_cleaned.csv", encoding="L1")
desc_df.head()

Unnamed: 0.1,Unnamed: 0,book_authors,book_desc,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,pages
0,0,Suzanne Collins,Winning will make you famous. Losing means cer...,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,374
1,1,J.K. Rowling|Mary GrandPrÃ©,There is a door at the end of a silent corrido...,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,870
2,2,Harper Lee,The unforgettable novel of a childhood in a sl...,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,324
3,3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,Â«Ã cosa ormai risaputa che a uno scapolo in ...,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,279
4,4,Stephenie Meyer,About three things I was absolutely positive.F...,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,498


In [59]:
desc_df.dtypes

Unnamed: 0             int64
book_authors          object
book_desc             object
book_isbn             object
book_pages            object
book_rating          float64
book_rating_count      int64
book_review_count      int64
book_title            object
genres                object
image_url             object
pages                  int64
dtype: object

In [60]:
desc_df['good_books'] = np.where(desc_df['book_rating']>=4, 'good', 'bad')

In [61]:
del desc_df['Unnamed: 0']

In [62]:
desc_df.head()

Unnamed: 0,book_authors,book_desc,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,pages,good_books
0,Suzanne Collins,Winning will make you famous. Losing means cer...,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,374,good
1,J.K. Rowling|Mary GrandPrÃ©,There is a door at the end of a silent corrido...,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,870,good
2,Harper Lee,The unforgettable novel of a childhood in a sl...,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,324,good
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,Â«Ã cosa ormai risaputa che a uno scapolo in ...,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,279,good
4,Stephenie Meyer,About three things I was absolutely positive.F...,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,498,bad


In [63]:
desc_df.describe()

Unnamed: 0,book_rating,book_rating_count,book_review_count,pages
count,51779.0,51779.0,51779.0,51779.0
mean,4.01472,45073.58,2083.2968,337.662836
std,0.343791,216649.5,7752.234528,259.533005
min,0.0,0.0,0.0,0.0
25%,3.82,492.0,41.0,216.0
50%,4.02,3095.0,207.0,310.0
75%,4.21,13675.0,873.0,400.0
max,5.0,5588580.0,160776.0,14777.0


In [64]:
X = desc_df['book_desc'].fillna(' ').values
y = desc_df['good_books'].values

In [65]:
# v = TfidfVectorizer(decode_error='replace', encoding='utf-8')

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=345)

In [67]:
y_train

array(['good', 'bad', 'good', ..., 'bad', 'bad', 'good'], dtype=object)

In [68]:
nb = make_pipeline(
    CountVectorizer(binary=True),
    TfidfTransformer(),
#     LogisticRegression()
#     RandomForestClassifier(),
    MultinomialNB()
)

In [69]:
nb.fit(X_train, y_train);

In [70]:
y_pred = nb.predict(X_test)

In [71]:
y_pred

array(['good', 'good', 'good', ..., 'good', 'good', 'good'], dtype='<U4')

In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.69      0.22      0.33      5932
        good       0.58      0.92      0.71      7013

    accuracy                           0.60     12945
   macro avg       0.64      0.57      0.52     12945
weighted avg       0.63      0.60      0.54     12945



In [73]:
nb.predict(X)

array(['good', 'good', 'good', ..., 'good', 'good', 'good'], dtype='<U4')

In [74]:
desc_df['pred_books'] = nb.predict(X)
desc_df.head(15)

Unnamed: 0,book_authors,book_desc,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,pages,good_books,pred_books
0,Suzanne Collins,Winning will make you famous. Losing means cer...,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,Young Adult|Fiction|Science Fiction|Dystopia|F...,https://images.gr-assets.com/books/1447303603l...,374,good,good
1,J.K. Rowling|Mary GrandPrÃ©,There is a door at the end of a silent corrido...,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,Fantasy|Young Adult|Fiction,https://images.gr-assets.com/books/1255614970l...,870,good,good
2,Harper Lee,The unforgettable novel of a childhood in a sl...,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,Classics|Fiction|Historical|Historical Fiction...,https://images.gr-assets.com/books/1361975680l...,324,good,good
3,Jane Austen|Anna Quindlen|Mrs. Oliphant|George...,Â«Ã cosa ormai risaputa che a uno scapolo in ...,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,Classics|Fiction|Romance,https://images.gr-assets.com/books/1320399351l...,279,good,bad
4,Stephenie Meyer,About three things I was absolutely positive.F...,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,Young Adult|Fantasy|Romance|Paranormal|Vampire...,https://images.gr-assets.com/books/1361039443l...,498,bad,bad
5,Markus Zusak,Trying to make sense of the horrors of World W...,9780380000000.0,552 pages,4.36,1485632,100821,The Book Thief,Historical|Historical Fiction|Fiction|Young Adult,https://images.gr-assets.com/books/1522157426l...,552,good,good
6,C.S. Lewis|Pauline Baynes,"Journeys to the end of the world, fantastic cr...",9780070000000.0,767 pages,4.25,437829,9439,The Chronicles of Narnia,Fantasy|Classics|Fiction|Young Adult|Childrens,https://images.gr-assets.com/books/1449868701l...,767,good,good
7,George Orwell,ÙØ²Ø±Ø¹Ø© Ø§ÙØ­ÙÙØ§ÙØ§Øª ÙÙ Ø±Ø§Ø¦Ø¹Ø© ...,9780450000000.0,122 pages,3.9,2235084,42156,Animal Farm,Classics|Fiction|Science Fiction|Dystopia|Fant...,https://images.gr-assets.com/books/1424037542l...,122,bad,bad
8,Margaret Mitchell,Gone with the Wind is a novel written by Marga...,9780450000000.0,1037 pages,4.29,969181,17452,Gone with the Wind,Classics|Historical|Historical Fiction|Fiction...,https://images.gr-assets.com/books/1328025229l...,1037,good,good
9,J.R.R. Tolkien,ÙØ¬Ø²Ø¡ Ø§ÙØ«Ø§ÙØ« ÙÙ ÙÙØ­ÙØ© Ø¬ÙÙ Ø...,9780350000000.0,1728 pages,4.59,99793,1652,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,Fantasy|Fiction|Classics,https://images.gr-assets.com/books/1346072396l...,1728,good,bad


In [75]:
dump(nb, "clf.joblib")

['clf.joblib']