In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import MinMaxScaler


import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# [Tutorial](https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/)

## Load the Data 

In [2]:
# Load the book metadata and keep only the columns that have no missing values.
csv_path = "csv/book_info_complete.csv"
df = pd.read_csv(csv_path).dropna(axis=1)

# "Publication date" is a string; its last four characters are cast to int.
# NOTE(review): presumably those four characters are the year — confirm against the CSV.
pub_year = df["Publication date"].str[-4:]
df["Publication date"] = pub_year.astype(int)

# "Last updated" is not used by the rest of the notebook.
df = df.drop("Last updated", axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5816 entries, 0 to 5815
Data columns (total 9 columns):
title               5816 non-null object
description         5816 non-null object
plot                5816 non-null object
csm_review          5816 non-null object
need_to_know        5816 non-null object
csm_rating          5816 non-null int64
Genre               5816 non-null object
Book type           5816 non-null object
Publication date    5816 non-null int64
dtypes: int64(2), object(7)
memory usage: 409.1+ KB


### Lowercase and remove stop words

In [3]:
# Free-text columns that get stop-word filtering before vectorisation.
columns = ['title', "description", "plot", "csm_review", "need_to_know"]

# Set membership is O(1); checking each token against the stop-word list
# rescans the whole list for every token of every cell.
stop_words = set(stop)

# Lowercase every string cell; non-string cells (the integer columns) pass through.
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Split on whitespace, drop stop words, and re-join with single spaces.
df[columns] = df[columns].applymap(
    lambda x: ' '.join(item for item in x.split() if item not in stop_words)
)

In [4]:
# Preview the first rows after lowercasing and stop-word removal.
df.head()

Unnamed: 0,title,description,plot,csm_review,need_to_know,csm_rating,Genre,Book type,Publication date
0,third twin,gripping thriller skimps character development.,"twins ava alexa ""lexi"" rios live affluent sout...","third twin interesting, compelling premise: bo...",parents need know third twin murder mystery in...,12,mystery,fiction,2015
1,small damages,luminous story pregnant teen's summer spain.,"summer 1996, 18-year-old kenzie planned spend ...","could well minefield clichés nd preachiness, l...",parents need know small damages narrated pregn...,14,coming of age,fiction,2012
2,"school good evil, book 1",fractured fairy tale plenty twists fantasy fans.,best friends sophie agatha stolen away village...,school good evil run-of-the-mill fairy tale sp...,parents need know school good evil fresh take ...,8,fairy tale,fiction,2013
3,"agent chaos: x-files origins, book 1","series pictures mulder teen, captures essence ...","set 1979, agent chaos follows 17-year-old fox ...",popular tv characters always make smooth trans...,parents need know agent chaos: x-files origins...,13,science fiction,fiction,2017
4,crossing ebenezer creek,heartbreaking novel follows freed slaves sherm...,crossing ebenezer creek ya novel award-winning...,"beautifully written poetically rendered, histo...","parents need know crossing ebenezer creek, ton...",13,historical fiction,fiction,2017


### Make the split

In [5]:
def splitter(df):
    """Split ``df`` into stratified train and test frames.

    A single 80/20 ``StratifiedShuffleSplit`` (``random_state=999``) is drawn,
    stratified on the ``csm_rating`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``csm_rating`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``; both keep the index labels of ``df``.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
    # n_splits=1, so there is exactly one (train, test) pair to take.
    train_index, test_index = next(split.split(df, df['csm_rating']))
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc only gives the same rows when the index is a default RangeIndex.
    train_data = df.iloc[train_index]
    test_data = df.iloc[test_index]
    return train_data, test_data

In [6]:
# Draw the stratified 80/20 split once; every later cell reads these two frames.
train_data, test_data = splitter(df)

In [7]:
# (rows, columns) of the training split.
train_data.shape

(4652, 9)

In [8]:
# (rows, columns) of the test split.
test_data.shape

(1164, 9)

## One Hot Encode

### Book Type

In [9]:
# Frequency of each book type in the training split.
train_data["Book type"].value_counts()

fiction        4238
non-fiction     414
Name: Book type, dtype: int64

In [10]:
# One-hot encode "Book type". The encoder is fitted on the training split only
# and then reused for the test split; handle_unknown="ignore" keeps transform
# from raising on categories that were not seen during fitting.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in
# newer scikit-learn releases — confirm the installed version before upgrading.
book_type_encoder = OneHotEncoder(handle_unknown="ignore")
book_type_cat = train_data[["Book type"]]
book_type_cat_1hot = book_type_encoder.fit_transform(book_type_cat)
book_type_columns = book_type_encoder.get_feature_names()

train_book_type_df = pd.DataFrame(
    data=book_type_cat_1hot.todense(),
    index=train_data.index,
    columns=book_type_columns,
)

# The test split is only transformed, never fitted on.
test_book_type_1hot = book_type_encoder.transform(test_data[["Book type"]])
test_book_type_df = pd.DataFrame(
    data=test_book_type_1hot.todense(),
    index=test_data.index,
    columns=book_type_columns,
)

### Genre

In [11]:
# Plot how often each genre occurs in the training split, with x labels rotated
# 90 degrees; the trailing semicolon suppresses the returned object's repr.
train_data["Genre"].hist(xrot=90);

In [12]:
# One-hot encode "Genre" the same way as "Book type": fit on the training split,
# reuse the fitted encoder for the test split. handle_unknown='ignore' keeps
# transform from raising on genres that were not seen during fitting.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in
# newer scikit-learn releases — confirm the installed version before upgrading.
book_genre_encoder = OneHotEncoder(handle_unknown='ignore')
book_genre_cat = train_data[["Genre"]]
book_genre_cat_1hot = book_genre_encoder.fit_transform(book_genre_cat)
book_genre_columns = book_genre_encoder.get_feature_names()

train_book_genre_df = pd.DataFrame(
    book_genre_cat_1hot.todense(),
    index=train_data.index,
    columns=book_genre_columns,
)

# The test split is only transformed, never fitted on.
test_book_genre_1hot = book_genre_encoder.transform(test_data[["Genre"]])
test_book_genre_df = pd.DataFrame(
    data=test_book_genre_1hot.todense(),
    index=test_data.index,
    columns=book_genre_columns,
)

## MinMaxScale Publication Date

In [13]:
# Scale the publication year with a MinMaxScaler fitted on the training split
# only, then apply the same fitted scaler to the test split.
book_pub_year = train_data[["Publication date"]]
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(book_pub_year)

bp_year_MM = min_max_scaler.transform(book_pub_year)
train_bp_year_df = pd.DataFrame(
    data=bp_year_MM,
    index=train_data.index,
    columns=["book_pub_year"],
)

test_bp_year_MM = min_max_scaler.transform(test_data[["Publication date"]])
test_bp_year_df = pd.DataFrame(
    data=test_bp_year_MM,
    index=test_data.index,
    columns=["book_pub_year"],
)

## Create Bag-of-Words (BOW) Features

In [14]:
def generate_bow(x):
    """Build bag-of-words count features for one text column.

    A ``CountVectorizer(min_df=2)`` is fitted on ``train_data[x]`` only and
    then applied to ``test_data[x]``, so both returned frames share the same
    columns (the training vocabulary).

    Parameters
    ----------
    x : str
        Name of the text column to vectorise. It is read from the
        notebook-level ``train_data`` and ``test_data`` frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_bow, test_bow)``, indexed like ``train_data`` and
        ``test_data`` respectively, with one ``"<x>_<token>"`` column per
        vocabulary term.
    """
    column = str(x)
    cv = CountVectorizer(min_df=2)
    # Learn the vocabulary from the training text only, then count the same
    # terms in the test text.
    train_counts = cv.fit_transform(train_data[column])
    test_counts = cv.transform(test_data[column])

    # Prefix each token with the source column so that frames built from
    # different text fields do not produce duplicate feature names when they
    # are concatenated side by side.
    feature_names = ["{}_{}".format(column, token) for token in cv.get_feature_names()]

    # Reuse the source frames' index: the splits keep their original row
    # labels, and pd.concat(axis=1) aligns on the index, so a fresh 0..n-1
    # index would misalign these rows against the other feature frames.
    train_bow = pd.DataFrame(train_counts.todense(), index=train_data.index, columns=feature_names)
    test_bow = pd.DataFrame(test_counts.todense(), index=test_data.index, columns=feature_names)

    return train_bow, test_bow

In [15]:
# Build train/test bag-of-words frames for each free-text column.
# Only the csm_review and need_to_know frames are used when train_x and test_x
# are assembled; the description, title and plot frames are currently unused.
train_description_df, test_description_df  = generate_bow('description')
train_title_df, test_title_df = generate_bow("title")
train_plot_df, test_plot_df = generate_bow("plot")
train_csm_review_df, test_csm_review_df = generate_bow("csm_review")
train_need_to_know_df, test_need_to_know_df = generate_bow("need_to_know")

## Create Train and Test Data Frames

In [16]:
# Training design matrix: review and need-to-know word counts, the scaled
# publication year, and the one-hot genre and book type, joined column-wise.
train_feature_frames = [
    train_csm_review_df,
    train_need_to_know_df,
    train_bp_year_df,
    train_book_genre_df,
    train_book_type_df,
]
train_x = pd.concat(objs=train_feature_frames, axis=1)

# Regression target for the training rows.
train_y = train_data['csm_rating']

In [17]:
# Test design matrix, built from the test-side frames only and in the same
# column order as train_x. The publication-year and genre frames must be the
# test versions: using the train frames here would put training rows into the
# test matrix.
test_x = pd.concat(objs=[test_csm_review_df, test_need_to_know_df, test_bp_year_df, test_book_genre_df,
                        test_book_type_df],
                    axis=1)

# Regression target for the test rows.
test_y = test_data['csm_rating']

In [18]:
# Hyperparameters for the regressor, gathered in one place so they are easy to tune.
# NOTE(review): booster="gblinear" selects the linear booster; per the XGBoost
# parameter docs colsample_bytree and max_depth are tree-booster settings, so
# they presumably have no effect here — confirm whether "gbtree" was intended.
xgb_params = {
    "objective": 'reg:squarederror',
    "colsample_bytree": 0.3,
    "learning_rate": 0.03,
    "max_depth": 100,
    "alpha": 1,
    "n_estimators": 500,
    "booster": "gblinear",
}
xg_reg = xgb.XGBRegressor(**xgb_params)

In [None]:
# Fit the regressor on the training features and the training ratings
xg_reg.fit(train_x, train_y)

In [None]:
# Predict the target on both the training and the test feature matrices
predict_train = xg_reg.predict(train_x)
predict_test = xg_reg.predict(test_x)

In [None]:
# Mean absolute error on the training split.
mean_absolute_error(train_y, predict_train)

In [None]:
# Mean absolute error on the held-out test split.
mean_absolute_error(test_y, predict_test)

### Transform and Predict on the Test DF

In [None]:
# Predict the target on the test dataset (the same call as in the earlier prediction cell)
predict_test = xg_reg.predict(test_x)

In [None]:
# Test MAE again, with the arguments swapped; absolute error is symmetric, so
# the argument order does not change the value.
mean_absolute_error(predict_test, test_y)

In [None]:
# Absolute difference between each test prediction and its true rating.
test_errors = abs(predict_test - test_y)

In [None]:
# Wrap the predictions in a Series that carries the test rows' index labels.
predictions  = pd.Series(predict_test, index=test_data.index, name="predictions")

In [None]:
# Name the error Series so it shows up as the "difference" column once concatenated.
test_errors.name = "difference"

In [None]:
# Put the test rows side by side with their absolute error and prediction.
test_results = pd.concat([test_data, test_errors, predictions], axis=1)

In [None]:
# Preview the first rows of the combined results.
test_results.head()

In [None]:
# Distinct predicted values, as a quick check on how varied the predictions are.
test_results.predictions.unique()