In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/evan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# [Tutorial](https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/)

## Load the Data 

In [2]:
df = pd.read_csv("csv/book_info_complete.csv")
df.head()

Unnamed: 0,title,description,plot,csm_review,need_to_know,par_rating,kids_rating,csm_rating,Author,Genre,...,Publisher's recommended age(s),Number of pages,Available on,Last updated,Illustrator,Authors,Awards,Publishers,Award,Illustrators
0,The Third Twin,Gripping thriller skimps on character developm...,"Twins Ava and Alexa ""Lexi"" Rios live in an aff...","THE THIRD TWIN has an interesting, compelling ...",Parents need to know that The Third Twin is a ...,17.0,14.0,12,CJ Omololu,Mystery,...,12 - 18,336.0,"Nook, Hardback, iBooks, Kindle","June 19, 2019",,,,,,
1,Small Damages,Luminous story of pregnant teen's summer in Sp...,"It's the summer of 1996, which 18-year-old Ken...",This could well have been a minefield of clich...,Parents need to know that Small Damages is nar...,,14.0,14,Beth Kephart,Coming of Age,...,14 - 17,304.0,"Nook, Hardback, iBooks, Kindle","May 06, 2019",,,,,,
2,"The School for Good and Evil, Book 1",Fractured fairy tale has plenty of twists for ...,When best friends Sophie and Agatha are stolen...,The School for Good and Evil is no run-of-the-...,Parents need to know that The School for Good ...,11.0,11.0,8,Soman Chainani,Fairy Tale,...,8 - 17,496.0,"Nook, Audiobook (unabridged), Hardback, iBooks...","October 18, 2017",Iacopo Bruno,,,,,
3,"Agent of Chaos: The X-Files Origins, Book 1","Series pictures Mulder as teen, captures essen...","Set in 1979, AGENT OF CHAOS follows a 17-year-...",Popular TV characters don't always make a smoo...,Parents need to know that Agent of Chaos: The ...,,,13,Kami Garcia,Science Fiction,...,14 - 18,320.0,"Nook, Audiobook (abridged), Hardback, iBooks, ...","June 19, 2019",,,,,,
4,Crossing Ebenezer Creek,Heartbreaking novel follows freed slaves on Sh...,CROSSING EBENEZER CREEK is a YA novel from awa...,"Beautifully written and poetically rendered, t...",Parents need to know that Crossing Ebenezer Cr...,,,13,Tonya Bolden,Historical Fiction,...,,240.0,"Nook, Audiobook (unabridged), Hardback, Kindle","January 18, 2019",,,,,,


### Keep Description and CSM_Rating

In [3]:
df = df.loc[:, ['description', "csm_rating"]]

### Lower and remove stop words

In [4]:
df.loc[:, "description"] = df.loc[:, "description"].str.lower()
df['description'] = df['description'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))

### Make the split

In [5]:
def splitter(df):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
    for train_index, test_index in split.split(df, df['csm_rating']):
        train_data= df.loc[train_index]
        test_data = df.loc[test_index]
    
    return train_data, test_data

In [6]:
train_data, test_data = splitter(df)

In [7]:
train_data.shape

(4652, 2)

In [8]:
test_data.shape

(1164, 2)

In [9]:
#instantiate CountVectorizer()
cv=CountVectorizer(min_df=2)

# this step generates word counts for the words in your docs
word_count_vector=cv.fit_transform(train_data['description'])

In [10]:
# seperate the independent and target variable on training data
train_x = pd.DataFrame(word_count_vector.todense(), columns=cv.get_feature_names())
train_y = train_data['csm_rating']

In [11]:
test_x = pd.DataFrame(cv.transform(test_data['description']).todense(), columns=cv.get_feature_names())
test_y = test_data['csm_rating']

In [12]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.03,
                max_depth = 100, alpha = 1, n_estimators = 500, booster = "gblinear")

In [13]:
#Fit the model with the training data
xg_reg.fit(train_x, train_y)

XGBRegressor(alpha=1, base_score=0.5, booster='gblinear',
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3, gamma=None, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.03, max_delta_step=None, max_depth=100,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=0, num_parallel_tree=None,
             objective='reg:squarederror', random_state=0, reg_alpha=1,
             reg_lambda=0, scale_pos_weight=1, subsample=None, tree_method=None,
             validate_parameters=False, verbosity=None)

In [14]:
#Predict the taget on the train data set
predict_train = xg_reg.predict(train_x)
predict_test = xg_reg.predict(test_x)

In [15]:
mean_absolute_error(train_y, predict_train)

3.282182013281234

In [16]:
mean_absolute_error(test_y, predict_test)

3.276424827444594

### Transform and Predict on the Test DF

In [17]:
#Predict the target on the test dataset
predict_test = xg_reg.predict(test_x)

In [18]:
mean_absolute_error(predict_test, test_y)

3.276424827444594

In [19]:
test_errors = abs(predict_test - test_y)

In [20]:
predictions  = pd.Series(predict_test, index=test_data.index, name="predictions")

In [21]:
test_errors.name = "difference"

In [22]:
test_results = pd.concat([test_data, test_errors, predictions], axis=1)

In [23]:
test_results.head()

Unnamed: 0,description,csm_rating,difference,predictions
1291,thoughtful sci-fi price beauty.,11,1.838365,9.161635
832,"appealing tale would-be witch laughs, insights.",8,1.161635,9.161635
685,"third bloodlines adventure takes romantic, mag...",13,3.838365,9.161635
5101,teen grapples grief first love charming romance.,12,2.838365,9.161635
2617,another sweet animal tale babe author.,7,2.161635,9.161635


In [24]:
test_results.predictions.unique()

array([9.1616354])