In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer

# [Tutorial](https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/)

## Load the Data 

In [2]:
df = pd.read_csv("csv/book_info_complete.csv")

### Keep Description and CSM_Rating

In [3]:
df = df.loc[:, ['description', "csm_rating"]]

In [4]:
def splitter(df):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
    for train_index, test_index in split.split(df, df['csm_rating']):
        train_data= df.loc[train_index]
        test_data = df.loc[test_index]
    
    return train_data, test_data

In [5]:
train_data, test_data = splitter(df)

In [6]:
train_data.shape

(4652, 2)

In [7]:
test_data.shape

(1164, 2)

In [8]:
#instantiate CountVectorizer()
cv=CountVectorizer()

# this steps generates word counts for the words in your docs
word_count_vector=cv.fit_transform(train_data['description'])

In [9]:
# seperate the independent and target variable on training data
train_x = pd.DataFrame(word_count_vector.todense(), columns=cv.get_feature_names())
train_y = train_data['csm_rating']


In [10]:
# seperate the independent and target variable on testing data
#test_x = test_data.drop(columns=['csm_rating'],axis=1)
#test_x = pd.DataFrame(cv.transform(test_x))
test_x = pd.DataFrame(cv.transform(test_data['description']).todense(), columns=cv.get_feature_names())
test_y = test_data['csm_rating']

In [11]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.03,
                max_depth = 10, alpha = 1, n_estimators = 500, booster = "gblinear")

In [12]:
#Fit the model with the training data
xg_reg.fit(train_x, train_y)
#xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=1, base_score=0.5, booster='gblinear',
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3, gamma=None, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.03, max_delta_step=None, max_depth=3,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=0, num_parallel_tree=None,
             objective='reg:squarederror', random_state=0, reg_alpha=1,
             reg_lambda=0, scale_pos_weight=1, subsample=None, tree_method=None,
             validate_parameters=False, verbosity=None)

In [13]:
#Predict the taget on the train data set
predict_train = xg_reg.predict(train_x)
predict_test = xg_reg.predict(test_x)

#preds = xg_reg.predict(X_test)

In [14]:
mean_absolute_error(train_y, predict_train)

3.2821819954459666

In [15]:
mean_absolute_error(test_y, predict_test)

3.276424833999057

### Transform and Predict on the Test DF

In [17]:
#Predict the target on the test dataset
predict_test = xg_reg.predict(test_x)

In [18]:
mean_absolute_error(predict_test, test_y)

3.276424833999057

In [19]:
test_errors = abs(predict_test - test_y)

In [20]:
predictions  = pd.Series(predict_test, index=test_data.index, name="predictions")

In [21]:
test_errors.name = "difference"

In [22]:
test_results = pd.concat([test_data, test_errors, predictions], axis=1)

In [23]:
test_results.head()

Unnamed: 0,description,csm_rating,difference,predictions
1291,Thoughtful sci-fi about the price of beauty.,11,1.838364,9.161636
832,"Appealing tale of would-be witch has laughs, i...",8,1.161636,9.161636
685,"Third Bloodlines adventure takes a romantic, m...",13,3.838365,9.161635
5101,Teen grapples with grief and first love in cha...,12,2.838365,9.161635
2617,Another sweet animal tale from Babe author.,7,2.161635,9.161635


In [24]:
test_results.sort_values("difference", ascending=False).head(50)

Unnamed: 0,description,csm_rating,difference,predictions
1310,Dystopian thriller for adults OK for mature te...,17,7.838365,9.161635
1257,Really racy romance with explicit sex and not ...,17,7.838365,9.161635
4754,"Intense, award-winning fairy tale for mature r...",17,7.838365,9.161635
4277,One-of-a-kind story of a dream fulfilled.,2,7.161637,9.161637
2854,Good book for the end of a hectic day.,2,7.161636,9.161636
2516,"All of Beatrix Potter in one book, pre-K and K.",2,7.161636,9.161636
4525,Presents images of African village life.,2,7.161636,9.161636
1846,Kids will relate to appealing holiday tale.,2,7.161635,9.161635
1484,An amusing potty story.,2,7.161635,9.161635
774,Baby Max struggles to write -- kids will relate.,2,7.161635,9.161635


In [26]:
test_x.shape

(1164, 5618)