In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/evan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# [Tutorial](https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/)

## Load the Data 

In [2]:
df = pd.read_csv("csv/book_info_complete.csv")

### Keep Description and CSM_Rating

In [3]:
df = df.loc[:, ['description', "csm_rating"]]

### Make the split

In [4]:
def splitter(df):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
    for train_index, test_index in split.split(df, df['csm_rating']):
        train_data= df.loc[train_index]
        test_data = df.loc[test_index]
    
    return train_data, test_data

In [5]:
train_data, test_data = splitter(df)

In [6]:
train_data.shape

(4652, 2)

In [7]:
test_data.shape

(1164, 2)

## Instantiate BOW

In [8]:
# Remove stop words and only keep tokens with count of 2
cv=CountVectorizer(stop_words=stop, min_df=2)

In [9]:
# seperate the independent and target variable on training data
train_x = pd.DataFrame(cv.fit_transform(train_data['description']).todense(), columns=cv.get_feature_names())
train_y = train_data['csm_rating']

In [10]:
test_x = pd.DataFrame(cv.transform(test_data['description']).todense(), columns=cv.get_feature_names())
test_y = test_data['csm_rating']

In [11]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.03,
                max_depth = 100, alpha = 1, n_estimators = 500, booster = "gblinear")

In [12]:
#Fit the model with the training data
xg_reg.fit(train_x, train_y)

XGBRegressor(alpha=1, base_score=0.5, booster='gblinear',
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3, gamma=None, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.03, max_delta_step=None, max_depth=100,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=0, num_parallel_tree=None,
             objective='reg:squarederror', random_state=0, reg_alpha=1,
             reg_lambda=0, scale_pos_weight=1, subsample=None, tree_method=None,
             validate_parameters=False, verbosity=None)

In [13]:
#Predict the taget on the train data set
predict_train = xg_reg.predict(train_x)
predict_test = xg_reg.predict(test_x)

In [14]:
mean_absolute_error(train_y, predict_train)

3.282182013281234

In [15]:
mean_absolute_error(test_y, predict_test)

3.276424827444594

### Predict on the Test DF

In [16]:
#Predict the target on the test dataset
predict_test = xg_reg.predict(test_x)

In [17]:
mean_absolute_error(predict_test, test_y)

3.276424827444594

In [18]:
test_errors = abs(predict_test - test_y)

In [19]:
predictions  = pd.Series(predict_test, index=test_data.index, name="predictions")
test_errors.name = "difference"
test_results = pd.concat([test_data, test_errors, predictions], axis=1)

In [20]:
test_results.head()

Unnamed: 0,description,csm_rating,difference,predictions
1291,Thoughtful sci-fi about the price of beauty.,11,1.838365,9.161635
832,"Appealing tale of would-be witch has laughs, i...",8,1.161635,9.161635
685,"Third Bloodlines adventure takes a romantic, m...",13,3.838365,9.161635
5101,Teen grapples with grief and first love in cha...,12,2.838365,9.161635
2617,Another sweet animal tale from Babe author.,7,2.161635,9.161635


In [21]:
test_results.predictions.unique()

array([9.1616354])

## [TFIDF](https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/)

In [22]:
# create the transform
vectorizer = TfidfVectorizer(stop_words=stop, min_df=2)

In [23]:
# seperate the independent and target variable on training data
train_x_idf = pd.DataFrame(vectorizer.fit_transform(train_data["description"]).todense(), columns=vectorizer.get_feature_names())
train_y_idf = train_data['csm_rating']

In [24]:
test_x_idf = pd.DataFrame(vectorizer.transform(test_data['description']).todense(), columns=cv.get_feature_names())
test_y_idf = test_data['csm_rating']

In [25]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.3,
                max_depth = 100, alpha = 1, n_estimators = 500, booster = "gblinear")

#Fit the model with the training data
xg_reg.fit(train_x_idf, train_y_idf)

XGBRegressor(alpha=1, base_score=0.5, booster='gblinear',
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3, gamma=None, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.3, max_delta_step=None, max_depth=100,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=500, n_jobs=0, num_parallel_tree=None,
             objective='reg:squarederror', random_state=0, reg_alpha=1,
             reg_lambda=0, scale_pos_weight=1, subsample=None, tree_method=None,
             validate_parameters=False, verbosity=None)

In [26]:
#Predict the taget on the train data set
predict_train_idf = xg_reg.predict(train_x_idf)
predict_test_idf = xg_reg.predict(test_x_idf)

In [27]:
mean_absolute_error(train_y_idf, predict_train_idf)

3.282182597540005

In [28]:
mean_absolute_error(test_y_idf, predict_test_idf)

3.2764254173462333

In [29]:
test_errors_idf = abs(predict_test_idf - test_y_idf)
predictions  = pd.Series(predict_test_idf, index=test_data.index, name="predictions")
test_errors.name = "difference"
test_results_idf = pd.concat([test_data, test_errors_idf, predictions], axis=1)

In [30]:
test_results_idf.predictions.unique()

array([9.1616497])