In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import CountVectorizer

# [Tutorial](https://hackernoon.com/want-a-complete-guide-for-xgboost-model-in-python-using-scikit-learn-sc11f31bq)

## Load the Data 

In [2]:
df = pd.read_csv("csv/book_info_complete.csv")

### Keep Description and CSM_Rating

In [3]:
df = df.loc[:, ['description', "csm_rating"]]

In [4]:
def splitter(df):
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=999)
    for train_index, test_index in split.split(df, df['csm_rating']):
        train_data= df.loc[train_index]
        test_data = df.loc[test_index]
    
    return train_data, test_data

In [5]:
train_data, test_data = splitter(df)

In [6]:
train_data.shape

(4652, 2)

In [7]:
test_data.shape

(1164, 2)

In [8]:
#instantiate CountVectorizer()
cv=CountVectorizer()

# this steps generates word counts for the words in your docs
word_count_vector=cv.fit_transform(train_data['description'])

In [9]:
# seperate the independent and target variable on training data
train_x = pd.DataFrame(word_count_vector.todense(), columns=cv.get_feature_names())
train_y = train_data['csm_rating']

In [10]:
# seperate the independent and target variable on testing data
test_x = test_data.drop(columns=['csm_rating'],axis=1)
test_y = test_data['csm_rating']

In [11]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.3,
                max_depth = 3, alpha = 10, n_estimators = 120, booster = "gblinear")

In [12]:
#Fit the model with the training data
xg_reg.fit(train_x, train_y)

XGBRegressor(alpha=10, base_score=0.5, booster='gblinear',
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.3, gamma=None, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.3, max_delta_step=None, max_depth=3,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=120, n_jobs=0, num_parallel_tree=None,
             objective='reg:squarederror', random_state=0, reg_alpha=10,
             reg_lambda=0, scale_pos_weight=1, subsample=None, tree_method=None,
             validate_parameters=False, verbosity=None)

In [13]:
#Predict the taget on the train data set
predict_train = xg_reg.predict(train_x)

In [14]:
mean_absolute_error(train_y, predict_train)

3.282182597540005

### Transform and Predict on the Test DF

In [15]:
word_count_vector2 = cv.transform(test_x["description"])

In [16]:
test_x = pd.DataFrame(word_count_vector2.todense(), columns=cv.get_feature_names())

In [17]:
#Predict the target on the test dataset
predict_test = xg_reg.predict(test_x)

In [18]:
mean_absolute_error(predict_test, test_y)

3.2764254173462333