## 19_Model_XGBoost_Scraped_Features

Author: Daniel Hui

License: MIT

This notebook combines the core book features with additional features scraped from the online card catalog, and uses them to train an XGBoost model (with a logistic regression fitted first for comparison).

In [1]:
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the "fivethirtyeight" matplotlib style sheet to plots.
plt.style.use('fivethirtyeight')

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Display settings: show up to 500 columns and render floats with 4 decimals.
pd.options.display.max_columns = 500
pd.options.display.float_format = '{:.4f}'.format

### Global Variables
`target_range` names the data set / target window to load. Change its value (or comment alternative values in and out) to test different target ranges.

In [2]:
# Data-set selector: interpolated into the merged-data CSV file name when the main dataset is loaded.
target_range = "18_Half"

### Load Main Dataset

In [3]:
# Load the merged target/feature table for the selected target range;
# the first CSV column is used as the row index.
checkout_target_df = pd.read_csv(f'../01_Data/07_Merged_Target_Feature_Data/{target_range}_set.csv',index_col=0)

In [4]:
# Peek at the first three rows to sanity-check the load.
checkout_target_df.head(3)

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
0,3177276,0,1,0,2016,0,0,1,0,0,0,0,0,0,0,0,14,45,78,94,17,18,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,1,1,1,0,1,0,1,1,1,0,1
1,395432,0,1,1,1985,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,123754,1,1,0,1945,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
checkout_target_df.describe()

Unnamed: 0,BibNum,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,30 Days,90 Days,180 Days,365 Days,Branches,Copies,bal,bea,bro,cap,cen,col,dlr,dth,fre,glk,gwd,hip,idc,lcy,mag,mgm,mon,net,nga,nhy,qna,rbe,spa,swt,uni,wal,wts
count,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0,397147.0
mean,2234258.6919,0.4945,0.9982,0.8704,1983.6751,0.2787,0.0825,0.633,0.0375,0.0212,0.0386,0.1546,0.0472,0.024,0.0191,0.032,0.7237,2.3485,4.8724,9.6092,2.4991,2.8845,0.1122,0.0724,0.0847,0.0708,0.896,0.0652,0.0327,0.095,0.0347,0.0525,0.1073,0.0404,0.0234,0.1006,0.0491,0.02,0.0331,0.1286,0.0692,0.0249,0.0464,0.0653,0.0317,0.1011,0.0575,0.015,0.0691
std,953967.649,0.5,0.0423,0.3359,189.6728,0.4484,0.2751,0.482,0.1899,0.1439,0.1927,0.3616,0.2121,0.1531,0.1368,0.176,5.128,12.2969,20.5221,34.4055,3.0947,4.7797,0.3156,0.2591,0.2785,0.2565,0.3053,0.2469,0.1778,0.2933,0.183,0.2231,0.3095,0.197,0.151,0.3008,0.2161,0.1401,0.1789,0.3348,0.2537,0.156,0.2104,0.2471,0.1752,0.3015,0.2327,0.1214,0.2537
min,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1848333.5,0.0,1.0,1.0,1996.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2554562.0,0.0,1.0,1.0,2007.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2957797.5,1.0,1.0,1.0,2013.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3343666.0,1.0,1.0,1.0,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,585.0,1436.0,1613.0,2621.0,27.0,290.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Keep only the features connected to the book itself, and remove collection and Seattle features

In [6]:
# Keep the record id (BibNum), the target (Checkout) and the book-level columns only.
# The checkout-window counts (30/90/180/365 Days), Branches, Copies and the
# three-letter location columns are left out. Column order is preserved.
checkout_target_df = checkout_target_df.loc[:, [
    "BibNum", "Checkout",
    "Title", "ISBN", "Year",
    "Fiction", "Language", "Nonfiction", "Biography",
    "Large Print", "Picture", "Children", "Teen",
    "Mystery", "AfAm", "Comic",
]]

In [7]:
# Confirm the retained columns, their dtypes and that none contain nulls.
checkout_target_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397147 entries, 0 to 397146
Data columns (total 16 columns):
BibNum         397147 non-null int64
Checkout       397147 non-null int64
Title          397147 non-null int64
ISBN           397147 non-null int64
Year           397147 non-null int64
Fiction        397147 non-null int64
Language       397147 non-null int64
Nonfiction     397147 non-null int64
Biography      397147 non-null int64
Large Print    397147 non-null int64
Picture        397147 non-null int64
Children       397147 non-null int64
Teen           397147 non-null int64
Mystery        397147 non-null int64
AfAm           397147 non-null int64
Comic          397147 non-null int64
dtypes: int64(16)
memory usage: 51.5 MB


In [8]:
# Row count of the main dataset, for comparison after the merges below.
len(checkout_target_df)

397147

### Load Inventory Data Set
This is needed only for the ISBNs, and only for the locations I am interested in.

In [9]:
# Load the cleaned January 2018 collection inventory; the first CSV column is used as the row index.
isbn_df = pd.read_csv('../01_Data/03_Cleaned/Library_Collection_Inventory_jan_2018_clean.csv',index_col=0)

In [10]:
# Reduce the inventory to one row per distinct (BibNum, ISBN) pair and keep
# only those two columns.
isbn_df = isbn_df.drop_duplicates(subset=["BibNum", "ISBN"])[["BibNum", "ISBN"]]
isbn_df.head()

Unnamed: 0,BibNum,ISBN
0,3177276,
1,395432,812056744.0
2,123754,
3,193328,
4,1764894,573696306.0


In [11]:
# Summary statistics; only BibNum appears in the saved output, i.e. ISBN is not a numeric column.
isbn_df.describe()

Unnamed: 0,BibNum
count,397147.0
mean,2234258.6919
std,953967.649
min,7.0
25%,1848333.5
50%,2554562.0
75%,2957797.5
max,3343666.0


In [12]:
# Row count after de-duplication.
# NOTE(review): the previous "30,928 books" remark did not match the saved output (397,147).
len(isbn_df)

397147

### Load Scraped Book Data

In [13]:
# Load the cleaned scraped book data; the first CSV column is used as the row index.
scrape_df = pd.read_csv('../01_Data/03_Cleaned/Clean_Book_Data_Random.csv',index_col=0)

In [14]:
# Peek at the first five scraped rows.
scrape_df.head()

Unnamed: 0,ISBN,page,dim,avg_rating,tot_ratings,tot_reviews
0,1620401371,0,0,3.7,4.0,1.0
1,985673486,0,0,0.0,0.0,0.0
2,1618101110,0,0,0.0,0.0,0.0
3,375864326,0,0,7.8,16.0,0.0
4,792271351,0,0,0.0,0.0,0.0


Make total reviews and total ratings integers, and rename the `ISBN` column to `isbn` so it matches the join key used in the merge below.

In [15]:
# Cast the two count columns to integers and rename the key column to
# lower-case "isbn", the name used as the join key for the scraped data.
scrape_df = (
    scrape_df
    .astype({"tot_ratings": int, "tot_reviews": int})
    .rename(columns={"ISBN": "isbn"})
)
scrape_df.head()

Unnamed: 0,isbn,page,dim,avg_rating,tot_ratings,tot_reviews
0,1620401371,0,0,3.7,4,1
1,985673486,0,0,0.0,0,0
2,1618101110,0,0,0.0,0,0
3,375864326,0,0,7.8,16,0
4,792271351,0,0,0.0,0,0


In [16]:
# Check dtypes and non-null counts after the cast and rename.
scrape_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3103 entries, 0 to 3102
Data columns (total 6 columns):
isbn           3103 non-null object
page           3103 non-null int64
dim            3103 non-null int64
avg_rating     3103 non-null float64
tot_ratings    3103 non-null int64
tot_reviews    3103 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 169.7+ KB


In [17]:
# Summary statistics of the numeric scraped columns.
# NOTE(review): in the saved output "page" and "dim" are 0 for every row.
scrape_df.describe()

Unnamed: 0,page,dim,avg_rating,tot_ratings,tot_reviews
count,3103.0,3103.0,3103.0,3103.0,3103.0
mean,0.0,0.0,4.2853,16.6784,1.572
std,0.0,0.0,3.8711,93.8477,9.1776
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,6.0,1.0,0.0
75%,0.0,0.0,7.9,8.0,1.0
max,0.0,0.0,10.0,3552.0,351.0


### Merge 1: Main Set + ISBNs

In [18]:
# Left-join the feature table onto the BibNum/ISBN list, so the result is
# limited to the records (branch locations) present in that list.
# Both frames carry an "ISBN" column, so pandas suffixes them: the inventory's
# ISBN value (_x) becomes "isbn", and the feature table's 0/1 ISBN flag (_y)
# keeps the name "ISBN".
merge_df = (
    isbn_df
    .merge(checkout_target_df, how="left", on="BibNum")
    .rename(columns={"ISBN_x": "isbn", "ISBN_y": "ISBN"})
)
merge_df.head()

Unnamed: 0,BibNum,isbn,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic
0,3177276,,0,1,0,2016,0,0,1,0,0,0,0,0,0,0,0
1,395432,812056744.0,0,1,1,1985,0,0,1,0,0,0,0,0,0,0,0
2,123754,,1,1,0,1945,1,0,0,0,0,0,0,0,0,0,0
3,193328,,1,1,0,1962,0,0,1,0,0,0,0,0,0,0,0
4,1764894,573696306.0,1,1,1,1997,0,0,1,0,0,0,0,0,0,0,0


In [19]:
# Row count after the first merge.
# NOTE(review): the previous "Should be 30,928" remark did not match the saved output (397,147).
len(merge_df)

397147

### Merge 2: + Scraped Data

In [20]:
# Left-join the scraped columns onto the merged frame by "isbn"; rows without a
# scraped match get NaN in the scraped columns.
# NOTE(review): this cell overwrites merge_df with a merge of itself, so re-running
# it without re-running the previous merge would join the scraped columns a second time.
merge_df = pd.merge(merge_df, scrape_df, how="left", on="isbn")
merge_df.head()

Unnamed: 0,BibNum,isbn,Checkout,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,page,dim,avg_rating,tot_ratings,tot_reviews
0,3177276,,0,1,0,2016,0,0,1,0,0,0,0,0,0,0,0,,,,,
1,395432,812056744.0,0,1,1,1985,0,0,1,0,0,0,0,0,0,0,0,,,,,
2,123754,,1,1,0,1945,1,0,0,0,0,0,0,0,0,0,0,,,,,
3,193328,,1,1,0,1962,0,0,1,0,0,0,0,0,0,0,0,,,,,
4,1764894,573696306.0,1,1,1,1997,0,0,1,0,0,0,0,0,0,0,0,,,,,


In [21]:
# Check how many rows picked up scraped values (non-null counts of the scraped columns).
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 397147 entries, 0 to 397146
Data columns (total 22 columns):
BibNum         397147 non-null int64
isbn           345671 non-null object
Checkout       397147 non-null int64
Title          397147 non-null int64
ISBN           397147 non-null int64
Year           397147 non-null int64
Fiction        397147 non-null int64
Language       397147 non-null int64
Nonfiction     397147 non-null int64
Biography      397147 non-null int64
Large Print    397147 non-null int64
Picture        397147 non-null int64
Children       397147 non-null int64
Teen           397147 non-null int64
Mystery        397147 non-null int64
AfAm           397147 non-null int64
Comic          397147 non-null int64
page           3115 non-null float64
dim            3115 non-null float64
avg_rating     3115 non-null float64
tot_ratings    3115 non-null float64
tot_reviews    3115 non-null float64
dtypes: float64(5), int64(16), object(1)
memory usage: 69.7+ MB


In [22]:
# Keep only the books that were scraped: rows without a scraped match have NaN
# in the scraped columns (here "dim") after the left merge.
merge_df = merge_df.dropna(subset=["dim"])

### Train / Validation / Test Split

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [24]:
# Target: the binary Checkout flag. Features: everything except the target and
# the two identifier columns (BibNum, isbn).
# NOTE(review): the scraped "page" and "dim" columns are 0 for every scraped row
# in the saved describe() output, so they add no signal as features.
y = merge_df["Checkout"]
X = merge_df.drop(columns=["Checkout", "BibNum", "isbn"])

# Step 1: reserve 20% of the rows as the final test set
# (random_state stays at 20 to be consistent).
X, X_test, y, y_test = train_test_split(
    X, y, test_size=.2, random_state=20
)

# Step 2: split the remaining 80% into 75% train / 25% validation,
# so validation is also 20% of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=.25, random_state=50
)

In [26]:
# Sizes of the train, validation and test splits, in that order.
print(*map(len, (X_train, X_val, X_test)))

1869 623 623


In [27]:
# Inspect the feature columns that go into the models.
X_train.head()

Unnamed: 0,Title,ISBN,Year,Fiction,Language,Nonfiction,Biography,Large Print,Picture,Children,Teen,Mystery,AfAm,Comic,page,dim,avg_rating,tot_ratings,tot_reviews
30945,1,1,2017,1,0,0,0,0,0,0,1,0,0,1,0.0,0.0,1.0,1.0,0.0
304516,1,1,1988,0,0,1,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0
148562,1,1,2012,0,1,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0
271223,1,1,2005,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,10.0,1.0,0.0
259647,1,1,2006,0,0,1,1,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0


### Logistic Regression

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [29]:
# Standardise the features using statistics learned from the training split only;
# the same fitted scaler is reused for the validation split.
# The frame mixes int64 flag columns with float64 scraped columns, so cast to
# float64 up front: StandardScaler converts to float anyway, and older
# scikit-learn releases emit a DataConversionWarning for that implicit cast.
std_scale = StandardScaler()
X_train_scaled = std_scale.fit_transform(X_train.astype("float64"))

# C is the inverse regularisation strength, so C=10000 makes the penalty very weak.
lr_model = LogisticRegression(C=10000)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [30]:
# Fit the logistic regression on the standardised training features.
lr_model.fit(X_train_scaled,y_train)

LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
# Logistic-regression metrics on the training split.
y_train_predict = lr_model.predict(X_train_scaled)
for metric_label, metric_fn in (
    ("Train Accuracy: ", accuracy_score),
    ("Train Recall: ", recall_score),
    ("Train Precision: ", precision_score),
    ("Train f1: ", f1_score),
):
    print(metric_label, metric_fn(y_train, y_train_predict))

Train Accuracy:  0.7169609416800428
Train Recall:  0.6350832266325224
Train Precision:  0.6702702702702703
Train f1:  0.6522024983563445


In [32]:
# Scale the validation split with the scaler fitted on the training data (no refit).
# Cast to float64 first so StandardScaler has no implicit int64 -> float64
# conversion to warn about on older scikit-learn releases.
X_val_scaled = std_scale.transform(X_val.astype("float64"))
y_val_pred = lr_model.predict(X_val_scaled)

# Logistic-regression metrics on the validation split.
for metric_label, metric_fn in (
    ("Accuracy: ", metrics.accuracy_score),
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1: ", metrics.f1_score),
):
    print(metric_label, metric_fn(y_val, y_val_pred))

Accuracy:  0.7367576243980738
Precision:  0.7416666666666667
Recall:  0.6357142857142857
F1:  0.6846153846153846


  """Entry point for launching an IPython kernel.


In [33]:
# Confusion matrix of the logistic-regression predictions on the validation split.
confusion_matrix(y_val, y_val_pred)

array([[281,  62],
       [102, 178]])

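**Logistic vs. XGBoost conclusion:** XGBoost (fitted in the next section) performed notably better than logistic regression on the validation set: accuracy 0.790 vs. 0.737 and F1 0.750 vs. 0.685.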

### XGBoost

In [34]:
import xgboost as xgb

In [35]:
# Gradient-boosted tree classifier for the binary Checkout target.
# n_estimators is only an upper bound: the fit below uses early stopping.
gbm = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=30000,
    learning_rate=.05,
    max_depth=12,
    min_child_weight=3,
    subsample=.8,
    colsample_bytree=.8,
)

# Track the classification error on both the training and validation splits.
# Training stops once the monitored metric has gone 50 rounds without improving
# (per the XGBoost docs, the last eval_set entry, here validation, is the one monitored).
eval_set = [(X_train, y_train), (X_val, y_val)]
fit_model = gbm.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='error',
    early_stopping_rounds=50,
    verbose=False,
)

In [36]:
# XGBoost metrics on the training split.
y_train_predict = fit_model.predict(X_train)
for metric_label, metric_fn in (
    ("Train Accuracy: ", accuracy_score),
    ("Train Recall: ", recall_score),
    ("Train Precision: ", precision_score),
    ("Train f1: ", f1_score),
):
    print(metric_label, metric_fn(y_train, y_train_predict))

Train Accuracy:  0.7763509898341359
Train Recall:  0.7157490396927016
Train Precision:  0.7403973509933774
Train f1:  0.7278645833333334


In [37]:
# XGBoost metrics on the validation split.
y_val_predict = fit_model.predict(X_val)
for metric_label, metric_fn in (
    ("Validation Accuracy: ", accuracy_score),
    ("Validation Recall: ", recall_score),
    ("Validation Precision: ", precision_score),
    ("Validation f1: ", f1_score),
):
    print(metric_label, metric_fn(y_val, y_val_predict))

Validation Accuracy:  0.7897271268057785
Validation Recall:  0.7
Validation Precision:  0.8065843621399177
Validation f1:  0.7495219885277246


In [52]:
# Confusion matrix of the XGBoost predictions on the validation split.
# NOTE(review): the saved output looks stale. Its execution count (52) is out of
# sequence and its entries sum to 5,676, while the validation split printed
# earlier has 623 rows. Re-run the notebook top to bottom to refresh it.
confusion_matrix(y_val, y_val_predict)

array([[5191,   45],
       [ 367,   73]])