## Tutorial notebook based on the Kaggle AudioBook App Data Competition, as found in https://www.kaggle.com/datasets/faressayah/audiobook-app-data

### We solve an UNBALANCED BINARY PREDICTION PROBLEM ON TABULAR DATA via a gradient-boosted decision trees approach using XGBoost.

### This approach can be carried over and applied to a variety of prediction problems on unbalanced tabular data.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# data previously downloaded from https://www.kaggle.com/datasets/faressayah/audiobook-app-data/download
df = pd.read_csv("audiobook_data_2.csv")
df

Unnamed: 0.1,Unnamed: 0,Book_length(mins)_overall,Book_length(mins)_avg,Price_overall,Price_avg,Review,Review10/10,Completion,Minutes_listened,Support_Request,Last_Visited_mins_Purchase_date,Target
0,994,1620.0,1620,19.73,19.73,1,10.00,0.99,1603.8,5,92,0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0,0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388,0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14079,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4,0
14080,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29,0
14081,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0
14082,32832,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90,0


In [4]:
# remove header row
df = pd.read_csv("audiobook_data_2.csv", header=None, skiprows=[0])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,994,1620.0,1620,19.73,19.73,1,10.00,0.99,1603.8,5,92,0
1,1143,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0,0
2,2059,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388,0
3,2882,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129,0
4,3342,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14079,28220,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4,0
14080,28671,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29,0
14081,31134,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0,0
14082,32832,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14084 entries, 0 to 14083
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       14084 non-null  int64  
 1   1       14084 non-null  float64
 2   2       14084 non-null  int64  
 3   3       14084 non-null  float64
 4   4       14084 non-null  float64
 5   5       14084 non-null  int64  
 6   6       14084 non-null  float64
 7   7       14084 non-null  float64
 8   8       14084 non-null  float64
 9   9       14084 non-null  int64  
 10  10      14084 non-null  int64  
 11  11      14084 non-null  int64  
dtypes: float64(6), int64(6)
memory usage: 1.3 MB


In [6]:
# drop ID column-0 and target column-11 from X
X = df.drop(columns=[0,11])
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1620.0,1620,19.73,19.73,1,10.00,0.99,1603.8,5,92
1,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,0
2,2160.0,2160,5.33,5.33,0,8.91,0.00,0.0,0,388
3,1620.0,1620,5.96,5.96,0,8.91,0.42,680.4,1,129
4,2160.0,2160,5.33,5.33,0,8.91,0.22,475.2,0,361
...,...,...,...,...,...,...,...,...,...,...
14079,1620.0,1620,5.33,5.33,1,9.00,0.61,988.2,0,4
14080,1080.0,1080,6.55,6.55,1,6.00,0.29,313.2,0,29
14081,2160.0,2160,6.14,6.14,0,8.91,0.00,0.0,0,0
14082,1620.0,1620,5.33,5.33,1,8.00,0.38,615.6,0,90


In [7]:
y = df[11]
y

0        0
1        0
2        0
3        0
4        0
        ..
14079    0
14080    0
14081    0
14082    0
14083    1
Name: 11, Length: 14084, dtype: int64

In [8]:
X_array = X.to_numpy()
y_array = y.to_numpy()

In [9]:
# shuffle data

shuffled_indices = np.arange(X_array.shape[0])
np.random.shuffle(shuffled_indices)

X_shuffled = X_array[shuffled_indices]
y_shuffled = y_array[shuffled_indices]

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_shuffled, y_shuffled,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    stratify=y_shuffled)

In [11]:
# even though trees approach does not require standardizing data, we will do it anyway

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [12]:
X_train_std.shape, X_test_std.shape, y_train.shape, y_test.shape

((9858, 10), (4226, 10), (9858,), (4226,))

#### Will re-balance target train data, by upsampling the positive class

In [13]:
y_train.shape, np.unique(y_train), y_train.sum()

((9858,), array([0, 1], dtype=int64), 1566)

In [14]:
print('Number of class-0 examples before:', X_train_std[y_train == 0].shape[0])
print('Number of class-1 examples before:', X_train_std[y_train == 1].shape[0])

Number of class-0 examples before: 8292
Number of class-1 examples before: 1566


In [15]:
# data balancing on train set ONLY

from sklearn.utils import resample

X_upsampled, y_upsampled = resample(X_train_std[y_train == 1],
                                    y_train[y_train == 1],
                                    replace=True,
                                    n_samples=X_train_std[y_train == 0].shape[0],
                                    random_state=42)

print('Number of class-1 examples after:', X_upsampled.shape[0])

Number of class-1 examples after: 8292


In [16]:
y_upsampled.sum()

8292

In [17]:
X_train_std_balanced = np.vstack((X_train_std[y_train == 0], X_upsampled))

In [18]:
y_train_balanced = np.hstack((y_train[y_train == 0], y_upsampled))

In [19]:
X_train_std_balanced.shape, y_train_balanced.shape, y_train_balanced.sum()

((16584, 10), (16584,), 8292)

In [20]:
X_test_std.shape, y_test.shape, y_test.sum()

((4226, 10), (4226,), 671)

In [21]:
################

In [24]:
import xgboost as xgb
xgb.__version__

#Note: we use XGBoost version >= 1.6

'1.6.1'

In [25]:
xgb_model = xgb.XGBClassifier(n_estimators=1000,
                              learning_rate=0.055,
                              max_depth=4,
                              n_jobs=-1,
                              random_state=42,
                              use_label_encoder=False,
                              tree_method='gpu_hist',
                              gpu_id=0)

In [26]:
xgb_classifier = xgb_model.fit(X_train_std_balanced, y_train_balanced)

In [27]:
y_train_pred = xgb_classifier.predict(X_train_std_balanced)
y_test_pred = xgb_classifier.predict(X_test_std)

In [28]:
from sklearn.metrics import accuracy_score

xgb_classifier_train_acc = accuracy_score(y_train_balanced, y_train_pred)
xgb_classifier_test_acc = accuracy_score(y_test, y_test_pred)

#### Note: Accuracy is an appropriate metric for the train data, since it is now a "balanced" data set after resampling.

In [29]:
print(f'XGBoost train accuracy: {xgb_classifier_train_acc:.5f}')
print(f'XGBoost test accuracy:  {xgb_classifier_test_acc:.5f}')

XGBoost train accuracy: 0.86547
XGBoost test accuracy:  0.85045


#### Will improve search of optimal parameters via GridSearchCV, RandomizedSearchCV

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

param_grid_simple = {'n_estimators':[50,1000],
                     'learning_rate':[0.010,0.300],
                     'max_depth':[3,10],
                     'gamma':[0,5],
                     'subsample':[0.5,1.0],
                     'colsample_bytree':[0.5,1.0]}

param_grid_full = {'n_estimators':[50,100,200,400,800,1000],
                   'learning_rate':[0.010,0.020,0.040,0.080,0.100,0.200,0.300],
                   'max_depth':[3,4,5,6,7,8,9,10],
                   'gamma':[0,1,2,3,4,5],
                   'subsample':[0.5,0.6,0.7,0.8,0.9,1.0],
                   'colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1.0]}

In [31]:
xgb_model_grid_simple_search = GridSearchCV(xgb_model,
                                            param_grid_simple,
                                            n_jobs=-1,
                                            cv=5,
                                            verbose=3,
                                            scoring='accuracy')

In [32]:
xgb_model_grid_full_search = GridSearchCV(xgb_model,
                                          param_grid_full,
                                          n_jobs=-1,
                                          cv=5,
                                          verbose=3,
                                          scoring='accuracy')

In [33]:
# WARNING: searching through "param_grid_full" takes a very LONG time
# xgb_model_grid_full_search.fit(X_train_std_balanced, y_train_balanced)

In [34]:
xgb_model_grid_simple_search.fit(X_train_std_balanced, y_train_balanced)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0, gpu_id=0, grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.055, max_bin=256,
                                     max_cat_to_on...
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=1000, n_jobs=-1,
                                     num_parallel_tree=1, predictor='auto',
      

In [35]:
xgb_classifier_optimized = xgb_model_grid_simple_search.best_estimator_

In [36]:
xgb_model_grid_simple_search.best_params_

{'colsample_bytree': 1.0,
 'gamma': 0,
 'learning_rate': 0.3,
 'max_depth': 10,
 'n_estimators': 1000,
 'subsample': 0.5}

In [37]:
xgb_classifier_optimized_train_acc = accuracy_score(y_train_balanced, xgb_classifier_optimized.predict(X_train_std_balanced))
xgb_classifier_optimized_test_acc  = accuracy_score(y_test, xgb_classifier_optimized.predict(X_test_std))

In [38]:
print(f'XGBoost-optimized train accuracy: {xgb_classifier_optimized_train_acc:.5f}')
print(f'XGBoost-optimized test accuracy:  {xgb_classifier_optimized_test_acc:.5f}')

XGBoost-optimized train accuracy: 0.90774
XGBoost-optimized test accuracy:  0.86204


In [39]:
# distribution of non-integer parameters for the RandomizedSearchCV to sample from

import scipy.stats

param_range_learning_rate = scipy.stats.loguniform(0.010,0.300)
param_range_gamma = scipy.stats.loguniform(0,5)
param_range_subsample = scipy.stats.loguniform(0.5,1.0)
param_range_colsample_bytree = scipy.stats.loguniform(0.5,1.0)

param_grid_distributions_list = [{'n_estimators':[50,100,200,400,800,1000]},
                                 {'learning_rate':param_range_learning_rate},
                                 {'max_depth':[3,4,5,6,7,8,9,10]},
                                 {'gamma':param_range_gamma},
                                 {'subsample':param_range_subsample},
                                 {'colsample_bytree':param_range_colsample_bytree}]

# Note: it was found to be optimal to simply perform random search over the 'param_grid_full',
#       rather than using log-distributions set up above

In [40]:
xgb_model_random_search = RandomizedSearchCV(estimator=xgb_model,
                                             param_distributions=param_grid_full,
                                             #param_distributions=param_grid_distributions_list,
                                             #n_iter=(30*2),
                                             n_iter=100,
                                             scoring='accuracy',
                                             n_jobs=-1,
                                             cv=5,
                                             verbose=3,
                                             random_state=42)

In [41]:
xgb_model_random_search.fit(X_train_std_balanced, y_train_balanced)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=0, gpu_id=0,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.055, max_bin=256,
                                           max_cat...
                                           predictor='auto', random_state=42,
                                           reg_alpha=0, reg

In [42]:
xgb_classifier_optimized_random = xgb_model_random_search.best_estimator_

In [43]:
xgb_model_random_search.best_params_

{'subsample': 0.5,
 'n_estimators': 400,
 'max_depth': 7,
 'learning_rate': 0.3,
 'gamma': 0,
 'colsample_bytree': 0.6}

In [44]:
xgb_classifier_optimized_random_train_acc = accuracy_score(y_train_balanced, xgb_classifier_optimized_random.predict(X_train_std_balanced))
xgb_classifier_optimized_random_test_acc  = accuracy_score(y_test, xgb_classifier_optimized_random.predict(X_test_std))

In [45]:
print(f'XGBoost-optimized-random train accuracy: {xgb_classifier_optimized_random_train_acc:.5f}')
print(f'XGBoost-optimized-random test accuracy:  {xgb_classifier_optimized_random_test_acc:.5f}')

XGBoost-optimized-random train accuracy: 0.90189
XGBoost-optimized-random test accuracy:  0.86323


In [46]:
xgb_classifier_optimized_random.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=400,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)>

#### Final model verification using resampled test data set this time, to ensure model is making "real" predictions.

In [47]:
print('Number of class-0 test examples:', X_test_std[y_test == 0].shape[0])
print('Number of class-1 test examples:', X_test_std[y_test == 1].shape[0])

Number of class-0 test examples: 3555
Number of class-1 test examples: 671


#### Rebalance the test data to confirm that the accuracy metric of final model is indeed better than 50/50 guessing

In [48]:
X_test_upsampled, y_test_upsampled = resample(X_test_std[y_test == 1],
                                     y_test[y_test == 1],
                                     replace=True,
                                     n_samples=X_test_std[y_test == 0].shape[0],
                                     random_state=42)

print('Number of class-1 test examples after rebalancing:', X_test_upsampled.shape[0])

Number of class-1 test examples after rebalancing: 3555


In [49]:
y_test_upsampled.sum()

3555

In [50]:
X_test_std_balanced = np.vstack((X_test_std[y_test == 0], X_test_upsampled))
y_test_balanced = np.hstack((y_test[y_test == 0], y_test_upsampled))

In [51]:
print('Number of class-0 test-balanced examples:', X_test_std_balanced[y_test_balanced == 0].shape[0])
print('Number of class-1 test-balanced examples:', X_test_std_balanced[y_test_balanced == 1].shape[0])

Number of class-0 test-balanced examples: 3555
Number of class-1 test-balanced examples: 3555


In [52]:
X_test_std_balanced.shape, y_test_balanced.shape, y_test_balanced.sum()

((7110, 10), (7110,), 3555)

In [53]:
y_test_balanced_prediction = xgb_classifier_optimized_random.predict(X_test_std_balanced)

In [54]:
xgb_classifier_optimized_random_test_balanced_acc = accuracy_score(y_test_balanced, y_test_balanced_prediction)
print(f'XGBoost-optimized-random test-balanced accuracy: {xgb_classifier_optimized_random_test_balanced_acc:.5f}')

XGBoost-optimized-random test-balanced accuracy: 0.78284


#### Thus on a "balanced" test data set, our final model has accuracy: 78.3%.

#### Using the UNbalanced test data to compute the final model metrics

In [57]:
y_test_prediction = xgb_classifier_optimized_random.predict(X_test_std)

In [58]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_test_prediction)

array([[3202,  353],
       [ 225,  446]], dtype=int64)

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

xgb_classifier_optimized_random_test_acc=accuracy_score(y_test, y_test_prediction)
xgb_classifier_optimized_random_test_prec=precision_score(y_test, y_test_prediction)
xgb_classifier_optimized_random_test_rec=recall_score(y_test, y_test_prediction)
xgb_classifier_optimized_random_test_f1=f1_score(y_test, y_test_prediction)

print(f'XGBoost-optimized-random test accuracy: {xgb_classifier_optimized_random_test_acc:.5f}')
print(f'XGBoost-optimized-random test precision: {xgb_classifier_optimized_random_test_prec:.5f}')
print(f'XGBoost-optimized-random test recall: {xgb_classifier_optimized_random_test_rec:.5f}')
print(f'XGBoost-optimized-random test f1-score: {xgb_classifier_optimized_random_test_f1:.5f}')

XGBoost-optimized-random test accuracy: 0.86323
XGBoost-optimized-random test precision: 0.55820
XGBoost-optimized-random test recall: 0.66468
XGBoost-optimized-random test f1-score: 0.60680


### Finally, on the original test data set, our final F1-score: 0.60680

#### Save the final XGBoost model in JSON format into local drive

In [62]:
xgb_classifier_optimized_random.save_model("xgb_classifier_optimized_random.json")

In [63]:
# To reload the same model:

# xgb_classifier_2 = xgb.XGBClassifier()
# xgb_classifier_2.load_model("xgb_classifier_optimized_random.json")