#### Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from time import time

#### Load the data

In [2]:
# Read the training set from its CSV file into a DataFrame
train_data = pd.read_csv(filepath_or_buffer='train_data.csv')
# Print a structural summary: index range, column count, dtypes and memory usage
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79416 entries, 0 to 79415
Columns: 884 entries, price_range to paint_color_yellow
dtypes: float64(883), int64(1)
memory usage: 535.6 MB


In [3]:
# Display the training frame as the cell output (pandas abbreviates the
# rendering with "..." for the rows and columns it does not show)
train_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79411,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79412,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79413,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79414,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Separate the dependent variable from the independent variables in the training data

In [4]:
# Split the training frame into the target vector and the feature matrix.
# The target is picked by its column name instead of by position, so a change
# in the CSV column order cannot silently turn a feature into the label.
y_train = train_data['price_range'].to_numpy()
# Every remaining column is a feature; convert to a plain NumPy array for the model
X_train = train_data.drop(columns='price_range').to_numpy()

#### Train the model with k-fold cross validation

In [5]:
# Timestamp taken before any work, used to report the elapsed time below
start = time()

# XGBoost classifier with the library's default hyper-parameters
xgb_model = XGBClassifier()

# Stratified 5-fold cross-validation repeated 3 times; the fixed random_state
# makes the fold assignment reproducible across runs
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=123)

# One accuracy value per fold/repeat combination
n_scores = cross_val_score(
    estimator=xgb_model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,             # run the folds sequentially, no parallel workers
    error_score='raise',  # propagate a failing fit instead of recording a score for it
)

# Timestamp after cross-validation; the difference is the elapsed time in seconds
end = time()
print("The runtime of the model training:", end-start)

# Report the mean accuracy with its standard deviation in parentheses
print('Accuracy: %.3f (%.3f)'% (np.mean(n_scores), np.std(n_scores)))











The runtime of the model training: 11667.689457416534
Accuracy: 0.680 (0.004)


#### Fit the model

In [6]:
# Timestamp taken immediately before fitting
start = time()

# Train the classifier on the complete training set
xgb_model.fit(X=X_train, y=y_train)

# Timestamp after fitting; print the elapsed time in seconds
end = time()
print("The runtime of the model training:", end-start)

The runtime of the model training: 998.6033983230591


#### Load the test data

In [7]:
# Read the test set from its CSV file into a DataFrame
test_data = pd.read_csv(filepath_or_buffer='test_data.csv')
# Show the loaded frame as the cell output
test_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34031,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34032,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
34033,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34034,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Separate the dependent variable from the independent variables in the test data

In [8]:
# Split the test frame into features and target.
# The model is fed bare NumPy arrays, so it cannot check column names itself.
# Selecting the features by the training column names (in training order)
# guarantees the test matrix has the same column layout the model was fitted
# on, even if the CSV column order differs; a missing column raises KeyError
# instead of silently shifting features.
feature_columns = train_data.columns.drop('price_range')
X_test = test_data[feature_columns].to_numpy()
# Target selected by name rather than by position
y_test = test_data['price_range'].to_numpy()

#### Make predictions on the test data using the model

In [9]:
# Predict a class label for every row of the test feature matrix with the
# classifier fitted above
test_prediction = xgb_model.predict(X_test)

#### Evaluate the performance of the model by checking the precision, recall and overall accuracy

In [10]:
# Share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=test_prediction)
# Report it as a percentage with two decimals
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 67.87%


In [11]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test set, with 4 decimals.
# Some of the rare classes are never predicted, which leaves their precision
# undefined (0/0). zero_division=0 reports those entries as 0.0 explicitly,
# without the undefined-metric warning the default setting emits.
report = classification_report(y_test, test_prediction, digits=4, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0     0.8237    0.8291    0.8264      9451
           1     0.6758    0.7486    0.7103      9712
           2     0.6164    0.5323    0.5712      5672
           3     0.5800    0.5826    0.5813      3785
           4     0.5385    0.5053    0.5214      1991
           5     0.5271    0.5366    0.5318      1543
           6     0.5385    0.4902    0.5132       871
           7     0.6108    0.4832    0.5396       536
           8     0.5969    0.5302    0.5616       215
           9     0.6515    0.6515    0.6515       132
          10     0.5588    0.4043    0.4691        47
          11     0.5714    0.4528    0.5053        53
          12     0.8000    0.2500    0.3810        16
          13     0.2500    0.2500    0.2500         4
          14     0.5000    1.0000    0.6667         1
          15     0.0000    0.0000    0.0000         2
          16     0.0000    0.0000    0.0000         1
          18     0.0000    

  _warn_prf(average, modifier, msg_start, len(result))


#### Import the pandemic dataset

In [12]:
# Read the pandemic dataset from its CSV file into a DataFrame
pandemic_data = pd.read_csv(filepath_or_buffer='pandemic_data.csv')

In [13]:
# Display the pandemic frame as the cell output (rendering is abbreviated by pandas)
pandemic_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81404,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81405,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81406,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81407,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Separate the dependent variable from the independent variables in the pandemic data

In [14]:
# Split the pandemic frame into features and target.
# Features are selected by the training column names (in training order) so
# the matrix handed to the model has the column layout it was fitted on, even
# if this CSV orders its columns differently; a missing column raises
# KeyError instead of silently shifting features.
feature_columns = train_data.columns.drop('price_range')
X_pandemic = pandemic_data[feature_columns].to_numpy()
# Target selected by name rather than by position
y_pandemic = pandemic_data['price_range'].to_numpy()

In [15]:
# Sanity check: (number of rows, number of feature columns) of the pandemic feature matrix
X_pandemic.shape

(81409, 883)

#### Make predictions on the pandemic data using the model

In [16]:
# Predict a class label for every row of the pandemic feature matrix using
# the same fitted classifier (no re-training on pandemic data)
pandemic_prediction = xgb_model.predict(X_pandemic)

#### Evaluate the performance of the model by checking the precision, recall and overall accuracy of predictions on the pandemic data

In [17]:
# Share of pandemic rows whose predicted label equals the true label
pandemic_accuracy = accuracy_score(y_true=y_pandemic, y_pred=pandemic_prediction)
# Report it as a percentage with two decimals
print("Pandemic Accuracy: %.2f%%" % (pandemic_accuracy * 100.0))

Pandemic Accuracy: 56.20%


In [18]:
# Per-class precision, recall, F1 and support on the pandemic set, with 4 decimals.
# Several rare classes are never predicted, which leaves their precision
# undefined (0/0). zero_division=0 reports those entries as 0.0 explicitly,
# without the undefined-metric warning the default setting emits.
report_pandemic = classification_report(y_pandemic, pandemic_prediction, digits=4, zero_division=0)
print(report_pandemic)

              precision    recall  f1-score   support

           0     0.7299    0.8127    0.7691     20824
           1     0.6151    0.6879    0.6495     24377
           2     0.4817    0.4294    0.4540     13044
           3     0.4204    0.4231    0.4218      8301
           4     0.3138    0.2363    0.2696      4740
           5     0.2808    0.2879    0.2843      3456
           6     0.2722    0.1929    0.2258      2478
           7     0.2441    0.1190    0.1600      1823
           8     0.1007    0.0848    0.0921       884
           9     0.1628    0.0851    0.1118       658
          10     0.0238    0.0029    0.0052       345
          11     0.0333    0.0092    0.0144       218
          12     0.1333    0.0580    0.0808        69
          13     0.0000    0.0000    0.0000        91
          14     0.0000    0.0000    0.0000        37
          15     0.0000    0.0000    0.0000        22
          16     0.0000    0.0000    0.0000        10
          17     0.0000    

  _warn_prf(average, modifier, msg_start, len(result))


#### Get the Cohen kappa score of the model with the test data

In [19]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the true and predicted test labels, shown as the cell output
cohen_kappa_score(y1=y_test, y2=test_prediction)

0.5930671310155209

#### Get the Cohen kappa score of the model with the pandemic data

In [20]:
# Cohen's kappa between the true and predicted pandemic labels, shown as the cell output
cohen_kappa_score(y1=y_pandemic, y2=pandemic_prediction)

0.4442547741900391