#### Import the relevant libraries

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

#### Load the data

In [2]:
# Read the training set from the working directory, then print a structural
# summary (row count, column count, dtypes, memory footprint).
train_data = pd.read_csv('train_data.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79416 entries, 0 to 79415
Columns: 884 entries, price_range to paint_color_yellow
dtypes: float64(883), object(1)
memory usage: 535.6+ MB


In [3]:
# Show the training frame; the rendered output is truncated to the first/last
# rows and columns. Column 0 (price_range) is the label used further down.
train_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79411,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79412,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79413,0-5k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79414,0-5k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Take a sample of the data

In [4]:
# Draw a 10% random sample of the rows with a fixed seed so the same subset is
# selected on every run, then show its shape as a sanity check.
# NOTE(review): the sample is not stratified by price_range, so very rare
# classes may be missing from it — confirm that is acceptable.
train_data_sample = train_data.sample(frac=0.10, random_state=50)
train_data_sample.shape

(7942, 884)

In [5]:
# Show the sampled frame; the index keeps the original row labels from train_data.
train_data_sample

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
53645,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29058,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1888,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30044,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12676,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39447,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
12997,0-5k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25753,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
34324,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Separate the dependent variable from the independent variables in the sample data

In [6]:
# Column 0 is the target (price_range); every remaining column is a feature.
y_train_sample = train_data_sample.iloc[:, 0].to_numpy()
X_train_sample = train_data_sample.iloc[:, 1:].to_numpy()

#### Train the model on the sample data with k-fold cross validation

In [9]:
# Baseline XGBoost classifier using the library's default hyperparameters.
xgb_model = XGBClassifier()

# Stratified 5-fold cross-validation repeated 3 times (15 accuracy scores),
# seeded so the fold assignment is reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=123)
n_scores = cross_val_score(
    estimator=xgb_model,
    X=X_train_sample,
    y=y_train_sample,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
    error_score='raise',  # surface a failing fold as an exception instead of a NaN score
)

# Mean accuracy and its standard deviation across all folds and repeats.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))











Accuracy: 0.595 (0.012)


#### Separate the dependent variable from the independent variables in the whole data

In [10]:
# Column 0 is the target (price_range); every remaining column is a feature.
y_train = train_data.iloc[:, 0].to_numpy()
X_train = train_data.iloc[:, 1:].to_numpy()

#### Train the model on the whole training data with k-fold cross validation

In [11]:
# Fresh baseline XGBoost classifier (default hyperparameters); this instance is
# the one fitted and used for prediction in the cells that follow.
xgb_model = XGBClassifier()

# Stratified 5-fold cross-validation repeated 3 times (15 accuracy scores),
# seeded so the fold assignment is reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=123)
n_scores = cross_val_score(
    estimator=xgb_model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
    error_score='raise',  # surface a failing fold as an exception instead of a NaN score
)

# Mean accuracy and its standard deviation across all folds and repeats.
print('Accuracy: %.3f (%.3f)' % (n_scores.mean(), n_scores.std()))











Accuracy: 0.680 (0.004)


#### Fit the model

In [12]:
# Fit the classifier on the full training set. cross_val_score above only
# fitted clones of the estimator, so this is the call that actually trains
# `xgb_model`, the object the later predict cells use.
xgb_model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

#### Load the test data

In [13]:
# Read the held-out test set from the working directory and display it
# (the rendering is truncated to the first/last rows and columns).
test_data = pd.read_csv('test_data.csv')
test_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25k-30k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34031,5k-10k,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34032,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
34033,25k-30k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34034,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Separate the dependent variable from the independent variables in the test data

In [14]:
# Column 0 is the target (price_range); every remaining column is a feature.
y_test = test_data.iloc[:, 0].to_numpy()
X_test = test_data.iloc[:, 1:].to_numpy()

#### Make predictions on the test data using the model

In [19]:
# Predict a price_range label for every test row with the fitted model.
test_prediction = xgb_model.predict(X_test)

#### Evaluate the performance of the model by checking the precision, recall and overall accuracy

In [20]:
# Overall accuracy of the test-set predictions.
# Score `test_prediction` (the output of xgb_model.predict(X_test)): it is the
# only prediction variable this notebook defines, so the cell also works after
# Restart Kernel -> Run All.
accuracy = accuracy_score(y_test, test_prediction)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 67.87%


In [21]:
# Per-class precision / recall / F1 on the test set.
# `classification_report` is imported in the notebook's import cell.
# Some price ranges are never predicted, which makes their precision undefined;
# zero_division=0 reports those as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning (the numbers shown are unchanged).
report = classification_report(y_test, test_prediction, zero_division=0)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        0-5k       0.82      0.83      0.83      9451
     10k-15k       0.62      0.53      0.57      5672
   120k-125k       0.00      0.00      0.00         1
   125k-130k       0.00      0.00      0.00         1
     15k-20k       0.58      0.58      0.58      3785
     20k-25k       0.54      0.51      0.52      1991
     25k-30k       0.53      0.54      0.53      1543
     30k-35k       0.54      0.49      0.51       871
     35k-40k       0.61      0.48      0.54       536
     40k-45k       0.60      0.53      0.56       215
     45k-50k       0.65      0.65      0.65       132
     50k-55k       0.56      0.40      0.47        47
     55k-60k       0.57      0.45      0.51        53
      5k-10k       0.68      0.75      0.71      9712
     60k-65k       0.80      0.25      0.38        16
     65k-70k       0.25      0.25      0.25         4
     70k-75k       0.50      1.00      0.67         1
     75k-80k       0.00    

#### Import the pandemic dataset

In [22]:
# Read the pandemic-period dataset from the working directory.
pandemic_data = pd.read_csv('pandemic_data.csv')

In [23]:
# Show the pandemic frame (rendering truncated); its header matches the
# train/test frames, with price_range in column 0.
pandemic_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,40k-45k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60k-65k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81404,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81405,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81406,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81407,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Separate the dependent variable from the independent variables in the pandemic data

In [24]:
# Column 0 is the target (price_range); every remaining column is a feature.
y_pandemic = pandemic_data.iloc[:, 0].to_numpy()
X_pandemic = pandemic_data.iloc[:, 1:].to_numpy()

In [25]:
# Sanity check: the feature matrix should have 883 columns, i.e. the 884
# columns of the training frame minus the label.
X_pandemic.shape

(81409, 883)

#### Make predictions on the pandemic data using the model

In [27]:
# Predict price_range for the pandemic rows with the same fitted model
# (no re-training on pandemic data).
pandemic_prediction = xgb_model.predict(X_pandemic)

#### Evaluate the performance of the model by checking the precision, recall and overall accuracy of predictions on the pandemic data

In [30]:
# Overall accuracy of the model on the pandemic-period data, printed as a percentage.
pandemic_accuracy = accuracy_score(y_true=y_pandemic, y_pred=pandemic_prediction)
print("Pandemic Accuracy: %.2f%%" % (100.0 * pandemic_accuracy))

Pandemic Accuracy: 56.20%


In [31]:
# Per-class precision / recall / F1 on the pandemic data.
# Several price ranges are never predicted, so their precision is undefined;
# zero_division=0 reports those as 0.0 explicitly rather than relying on the
# warn-and-zero default (the numbers shown are unchanged).
report_pandemic = classification_report(y_pandemic, pandemic_prediction, zero_division=0)
print(report_pandemic)

              precision    recall  f1-score   support

        0-5k       0.73      0.81      0.77     20824
   100k-105k       0.00      0.00      0.00         1
   105k-110k       0.00      0.00      0.00         2
     10k-15k       0.48      0.43      0.45     13044
   115k-120k       0.00      0.00      0.00         1
   120k-125k       0.00      0.00      0.00         7
   125k-130k       0.00      0.00      0.00         5
   130k-135k       0.00      0.00      0.00         2
     15k-20k       0.42      0.42      0.42      8301
     20k-25k       0.31      0.24      0.27      4740
     25k-30k       0.28      0.29      0.28      3456
     30k-35k       0.27      0.19      0.23      2478
     35k-40k       0.24      0.12      0.16      1823
     40k-45k       0.10      0.08      0.09       884
     45k-50k       0.16      0.09      0.11       658
     50k-55k       0.02      0.00      0.01       345
     55k-60k       0.03      0.01      0.01       218
      5k-10k       0.62    