#### Import the relevant libraries

In [1]:
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

#### Load the data

In [2]:
train_data = pd.read_csv('train_data.csv', index_col=False)
train_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79411,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79412,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79413,0-5k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79414,0-5k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Take a sample of the data

In [3]:
train_data_sample = train_data.sample(frac=0.01, random_state=50)
train_data_sample.shape

(794, 884)

In [4]:
train_data_sample

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
53645,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29058,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1888,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30044,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12676,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78736,0-5k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77374,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
70285,25k-30k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8109,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


#### Separate the dependent variable from the independent variables in the sample data

In [5]:
X_train_sample = train_data_sample.iloc[:,1:].values
y_train_sample = train_data_sample.iloc[:, 0].values

#### Train the model on the sample data with k-fold cross validation

In [18]:
#Define the model
random_forest_model= RandomForestClassifier()
#Evaluate the model
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=123)
n_scores = cross_val_score(random_forest_model, X_train_sample, y_train_sample, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')

#performance report
print('Accuracy: %.3f (%.3f)'% (np.mean(n_scores), np.std(n_scores)))



Accuracy: 0.433 (0.028)


#### Separate the dependent variable from the independent variables in the whole data

In [9]:
X_train = train_data.iloc[:,1:].values
y_train = train_data.iloc[:, 0].values

#### Train the model on the whole testing data with k-fold cross validation

In [10]:
#Define the model
random_forest_model= RandomForestClassifier()
#Evaluate the model
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=123)
n_scores = cross_val_score(random_forest_model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')

#performance report
print('Accuracy: %.3f (%.3f)'% (np.mean(n_scores), np.std(n_scores)))



Accuracy: 0.751 (0.003)


#### Fit the model

In [21]:
random_forest_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### Load the test data

In [12]:
test_data = pd.read_csv('test_data.csv')
test_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,25k-30k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34031,5k-10k,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34032,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
34033,25k-30k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34034,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Separte the dependent variable from the independent variables in the test data

In [13]:
X_test = test_data.iloc[:,1:].values
y_test = test_data.iloc[:, 0].values

#### Make predictions on the test data using the model

In [22]:
test_prediction = random_forest_model.predict(X_test)

#### Evaluate the performance of the model by checking the precision, recall and overall accuracy

In [17]:
from sklearn.metrics import classification_report

report = classification_report(y_test, test_prediction)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        0-5k       0.84      0.86      0.85      9451
     10k-15k       0.74      0.69      0.71      5672
   120k-125k       0.00      0.00      0.00         1
   125k-130k       0.00      0.00      0.00         1
     15k-20k       0.71      0.70      0.71      3785
     20k-25k       0.72      0.64      0.67      1991
     25k-30k       0.68      0.69      0.69      1543
     30k-35k       0.72      0.69      0.70       871
     35k-40k       0.76      0.66      0.71       536
     40k-45k       0.68      0.63      0.66       215
     45k-50k       0.75      0.64      0.69       132
     50k-55k       0.68      0.49      0.57        47
     55k-60k       0.74      0.49      0.59        53
      5k-10k       0.75      0.79      0.77      9712
     60k-65k       0.80      0.25      0.38        16
     65k-70k       0.33      0.25      0.29         4
     70k-75k       0.33      1.00      0.50         1
     75k-80k       0.00    

#### Import the pandemic dataset

In [29]:
pandemic_data = pd.read_csv('pandemic_data.csv')

In [30]:
pandemic_data

Unnamed: 0,price_range,region_SF bay area,region_akron / canton,region_albany,"region_albany, NY",region_albuquerque,region_amarillo,"region_amarillo, TX",region_anchorage / mat-su,region_ann arbor,...,paint_color_brown,paint_color_custom,paint_color_green,paint_color_grey,paint_color_orange,paint_color_purple,paint_color_red,paint_color_silver,paint_color_white,paint_color_yellow
0,40k-45k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60k-65k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5k-10k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81404,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
81405,15k-20k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81406,20k-25k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
81407,10k-15k,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


#### Separte the dependent variable from the independent variables in the pandemic data

In [31]:
X_pandemic = pandemic_data.iloc[:,1:].values
y_pandemic = pandemic_data.iloc[:, 0].values

In [34]:
X_pandemic.shape

(81409, 883)

#### Make predictions on the pandemic data using the model

In [35]:
pandemic_prediction = random_forest_model.predict(X_pandemic)

#### Evaluate the performance of the model by checking the precision, recall and overall accuracy of predictions on the pandemic data

In [37]:
report_pandemic = classification_report(y_pandemic, pandemic_prediction)
print(report_pandemic)

              precision    recall  f1-score   support

        0-5k       0.71      0.83      0.77     20824
   100k-105k       0.00      0.00      0.00         1
   105k-110k       0.00      0.00      0.00         2
     10k-15k       0.48      0.45      0.46     13044
   115k-120k       0.00      0.00      0.00         1
   120k-125k       0.00      0.00      0.00         7
   125k-130k       0.00      0.00      0.00         5
   130k-135k       0.00      0.00      0.00         2
     15k-20k       0.40      0.39      0.40      8301
     20k-25k       0.31      0.23      0.26      4740
     25k-30k       0.27      0.26      0.26      3456
     30k-35k       0.24      0.16      0.19      2478
     35k-40k       0.24      0.11      0.15      1823
     40k-45k       0.12      0.07      0.09       884
     45k-50k       0.15      0.07      0.09       658
     50k-55k       0.01      0.00      0.00       345
     55k-60k       0.08      0.02      0.03       218
      5k-10k       0.61    