In [33]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [20]:
#initialise random forest model, using best parameters from CV from v1 rf notebook
rf_best_params = RandomForestClassifier(n_estimators = 987,
 min_samples_split = 10,
 min_samples_leaf = 3,
 max_depth = 100,
 criterion = 'entropy',
 bootstrap = True)

In [16]:
# Take the processing function from hackathon_lgbm.ipynb to process all data in the same way
def prepare_train_test(X):
    X = X.dropna()
    X = X.copy().query('gender != "None"')
    X = X.query('corona_result != "other"')
    X['test_indication'] = X['test_indication'].map({'Contact with confirmed': 1, 'Other': 0, 'Abroad': 0})
    X['age_60_and_above'] = X['age_60_and_above'].map({'Yes': 1, 'No': 0})
    X['gender'] = X['gender'].map({'male': 1, 'female': 0})
    X['corona_result'] = X['corona_result'].map({'positive': 1, 'negative': 0})
    X = X.copy().query('cough != "None"')
    X = X.copy().query('fever != "None"')
    X = X.copy().query('sore_throat != "None"')
    X = X.copy().query('shortness_of_breath != "None"')
    X = X.copy().query('head_ache != "None"')
    X['cough'] = X['cough'].astype(int)
    X['fever'] = X['fever'].astype(int)
    X['sore_throat'] = X['sore_throat'].astype(int)
    X['shortness_of_breath'] = X['shortness_of_breath'].astype(int)
    X['head_ache'] = X['head_ache'].astype(int)
    X['gender'] = X['gender'].astype(int)
    X = X.drop(['test_date'],axis = 1)
    X = X.dropna()
    X1 = X.drop(['corona_result'],axis = 1)
    y1 = X.copy()['corona_result']
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.2,random_state = 2022, stratify = y1)
    return X_train1, X_test1, y_train1, y_test1

## Load in and create all test and train datasets

In [40]:
#load in and prepare the data March-Apr 2020
mar_apr_2020_data = pd.read_csv("data/corona_tested_individuals_ver_006_march_april.english.csv")
X_train1, X_test1, y_train1, y_test1  = prepare_train_test(mar_apr_2020_data)

  mar_apr_2020_data = pd.read_csv("data/corona_tested_individuals_ver_006_march_april.english.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['test_indication'] = X['test_indication'].map({'Contact with confirmed': 1, 'Other': 0, 'Abroad': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age_60_and_above'] = X['age_60_and_above'].map({'Yes': 1, 'No': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-

In [41]:
#load in and prepare the data March-Apr 2020
sep_nov_2020_data = pd.read_csv("data/corona_tested_individuals_ver_0083_september_november.english.csv")
X_train2, X_test2, y_train2, y_test2 = prepare_train_test(sep_nov_2020_data)

  sep_nov_2020_data = pd.read_csv("data/corona_tested_individuals_ver_0083_september_november.english.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['test_indication'] = X['test_indication'].map({'Contact with confirmed': 1, 'Other': 0, 'Abroad': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age_60_and_above'] = X['age_60_and_above'].map({'Yes': 1, 'No': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org

In [42]:
#load in and prepare the data for March 2021
march_2021_data = pd.read_csv("data/corona_tests_2021_mar.csv")
X_train3, X_test3, y_train3, y_test3 = prepare_train_test(march_2021_data)

  march_2021_data = pd.read_csv("data/corona_tests_2021_mar.csv")


In [43]:
sept_2021_data = pd.read_csv('data/corona_tests_2021_sep.csv')
X_train4, X_test4, y_train4, y_test4 = prepare_train_test(sept_2021_data)

  sept_2021_data = pd.read_csv('data/corona_tests_2021_sep.csv')


## RF Model trained on Mar-Apr 2020 (8 Features)

In [156]:
#fit the rf model to the March-April 2020 data with 8 features
rf_best_params.fit(X_train1, y_train1)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [39]:
#print roc_auc score on the predicted probabilities for each class March-Apr 2020
score1 = roc_auc_score(y_test1, rf_best_params.predict_proba(X_test1)[::,1])
score1

0.9018367994607727

In [44]:
#print roc_auc score on the predicted probabilities for each class Sept-Nov 2020
score2 = roc_auc_score(y_test2, rf_best_params.predict_proba(X_test2)[::,1])
score2

0.8178019627165154

In [45]:
#print roc_auc score on the predicted probabilities for each class March 2021
score3 = roc_auc_score(y_test3, rf_best_params.predict_proba(X_test3)[::,1])
score3

0.8374834562435703

In [46]:
#print roc_auc score on the predicted probabilities for each class September 2021
score4 = roc_auc_score(y_test4, rf_best_params.predict_proba(X_test4)[::,1])
score4

0.6665488853721077

## RF Model trained on March-Apr 2020 (5 features)

In [53]:
#fit the model to the same data with 5 features instead
rf_best_params.fit(X_train1[['cough','fever','test_indication','gender','age_60_and_above']], y_train1)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [55]:
#print roc_auc score on the predicted probabilities for each class March-Apr 2020
score5 = roc_auc_score(y_test1, rf_best_params.predict_proba(X_test1[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score5

0.8740742092969918

In [56]:
#print roc_auc score on the predicted probabilities for each class March-Apr 2020
score6 = roc_auc_score(y_test2, rf_best_params.predict_proba(X_test2[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score6

0.7923815121970326

In [57]:
#print roc_auc score on the predicted probabilities for each class March-Apr 2020
score7 = roc_auc_score(y_test3, rf_best_params.predict_proba(X_test3[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score7

0.8003431806009886

In [58]:
#print roc_auc score on the predicted probabilities for each class March-Apr 2020
score8 = roc_auc_score(y_test4, rf_best_params.predict_proba(X_test4[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score8

0.6455053672816132

## Refit and test the model for each dataset (5 features)

In [59]:
#fit the model to the same data with 5 features instead for sept-nov 2020
rf_best_params.fit(X_train2[['cough','fever','test_indication','gender','age_60_and_above']], y_train2)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [60]:
#print roc_auc score on the predicted probabilities for each class sept-nov 2020
score9 = roc_auc_score(y_test2, rf_best_params.predict_proba(X_test2[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score9

0.7942190318389625

In [61]:
#fit the model to the same data with 5 features instead for march 2021
rf_best_params.fit(X_train3[['cough','fever','test_indication','gender','age_60_and_above']], y_train3)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [62]:
#print roc_auc score on the predicted probabilities for each class march 2021
score10 = roc_auc_score(y_test3, rf_best_params.predict_proba(X_test3[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score10

0.8030174415020479

In [63]:
#fit the model to the same data with 5 features instead for sept 2021
rf_best_params.fit(X_train4[['cough','fever','test_indication','gender','age_60_and_above']], y_train4)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [64]:
#print roc_auc score on the predicted probabilities for each class september 2021
score11 = roc_auc_score(y_test3, rf_best_params.predict_proba(X_test3[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score11

0.7905958979190025

## Refit and test the model for each dataset (8 features)

In [153]:
#fit the rf model to the sept-nov 2020 data with 8 features
rf_best_params.fit(X_train2, y_train2)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [66]:
#print roc_auc score on the predicted probabilities for each class sept-nov 2020
score12 = roc_auc_score(y_test2, rf_best_params.predict_proba(X_test2)[::,1])
score12

0.8188787151867022

In [67]:
#fit the rf model to the march 21 data with 8 features
rf_best_params.fit(X_train3, y_train3)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [68]:
#print roc_auc score on the predicted probabilities for each class sept-nov 2020
score13 = roc_auc_score(y_test3, rf_best_params.predict_proba(X_test3)[::,1])
score13

0.8383086212476737

In [69]:
#fit the rf model to the sept 21 data with 8 features
rf_best_params.fit(X_train4, y_train4)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [70]:
#print roc_auc score on the predicted probabilities for each class sept-nov 2020
score14 = roc_auc_score(y_test4, rf_best_params.predict_proba(X_test4)[::,1])
score14

0.701663852644318

## 3rd, 4th and 5th wave data

In [103]:
wave3_data.head()

Unnamed: 0.1,Unnamed: 0,test_date,cough,fever,sore_throat,shortness_of_breath,head_ache,age_binary,gender_binary,contact,corona_result_binary
0,4724494,2021-01-20,0,0,0,0,0,0.0,0.0,0,0.0
1,4724495,2021-01-20,0,0,0,0,0,0.0,1.0,0,1.0
2,4724496,2021-01-20,0,0,1,0,0,0.0,0.0,1,1.0
3,4724497,2021-01-20,0,0,0,0,0,0.0,0.0,0,0.0
4,4724498,2021-01-20,0,0,0,0,0,0.0,1.0,0,0.0


In [120]:
# Take the processing function from hackathon_lgbm.ipynb to process all data in the same way
def prepare_train_test(X):
    X = X.dropna()
    X = X.drop(['Unnamed: 0'], axis = 1)
    X = X.drop(['test_date'], axis = 1)
    X = X.rename(columns = {"age_binary":"age_60_and_above", 'gender_binary':'gender', "corona_result_binary":'corona_result', "contact":'test_indication'})
    for i in X.columns:
        X[i] = X[i].astype(int)
    X1 = X.drop(['corona_result'],axis = 1)
    y1 = X.copy()['corona_result']
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size = 0.2,random_state = 2022, stratify = y1)
    return X_train1, X_test1, y_train1, y_test1

In [121]:
#load in the 3rd wave data and prepare test train split
wave3_data = pd.read_csv('data/data_processed/third_wave_alpha_processed.csv')
X_train_wave_3, X_test_wave_3, y_train_wave_3, y_test_wave_3  = prepare_train_test(wave3_data)

In [123]:
#load in the 4th wave data and prepare test train split
wave4_data = pd.read_csv('data/data_processed/fourth_wave_delta_processed.csv')
X_train_wave4, X_test_wave_4, y_train_wave_4, y_test_wave_4  = prepare_train_test(wave4_data)

In [124]:
#load in the 5th wave data and prepare test train split
wave5_data = pd.read_csv('data/data_processed/fifth_wave_omicron_processed.csv')
X_train_wave5, X_test_wave_5, y_train_wave_5, y_test_wave_5  = prepare_train_test(wave5_data)

## Testing model trained in March-Apr 2020 (8 features) on wave data

In [86]:
#train on 5 features march-apr 2020
rf_best_params.fit(X_train1, y_train1)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [125]:
#print roc_auc score on the predicted probabilities for wave 3
score15 = roc_auc_score(y_test_wave_3, rf_best_params.predict_proba(X_test_wave_3)[::,1])
score15

0.7987492247291981

In [126]:
#print roc_auc score on the predicted probabilities for wave 4
score16 = roc_auc_score(y_test_wave_4, rf_best_params.predict_proba(X_test_wave_4)[::,1])
score16

0.6546163585633039

In [127]:
#print roc_auc score on the predicted probabilities for wave 5
score17 = roc_auc_score(y_test_wave_5, rf_best_params.predict_proba(X_test_wave_5)[::,1])
score17

0.4999527321545832

## Testing model trained in March-Apr 2020 (5 features) on wave data

In [128]:
#fit the model to the same data with 5 features instead for march-apr 2020
rf_best_params.fit(X_train1[['cough','fever','test_indication','gender','age_60_and_above']], y_train1)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [129]:
#print roc_auc score on the predicted probabilities for wave 3
score18 = roc_auc_score(y_test_wave_3, rf_best_params.predict_proba(X_test_wave_3[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score18

0.769252962867366

In [130]:
#print roc_auc score on the predicted probabilities for wave 4
score19 = roc_auc_score(y_test_wave_4, rf_best_params.predict_proba(X_test_wave_4[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score19

0.6380053052444942

In [131]:
#print roc_auc score on the predicted probabilities for wave 5
score20 = roc_auc_score(y_test_wave_5, rf_best_params.predict_proba(X_test_wave_5[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score20

0.497559103934278

## Refit model for each wave and test (5 features)

In [132]:
#fit the model to the same data with 5 features wav3
rf_best_params.fit(X_train_wave_3[['cough','fever','test_indication','gender','age_60_and_above']], y_train_wave_3)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [133]:
#print roc_auc score on the predicted probabilities for wave 3
score21 = roc_auc_score(y_test_wave_3, rf_best_params.predict_proba(X_test_wave_3[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score21

0.7742067250792514

In [134]:
#fit the model to the same data with 5 features wav4
rf_best_params.fit(X_train_wave4[['cough','fever','test_indication','gender','age_60_and_above']], y_train_wave_4)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [135]:
#print roc_auc score on the predicted probabilities for wave 4
score22 = roc_auc_score(y_test_wave_4, rf_best_params.predict_proba(X_test_wave_4[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score22

0.6745252579076327

In [136]:
#fit the model to the same data with 5 features wav5
rf_best_params.fit(X_train_wave5[['cough','fever','test_indication','gender','age_60_and_above']], y_train_wave_5)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [137]:
#print roc_auc score on the predicted probabilities for wave5
score23 = roc_auc_score(y_test_wave_5, rf_best_params.predict_proba(X_test_wave_5[['cough','fever','test_indication','gender','age_60_and_above']])[::,1])
score23

0.5906389811808337

## Refit model for each wave and test (8 features)

In [150]:
#fit the model to the same data with 8 features wav3
rf_best_params.fit(X_train_wave_3, y_train_wave_3)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [139]:
#print roc_auc score on the predicted probabilities for wave 3
score24 = roc_auc_score(y_test_wave_3, rf_best_params.predict_proba(X_test_wave_3)[::,1])
score24

0.8037194830386407

In [146]:
#fit the model to the same data with 8 features wav4
rf_best_params.fit(X_train_wave4, y_train_wave_4)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [141]:
#print roc_auc score on the predicted probabilities for wave 4
score25 = roc_auc_score(y_test_wave_4, rf_best_params.predict_proba(X_test_wave_4)[::,1])
score25

0.6919138743235883

In [142]:
#fit the model to the same data with 8 features wav5
rf_best_params.fit(X_train_wave5, y_train_wave_5)

RandomForestClassifier(criterion='entropy', max_depth=100, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=987)

In [143]:
#print roc_auc score on the predicted probabilities for wave 5
score26 = roc_auc_score(y_test_wave_5, rf_best_params.predict_proba(X_test_wave_5)[::,1])
score26

0.5948648062081

In [145]:
#show the variable importance in the rf model
wave5_features_df  = pd.DataFrame(rf_best_params.feature_importances_, index=X_train1.columns).sort_values(by=0, ascending=False)

In [147]:
wave5_features_df

Unnamed: 0,0
test_indication,0.298924
age_60_and_above,0.276222
cough,0.195981
head_ache,0.067062
sore_throat,0.061388
fever,0.05578
gender,0.038194
shortness_of_breath,0.006448


In [148]:
#show the variable importance in the rf model
wave4_features_df = pd.DataFrame(rf_best_params.feature_importances_, index=X_train1.columns).sort_values(by=0, ascending=False)

In [149]:
wave4_features_df

Unnamed: 0,0
test_indication,0.273066
fever,0.257491
head_ache,0.16969
cough,0.168819
sore_throat,0.056152
age_60_and_above,0.049546
shortness_of_breath,0.017001
gender,0.008234


In [151]:
#show the variable importance in the rf model
wave3_features_df = pd.DataFrame(rf_best_params.feature_importances_, index=X_train1.columns).sort_values(by=0, ascending=False)

In [152]:
wave3_features_df

Unnamed: 0,0
test_indication,0.489824
cough,0.173171
head_ache,0.160165
fever,0.110021
sore_throat,0.046989
shortness_of_breath,0.012856
age_60_and_above,0.00535
gender,0.001623


In [154]:
#show the variable importance in the rf model
wave2_features_df = pd.DataFrame(rf_best_params.feature_importances_, index=X_train1.columns).sort_values(by=0, ascending=False)

In [155]:
wave2_features_df

Unnamed: 0,0
test_indication,0.511201
head_ache,0.171347
fever,0.160565
cough,0.071358
sore_throat,0.05865
shortness_of_breath,0.020791
gender,0.003639
age_60_and_above,0.002449


In [158]:
#show the variable importance in the rf model
wave1_features_df = pd.DataFrame(rf_best_params.feature_importances_, index=X_train1.columns).sort_values(by=0, ascending=False)

In [159]:
wave1_features_df

Unnamed: 0,0
test_indication,0.453599
head_ache,0.16131
fever,0.146919
sore_throat,0.08637
cough,0.072601
shortness_of_breath,0.062703
age_60_and_above,0.008295
gender,0.008203
