In [1]:
# Import pandas
import pandas as pd

In [2]:
# Load Maryland data
data_to_load = 'maryland_data.csv'
md_df = pd.read_csv(data_to_load)

In [3]:
md_df

Unnamed: 0,res_state,age_group,sex,race
0,MD,18 to 49 years,Female,White
1,MD,18 to 49 years,Female,
2,MD,18 to 49 years,Female,Black
3,MD,0 - 17 years,,
4,MD,18 to 49 years,Male,
...,...,...,...,...
225809,MD,18 to 49 years,Male,White
225810,MD,18 to 49 years,Male,White
225811,MD,18 to 49 years,Male,White
225812,MD,18 to 49 years,Male,White


In [4]:
# Add column to identify all persons with COVID
md_df.insert(4, "has_covid", "Yes")

In [5]:
md_df

Unnamed: 0,res_state,age_group,sex,race,has_covid
0,MD,18 to 49 years,Female,White,Yes
1,MD,18 to 49 years,Female,,Yes
2,MD,18 to 49 years,Female,Black,Yes
3,MD,0 - 17 years,,,Yes
4,MD,18 to 49 years,Male,,Yes
...,...,...,...,...,...
225809,MD,18 to 49 years,Male,White,Yes
225810,MD,18 to 49 years,Male,White,Yes
225811,MD,18 to 49 years,Male,White,Yes
225812,MD,18 to 49 years,Male,White,Yes


In [6]:
# Load Maryland No Covid Data
data_to_load2 = 'md_no_covid.csv'
md_no_covid_df = pd.read_csv(data_to_load2)

In [7]:
md_no_covid_df

Unnamed: 0,res_state,age_group,sex,race,has_covid
0,MD,18 to 49 years,Female,Black,No
1,MD,18 to 49 years,Male,Black,No
2,MD,18 to 49 years,Female,Multiple/Other,No
3,MD,18 to 49 years,Female,White,No
4,MD,18 to 49 years,Female,White,No
...,...,...,...,...,...
5819861,MD,50 to 64 years,Male,Black,No
5819862,MD,18 to 49 years,Female,White,No
5819863,MD,50 to 64 years,Male,Black,No
5819864,MD,50 to 64 years,Male,Black,No


In [8]:
# Merge the COVID and non-COVID data
merged_md_df = pd.concat([md_df, md_no_covid_df])

In [9]:
merged_md_df

Unnamed: 0,res_state,age_group,sex,race,has_covid
0,MD,18 to 49 years,Female,White,Yes
1,MD,18 to 49 years,Female,,Yes
2,MD,18 to 49 years,Female,Black,Yes
3,MD,0 - 17 years,,,Yes
4,MD,18 to 49 years,Male,,Yes
...,...,...,...,...,...
5819861,MD,50 to 64 years,Male,Black,No
5819862,MD,18 to 49 years,Female,White,No
5819863,MD,50 to 64 years,Male,Black,No
5819864,MD,50 to 64 years,Male,Black,No


In [10]:
# Replace NaN values with 0
merged_md_df = merged_md_df.fillna("Not identified")
merged_md_df

Unnamed: 0,res_state,age_group,sex,race,has_covid
0,MD,18 to 49 years,Female,White,Yes
1,MD,18 to 49 years,Female,Not identified,Yes
2,MD,18 to 49 years,Female,Black,Yes
3,MD,0 - 17 years,Not identified,Not identified,Yes
4,MD,18 to 49 years,Male,Not identified,Yes
...,...,...,...,...,...
5819861,MD,50 to 64 years,Male,Black,No
5819862,MD,18 to 49 years,Female,White,No
5819863,MD,50 to 64 years,Male,Black,No
5819864,MD,50 to 64 years,Male,Black,No


In [11]:
# Use LabelEncoder to encode target variable has_covid
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
merged_md_df = merged_md_df.copy()
merged_md_df['has_covid'] = le.fit_transform(merged_md_df['has_covid'])

In [12]:
merged_md_df

Unnamed: 0,res_state,age_group,sex,race,has_covid
0,MD,18 to 49 years,Female,White,1
1,MD,18 to 49 years,Female,Not identified,1
2,MD,18 to 49 years,Female,Black,1
3,MD,0 - 17 years,Not identified,Not identified,1
4,MD,18 to 49 years,Male,Not identified,1
...,...,...,...,...,...
5819861,MD,50 to 64 years,Male,Black,0
5819862,MD,18 to 49 years,Female,White,0
5819863,MD,50 to 64 years,Male,Black,0
5819864,MD,50 to 64 years,Male,Black,0


In [13]:
# Use One Hot Encoding
md_encoded = merged_md_df.dtypes[merged_md_df.dtypes == "object"].index.tolist()

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(sparse=False)
encode_df = pd.DataFrame(enc.fit_transform(merged_md_df[md_encoded]))

encode_df.columns = enc.get_feature_names(md_encoded)
encode_df.head()

Unnamed: 0,res_state_MD,age_group_0 - 17 years,age_group_18 to 49 years,age_group_50 to 64 years,age_group_65+ years,age_group_Not identified,sex_Female,sex_Male,sex_Not identified,race_American Indian/Alaska Native,race_Asian,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_Not identified,race_White
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Merge One Hot Encoded features and drop the originals
merged_md_df = merged_md_df.merge(encode_df, left_index=True, right_index=True)
merged_md_df = merged_md_df.drop(md_encoded,1)
merged_md_df

Unnamed: 0,has_covid,res_state_MD,age_group_0 - 17 years,age_group_18 to 49 years,age_group_50 to 64 years,age_group_65+ years,age_group_Not identified,sex_Female,sex_Male,sex_Not identified,race_American Indian/Alaska Native,race_Asian,race_Black,race_Multiple/Other,race_Native Hawaiian/Other Pacific Islander,race_Not identified,race_White
0,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819861,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5819862,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5819863,0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5819864,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [15]:
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [16]:
# Split our preprocessed data into our features and target arrays
y = merged_md_df["has_covid"].ravel()
X = merged_md_df.copy()
X = X.drop("has_covid", axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 

rf_model = rf_model.fit(X_train_scaled, y_train)

In [19]:
predictions = rf_model.predict(X_test_scaled)

cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
cm_df

acc_score = accuracy_score(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1429777,24733
Actual 1,32312,24598


Accuracy Score : 0.9622573473951648
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.98      0.98   1454510
           1       0.50      0.43      0.46     56910

    accuracy                           0.96   1511420
   macro avg       0.74      0.71      0.72   1511420
weighted avg       0.96      0.96      0.96   1511420



In [20]:
importances = rf_model.feature_importances_
importances

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.7990558298089327, 'race_Not identified'),
 (0.04951710889577458, 'race_White'),
 (0.048976256284005364, 'race_Black'),
 (0.04535538143060004, 'race_Multiple/Other'),
 (0.026726347938401736, 'race_Asian'),
 (0.014607243488617487, 'sex_Not identified'),
 (0.004590002829926135, 'age_group_0 - 17 years'),
 (0.003850519389974838, 'age_group_Not identified'),
 (0.0032990396312528353, 'race_American Indian/Alaska Native'),
 (0.0024652722711143013, 'age_group_18 to 49 years'),
 (0.0005206200970419273, 'age_group_65+ years'),
 (0.0003635538586428463, 'age_group_50 to 64 years'),
 (0.0002770840795918152, 'sex_Male'),
 (0.0002308277849316861, 'race_Native Hawaiian/Other Pacific Islander'),
 (0.0001649122111915937, 'sex_Female'),
 (0.0, 'res_state_MD')]