## Covid Projection Clusters
The purpose of this notebook is to use DBSCAN clustering to explore relationships among the variables in the IHME projection data. <br><br>
The results are not meaningful: DBSCAN assigned almost all of the rows to noise (cluster label -1). A possible next step is to cluster on a few key features rather than on all of the numeric columns at once.

In [1]:
# Import libraries

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN

import seaborn as sns
import matplotlib.pyplot as plt

import datetime
import us

%matplotlib inline

In [6]:
# Load the IHME projection CSV into a DataFrame

ihme_csv_path = "../Raw_Data/Projected_Data/IHME_projections_05_04.csv"
ihme = pd.read_csv(ihme_csv_path)

# Print the (rows, columns) shape, then preview the first rows
print(ihme.shape)
ihme.head()

(29841, 38)


Unnamed: 0,V1,location_name,date,allbed_mean,allbed_lower,allbed_upper,ICUbed_mean,ICUbed_lower,ICUbed_upper,InvVen_mean,...,icuover_lower,icuover_upper,mobility_data_type,mobility_composite,total_tests_data_type,total_tests,confirmed_infections,est_infections_mean,est_infections_lower,est_infections_upper
0,19692,Abruzzo,2020-02-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
1,19693,Abruzzo,2020-02-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
2,19694,Abruzzo,2020-02-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
3,19695,Abruzzo,2020-02-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,
4,19696,Abruzzo,2020-02-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,,


In [7]:
# Edit the rows

# Make a list of all US state names as strings
US_states = [str(s) for s in us.states.STATES]

# Keep only the rows whose location is one of those states, rename the
# location column to 'state', and renumber the rows from 0.
# reset_index(drop=True) discards the old index directly, so there is no
# temporary 'index' column to remove afterwards, and nothing is mutated in place.
ihme = (
    ihme[ihme['location_name'].isin(US_states)]
    .rename(columns={'location_name': 'state'})
    .reset_index(drop=True)
)

# Show the results
print(ihme.shape)
ihme.head()

(10150, 38)


Unnamed: 0,V1,state,date,allbed_mean,allbed_lower,allbed_upper,ICUbed_mean,ICUbed_lower,ICUbed_upper,InvVen_mean,...,icuover_lower,icuover_upper,mobility_data_type,mobility_composite,total_tests_data_type,total_tests,confirmed_infections,est_infections_mean,est_infections_lower,est_infections_upper
0,6700,Alabama,2020-02-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,0.0,0.0,0.0
1,6701,Alabama,2020-02-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,0.0,0.0,0.0
2,6702,Alabama,2020-02-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,0.0,0.0,0.0
3,6703,Alabama,2020-02-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,0.0,0.0,0.0
4,6704,Alabama,2020-02-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,observed,-0.898962,,,,0.0,0.0,0.0


In [8]:
# Edit the date column

# Convert the date column from string to date-time dtype.
ihme["date"] = pd.to_datetime(ihme['date'])

# Select the window that starts on 5/4 and ends just before 8/4.
# The end bound is exclusive, so for this daily data the last day kept is 8/3.
start_date = datetime.datetime(2020, 5, 4)
end_date = datetime.datetime(2020, 8, 4)
in_window = (ihme['date'] >= start_date) & (ihme['date'] < end_date)

# Filter and renumber the rows in one step; reset_index(drop=True) throws the
# old index away instead of turning it into an 'index' column to drop later.
ihme = ihme.loc[in_window].reset_index(drop=True)

# Show the results
print(ihme.shape)
ihme.head()

(4600, 38)


Unnamed: 0,V1,state,date,allbed_mean,allbed_lower,allbed_upper,ICUbed_mean,ICUbed_lower,ICUbed_upper,InvVen_mean,...,icuover_lower,icuover_upper,mobility_data_type,mobility_composite,total_tests_data_type,total_tests,confirmed_infections,est_infections_mean,est_infections_lower,est_infections_upper
0,6790,Alabama,2020-05-04,366.86046,259.293816,632.05,113.253556,82.99875,191.5875,104.350057,...,0.0,0.0,projected,-17.011234,projected,5403.731657,,1937.894863,287.66721,6541.624112
1,6791,Alabama,2020-05-05,364.766252,247.99625,669.075,113.428261,79.645592,196.7125,104.196799,...,0.0,0.0,projected,-16.99589,projected,5503.686465,,2046.577567,275.079662,7286.58803
2,6792,Alabama,2020-05-06,360.411841,233.9975,684.0375,112.880672,75.488816,203.6125,103.299093,...,0.0,0.0,projected,-16.986032,projected,5603.641273,,2114.82947,256.460108,7899.190696
3,6793,Alabama,2020-05-07,355.381908,216.4325,692.6625,111.799034,70.892368,211.0375,101.920347,...,0.0,0.0,projected,-16.979699,projected,5703.596082,,2169.312493,236.754558,8226.274655
4,6794,Alabama,2020-05-08,349.794176,198.4125,702.025,110.644123,65.295,220.025,100.565215,...,0.0,0.0,projected,-16.975631,projected,5803.55089,,2233.148615,219.442286,8566.868639


In [9]:
# Count the missing values in every column

ihme.isnull().sum()

V1                          0
state                       0
date                        0
allbed_mean                 0
allbed_lower                0
allbed_upper                0
ICUbed_mean                 0
ICUbed_lower                0
ICUbed_upper                0
InvVen_mean                 0
InvVen_lower                0
InvVen_upper                0
deaths_mean                 0
deaths_lower                0
deaths_upper                0
admis_mean                  0
admis_lower                 0
admis_upper                 0
newICU_mean                 0
newICU_lower                0
newICU_upper                0
totdea_mean                 0
totdea_lower                0
totdea_upper                0
bedover_mean                0
bedover_lower               0
bedover_upper               0
icuover_mean                0
icuover_lower               0
icuover_upper               0
mobility_data_type          0
mobility_composite          0
total_tests_data_type       0
total_test

In [10]:
# Drop the confirmed infections column with all the missing values.
# Reassigning (instead of dropping in place) with errors='ignore' lets this
# cell be re-run without a KeyError once the column is already gone.

ihme = ihme.drop(columns=['confirmed_infections'], errors='ignore')

In [11]:
# Check for strings: list each column's dtype so that text columns
# (dtype 'object') stand out before the numeric features are selected
ihme.dtypes

V1                                int64
state                            object
date                     datetime64[ns]
allbed_mean                     float64
allbed_lower                    float64
allbed_upper                    float64
ICUbed_mean                     float64
ICUbed_lower                    float64
ICUbed_upper                    float64
InvVen_mean                     float64
InvVen_lower                    float64
InvVen_upper                    float64
deaths_mean                     float64
deaths_lower                    float64
deaths_upper                    float64
admis_mean                      float64
admis_lower                     float64
admis_upper                     float64
newICU_mean                     float64
newICU_lower                    float64
newICU_upper                    float64
totdea_mean                     float64
totdea_lower                    float64
totdea_upper                    float64
bedover_mean                    float64


In [12]:
# Build the feature matrix X from the float64 columns only
# (the object, datetime and int64 columns are left out by this filter)

X = ihme.select_dtypes(include=['float64'])
X.dtypes

allbed_mean             float64
allbed_lower            float64
allbed_upper            float64
ICUbed_mean             float64
ICUbed_lower            float64
ICUbed_upper            float64
InvVen_mean             float64
InvVen_lower            float64
InvVen_upper            float64
deaths_mean             float64
deaths_lower            float64
deaths_upper            float64
admis_mean              float64
admis_lower             float64
admis_upper             float64
newICU_mean             float64
newICU_lower            float64
newICU_upper            float64
totdea_mean             float64
totdea_lower            float64
totdea_upper            float64
bedover_mean            float64
bedover_lower           float64
bedover_upper           float64
icuover_mean            float64
icuover_lower           float64
icuover_upper           float64
mobility_composite      float64
total_tests             float64
est_infections_mean     float64
est_infections_lower    float64
est_infe

In [13]:
# Standardize the features: fit the scaler on X, then transform X with it

ss = StandardScaler().fit(X)
X_scaled = ss.transform(X)

In [14]:
# Build a DBSCAN model and fit it to the scaled features in one step.
# NOTE(review): eps=0.01 looks like a very small radius for standardized data
# with this many feature columns, which may be why nearly every row is labelled
# as noise in the cluster counts further down - consider tuning eps/min_samples.

dbscan = DBSCAN(eps=0.01, min_samples=30).fit(X_scaled)
dbscan

DBSCAN(algorithm='auto', eps=0.01, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=30, n_jobs=None, p=None)

In [15]:
# List the distinct labels DBSCAN produced: 3 clusters (plus -1 for noise)

{label for label in dbscan.labels_}

{-1, 0, 1, 2}

In [16]:
# Score the clustering with the silhouette coefficient: pretty bad!
# NOTE(review): the noise label (-1) is passed in together with the real
# cluster labels, so it looks like the noise rows are scored as if they formed
# one more cluster - confirm that this is intended.

silhouette_score(X_scaled, labels=dbscan.labels_)

-0.4441026985584479

In [17]:
# Attach each row's DBSCAN label to the data as a 'cluster' column

ihme = ihme.assign(cluster=dbscan.labels_)
ihme.head()

Unnamed: 0,V1,state,date,allbed_mean,allbed_lower,allbed_upper,ICUbed_mean,ICUbed_lower,ICUbed_upper,InvVen_mean,...,icuover_lower,icuover_upper,mobility_data_type,mobility_composite,total_tests_data_type,total_tests,est_infections_mean,est_infections_lower,est_infections_upper,cluster
0,6790,Alabama,2020-05-04,366.86046,259.293816,632.05,113.253556,82.99875,191.5875,104.350057,...,0.0,0.0,projected,-17.011234,projected,5403.731657,1937.894863,287.66721,6541.624112,-1
1,6791,Alabama,2020-05-05,364.766252,247.99625,669.075,113.428261,79.645592,196.7125,104.196799,...,0.0,0.0,projected,-16.99589,projected,5503.686465,2046.577567,275.079662,7286.58803,-1
2,6792,Alabama,2020-05-06,360.411841,233.9975,684.0375,112.880672,75.488816,203.6125,103.299093,...,0.0,0.0,projected,-16.986032,projected,5603.641273,2114.82947,256.460108,7899.190696,-1
3,6793,Alabama,2020-05-07,355.381908,216.4325,692.6625,111.799034,70.892368,211.0375,101.920347,...,0.0,0.0,projected,-16.979699,projected,5703.596082,2169.312493,236.754558,8226.274655,-1
4,6794,Alabama,2020-05-08,349.794176,198.4125,702.025,110.644123,65.295,220.025,100.565215,...,0.0,0.0,projected,-16.975631,projected,5803.55089,2233.148615,219.442286,8566.868639,-1


In [18]:
# Check counts for each cluster (rows labelled -1 are the ones DBSCAN
# treated as noise):
# It's almost all noise!

ihme['cluster'].value_counts()

-1    4339
 2      92
 0      88
 1      81
Name: cluster, dtype: int64

In [19]:
# Examine each cluster: per-cluster mean of every numeric column, transposed
# so that the clusters are the columns and the features are the rows.
# numeric_only=True keeps the text and date columns out of the mean; recent
# pandas versions raise a TypeError on them instead of silently skipping them.
# groupby already orders its result by cluster label, so no hard-coded label
# list is needed (one would raise a KeyError if the set of labels changed).

ihme.groupby('cluster').mean(numeric_only=True).T

cluster,-1,0,1,2
V1,11796.014289,12115.5,15976.0,16985.5
allbed_mean,441.390899,0.009765,0.092568,0.056724
allbed_lower,145.307908,0.000568,0.033302,0.032065
allbed_upper,1348.579817,0.068864,0.227176,0.084783
ICUbed_mean,135.29698,0.003699,0.020769,0.003268
ICUbed_lower,46.867431,0.0,0.001235,0.000543
ICUbed_upper,405.865543,0.037571,0.090756,0.006522
InvVen_mean,120.316894,0.002739,0.015351,0.002395
InvVen_lower,40.847397,0.0,0.000617,0.000543
InvVen_upper,363.675686,0.03125,0.066682,0.005435
