In [113]:
#import packages
import csv
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
import folium
import matplotlib.pyplot as plt 

In [114]:
#import dataset
!wget -O UnitedStatesCOVID.csv https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv

--2020-07-23 19:36:48--  https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.124.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.124.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1959731 (1.9M) [text/plain]
Saving to: ‘UnitedStatesCOVID.csv’


2020-07-23 19:36:48 (5.58 MB/s) - ‘UnitedStatesCOVID.csv’ saved [1959731/1959731]



In [115]:
#data cleaning and recoding
COVID = pd.read_csv('UnitedStatesCOVID.csv')
COVID['Infected_Week_1'] = COVID[['3/30/20', '3/31/20', '4/1/20', '4/2/20', '4/3/20', '4/4/20', '4/5/20']].sum(axis=1, skipna=True).astype(float)
COVID['Infected_Week_2'] = COVID[['7/13/20', '7/14/20', '7/15/20', '7/16/20', '7/17/20', '7/18/20', '7/19/20']].sum(axis=1, skipna=True).astype(float)
COVID = (COVID[COVID[['Lat','Long_']].notnull().all(1)])
COVID = COVID.drop(COVID[(COVID['Lat'] == 0) & (COVID['Long_'] == 0)].index)
COVID = COVID.drop(COVID[(COVID['Province_State'] == 'Diamond Princess') | (COVID['Province_State'] == 'Grand Princess')].index)
COVID = COVID.reset_index(drop=True)
COVIDClust = COVID[['Province_State', 'Country_Region', 'Combined_Key', 'Lat', 'Long_','Infected_Week_1', 'Infected_Week_2']]
COVIDClust.head()
COVIDClust.shape

(3232, 7)

In [172]:
#Only account for centers with over 500 cases in the week
COVIDWeek_1 = COVIDClust[(COVIDClust['Infected_Week_1'] > 500)]
COVIDWeek_1 = COVIDWeek_1.reset_index(drop=True)

#Data pre-processing 
COVID_Week_1_DB = StandardScaler().fit_transform(COVIDWeek_1[['Lat','Long_']])

#Compute DBSCAN and create labels 
DB1 = DBSCAN(eps=0.15, min_samples=5).fit(COVID_Week_1_DB)
Labels_1 = DB1.labels_ 

UniqueLabels_1 = np.unique(Labels_1) + 1
COVIDWeek_1["Clus_Db1"]=Labels_1 + 1
print(UniqueLabels_2)

[ 0  1  2  3  4  5  6  7  8  9 10]


In [94]:
#Summary statstic for each cluster
#Clus_Db1 denotes outliers 
COVIDWeek_1_Summary = pd.DataFrame(COVIDWeek_1.groupby('Clus_Db1')['Province_State'].apply(lambda x: x.value_counts().index[0]).reset_index())
COVIDWeek_1_Summary['Total_Infected'] = COVIDWeek_1.groupby(['Clus_Db1'])['Infected_Week_1'].sum()
COVIDWeek_1_Summary['Average_Lat'] = COVIDWeek_1.groupby(['Clus_Db1'])['Lat'].mean()
COVIDWeek_1_Summary['Average_Long'] = COVIDWeek_1.groupby(['Clus_Db1'])['Long_'].mean()
COVIDWeek_1_Summary

Unnamed: 0,Clus_Db1,Province_State,Total_Infected,Average_Lat,Average_Long
0,0,Missouri,105165.0,37.398989,-95.635577
1,1,Georgia,25683.0,33.899059,-84.144156
2,2,California,23857.0,37.888611,-121.716242
3,3,California,38102.0,34.668547,-118.630272
4,4,Colorado,21973.0,39.797676,-105.015468
5,5,New York,1121562.0,41.265104,-77.100797
6,6,Florida,53051.0,27.647716,-81.567934
7,7,Indiana,16509.0,39.713088,-85.393127
8,8,Louisiana,65917.0,30.041266,-92.228488
9,9,Tennessee,9037.0,36.074062,-86.099106


In [117]:
#Split dataframe based on clusters 
for i, in zip(UniqueLabels_1):
    if i != 0:
        locals()['COVIDg{}'.format(i)] = COVIDWeek_1[(COVIDWeek_1['Clus_Db1']==i)].reset_index(drop=True)
    else:
        locals()['COVIDoutlier'] = COVIDWeek_1[(COVIDWeek_1['Clus_Db1']==i)].reset_index(drop=True)

In [118]:
#Create graphic (map of COVID case in the US)
colors = ['cadetblue', 'mediumorchid', 'midnightblue', 'mediumspringgreen', 'darkslategray', 'indigo', 'royalblue',
          'seagreen', 'mediumturquoise', 'darkmagenta', 'darkslateblue', 'green']
array = [COVIDoutlier, COVIDg1, COVIDg2, COVIDg3, COVIDg4, COVIDg5, COVIDg6, COVIDg7, COVIDg8, COVIDg9, COVIDg10, COVIDg11]

basemap1 = folium.Map(location=[37, -95], zoom_start=4)
for x, col in zip(array, colors):
    for i in range(0,len(x)):
        folium.Circle(
        location=[x.iloc[i]['Lat'], x.iloc[i]['Long_']],
        radius=x.iloc[i]['Infected_Week_1']*15 if x.iloc[i]['Infected_Week_1'] < 10000 else x.iloc[i]['Infected_Week_1'],
        color=col,
        fill=True,
        fill_color=col).add_to(basemap1)
basemap1

In [173]:
#Only account for centers with over 500 cases in the week
COVIDWeek_2 = COVIDClust[(COVIDClust['Infected_Week_2'] > 500)]
COVIDWeek_2 = COVIDWeek_2.reset_index(drop=True)

#Data pre-processing 
COVID_Week_2_DB = StandardScaler().fit_transform(COVIDWeek_2[['Lat','Long_']])

# Compute DBSCAN and create labels
DB2 = DBSCAN(eps=0.15, min_samples=5).fit(COVID_Week_2_DB)
Labels_2 = DB2.labels_

UniqueLabels_2 = np.unique(Labels_2) + 1
COVIDWeek_2["Clus_Db2"]=Labels_2 + 1
print(UniqueLabels_2)

[ 0  1  2  3  4  5  6  7  8  9 10]


In [163]:
#Summary statstic for each cluster
#Clus_Db1 denotes outliers 
COVIDWeek_2_Summary = pd.DataFrame(COVIDWeek_2.groupby('Clus_Db2')['Province_State'].apply(lambda x: x.value_counts().index[0]).reset_index())
COVIDWeek_2_Summary['Total_Infected'] = COVIDWeek_2.groupby(['Clus_Db2'])['Infected_Week_2'].sum()
COVIDWeek_2_Summary['Average_Lat'] = COVIDWeek_2.groupby(['Clus_Db2'])['Lat'].mean()
COVIDWeek_2_Summary['Average_Long'] = COVIDWeek_2.groupby(['Clus_Db2'])['Long_'].mean()
COVIDWeek_2_Summary

Unnamed: 0,Clus_Db2,Province_State,Total_Infected,Average_Lat,Average_Long
0,0,Arizona,1940708.0,40.167088,-108.57849
1,1,Puerto Rico,62127.0,18.256803,-66.361651
2,2,Texas,19560954.0,36.701771,-86.412862
3,3,Arizona,734338.0,33.310603,-112.507542
4,4,California,641931.0,38.187896,-121.279854
5,5,California,597696.0,33.379695,-116.465349
6,6,Colorado,265697.0,39.417716,-105.761098
7,7,Idaho,98641.0,43.18957,-114.434097
8,8,New Mexico,82179.0,35.420102,-107.346313
9,9,Washington,392293.0,46.181099,-121.202705


In [164]:
#Split dataframe based on clusters 
for i, in zip(UniqueLabels_2):
    if i != 0:
        locals()['COVID2g{}'.format(i)] = COVIDWeek_2[(COVIDWeek_2['Clus_Db2']==i)].reset_index(drop=True)
    else:
        locals()['COVID2outlier'] = COVIDWeek_2[(COVIDWeek_2['Clus_Db2']==i)].reset_index(drop=True)

In [171]:
#Create graphic (map of COVID case in the US)
colors = ['mediumorchid', 'midnightblue', 'cadetblue', 'mediumspringgreen', 'green', 'indigo', 'royalblue',
          'seagreen', 'mediumturquoise', 'darkmagenta', 'darkslateblue']
array = [COVID2outlier, COVID2g1, COVID2g2, COVID2g3, COVID2g4, COVID2g5, COVID2g6, COVIDg7, COVID2g8, COVID2g9, COVID2g10]

basemap2 = folium.Map(location=[37, -95], zoom_start=4)
for x, col in zip(array, colors):
    for i in range(0,len(x)):
        folium.Circle(
        location=[x.iloc[i]['Lat'], x.iloc[i]['Long_']],
        radius=x.iloc[i]['Infected_Week_2'],
        color=col,
        fill=True,
        fill_color=col).add_to(basemap2)
basemap2