In [174]:
#import packages
import csv
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import sklearn.utils
from sklearn.preprocessing import StandardScaler
import folium
from folium.features import DivIcon
import matplotlib.pyplot as plt 
import requests

In [175]:
#import dataset
# Download time_series_covid19_confirmed_US.csv from the CSSEGISandData/COVID-19
# GitHub repository and save it as UnitedStatesCOVID.csv in the working directory
# (-O sets the output file name; an existing file of that name is overwritten).
# The URL points at the repository's master branch, so the file fetched is
# whatever that branch holds at the moment this cell is run.
!wget -O UnitedStatesCOVID.csv https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv

--2020-08-02 08:12:40--  https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.124.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.124.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2082530 (2.0M) [text/plain]
Saving to: ‘UnitedStatesCOVID.csv’


2020-08-02 08:12:40 (4.68 MB/s) - ‘UnitedStatesCOVID.csv’ saved [2082530/2082530]



In [176]:
#data cleaning and recoding
# NOTE(review): each weekly figure below is the row-wise sum of seven daily
# columns. If those columns hold running (cumulative) totals, this sum is not
# the number of new cases in that week -- confirm against the data source.
COVID = pd.read_csv('UnitedStatesCOVID.csv')

week_1_days = ['3/30/20', '3/31/20', '4/1/20', '4/2/20', '4/3/20', '4/4/20', '4/5/20']
week_2_days = ['7/13/20', '7/14/20', '7/15/20', '7/16/20', '7/17/20', '7/18/20', '7/19/20']
COVID['Infected_Week_1'] = COVID[week_1_days].sum(axis=1, skipna=True).astype(float)
COVID['Infected_Week_2'] = COVID[week_2_days].sum(axis=1, skipna=True).astype(float)

# Keep only rows that have both coordinates, are not located at (0, 0), and are
# not the 'Diamond Princess' / 'Grand Princess' entries.
has_coordinates = COVID[['Lat', 'Long_']].notnull().all(axis=1)
at_origin = (COVID['Lat'] == 0) & (COVID['Long_'] == 0)
is_princess_ship = COVID['Province_State'].isin(['Diamond Princess', 'Grand Princess'])
COVID = COVID[has_coordinates & ~at_origin & ~is_princess_ship].reset_index(drop=True)

# Columns carried forward into the clustering steps.
clustering_columns = ['Province_State', 'Country_Region', 'Combined_Key',
                      'Lat', 'Long_', 'Infected_Week_1', 'Infected_Week_2']
COVIDClust = COVID[clustering_columns]
COVIDClust.shape

(3232, 7)

In [177]:
#Only account for centers with over 500 cases in the week of March 30th to April 5th
COVIDWeek_1 = COVIDClust[(COVIDClust['Infected_Week_1'] > 500)]
COVIDWeek_1 = COVIDWeek_1.reset_index(drop=True)

#Data pre-processing: standardize latitude and longitude before clustering
# NOTE(review): distances are computed on standardized degrees, not on
# geographic distance -- confirm this is the intended notion of "nearby".
COVID_Week_1_DB = StandardScaler().fit_transform(COVIDWeek_1[['Lat','Long_']])

#Compute DBSCAN and create labels
DB1 = DBSCAN(eps=0.15, min_samples=5).fit(COVID_Week_1_DB)
Labels_1 = DB1.labels_

# DBSCAN labels noise points -1; shifting every label by +1 makes 0 the
# outlier group and 1..n the clusters.
UniqueLabels_1 = np.unique(Labels_1) + 1
# The closing quote of the column name was missing here, which made the whole
# cell a SyntaxError.
COVIDWeek_1['Clust_Db1'] = Labels_1 + 1
print(UniqueLabels_1)

[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [180]:
#Summary statistics for each cluster
#Clust_Db1 == 0 denotes outliers
# All statistics come from a single groupby so every value is tied to its own
# cluster label. Assigning separately grouped Series onto a frame with a fresh
# 0..n-1 index only lines up when the labels happen to be exactly 0..n-1.
COVIDWeek_1_Summary = (
    COVIDWeek_1.groupby('Clust_Db1')
    .agg(
        # State contributing the most centers to the cluster.
        Most_Centers_State=('Province_State', lambda states: states.value_counts().index[0]),
        Total_Infected=('Infected_Week_1', 'sum'),
        # Unweighted mean position of the cluster's centers.
        Average_Lat=('Lat', 'mean'),
        Average_Long=('Long_', 'mean'),
    )
    .reset_index()
)

In [181]:
#Use Google API to reverse geocode and find city of DB SCAN cluster centers
def getAddress(lat, long, api_key=None, timeout=10):
    """Reverse-geocode a coordinate pair with the Google Geocoding API.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude of the point to look up.
    api_key : str, optional
        Google API key. When omitted, the GOOGLE_MAPS_API_KEY environment
        variable is used so the key does not have to live in the notebook.
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    str or None
        The 'formatted_address' of the first result, or None when the service
        reports no result for the point.

    Raises
    ------
    RuntimeError
        If no API key is available, or the service answers with a status other
        than 'OK' / 'ZERO_RESULTS' (for example a rejected key).
    """
    import os  # local import keeps this cell runnable without touching the import cell

    key = api_key if api_key is not None else os.environ.get('GOOGLE_MAPS_API_KEY')
    if not key:
        raise RuntimeError('No Google API key: pass api_key or set GOOGLE_MAPS_API_KEY')

    r = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        params={'latlng': f'{lat},{long}', 'key': key},
        timeout=timeout,
    )
    r.raise_for_status()
    payload = r.json()

    status = payload.get('status')
    if status not in ('OK', 'ZERO_RESULTS'):
        # Surface the service's own explanation instead of an opaque IndexError.
        raise RuntimeError(f"Geocoding failed with status {status!r}: {payload.get('error_message', '')}")

    results = payload.get('results') or []
    if not results:
        return None
    return results[0]['formatted_address']

# Reverse-geocode every cluster center, in the same order as the summary rows.
addresses = [
    getAddress(center_lat, center_long)
    for center_lat, center_long in zip(COVIDWeek_1_Summary['Average_Lat'],
                                       COVIDWeek_1_Summary['Average_Long'])
]
addresses

['Scott Rd, Neodesha, KS 66757, USA',
 'Unnamed Road, Lilburn, GA 30047, USA',
 '21707 Marsh Creek Rd, Brentwood, CA 94513, USA',
 'Unnamed Road, Lake Hughes, CA 93532, USA',
 '2323 W Moffat Pl, Denver, CO 80221, USA',
 '161 Danneker Dr, Williamsport, PA 17701, USA',
 'W County Line Rd, Fort Meade, FL 33841, USA',
 '2525 E 700 N, Rushville, IN 46173, USA',
 '12469 Pershing Rd, Kaplan, LA 70548, USA',
 'Unnamed Road, Watertown, TN 37184, USA',
 '259 Alexander Ln, Royse City, TX 75189, USA',
 '4621 131st Ave SE, Snohomish, WA 98290, USA']

In [182]:
#Add cities to COVIDWeek_1_Summary
# Hand-shortened "City, ST" labels taken from the reverse-geocoded addresses,
# one per summary row and in the same order. The list length must equal the
# number of summary rows or the assignment raises.
# Spelling corrected to match the geocoder output: 'Lake Hughes', 'Fort Meade'.
copy_address_truncate = ['Neodesha, KS', 'Lilburn, GA', 'Brentwood, CA', 'Lake Hughes, CA', 'Denver, CO', 'Williamsport, PA', 'Fort Meade, FL', 'Rushville, IN', 'Kaplan, LA', 'Watertown, TN', 'Royse City, TX', 'Snohomish, WA']
COVIDWeek_1_Summary['Center_City'] = copy_address_truncate
COVIDWeek_1_Summary

Unnamed: 0,Clust_Db1,Most_Centers_State,Total_Infected,Average_Lat,Average_Long,Center_City
0,0,Missouri,105165.0,37.398989,-95.635577,"Neodesha, KS"
1,1,Georgia,25683.0,33.899059,-84.144156,"Lilburn, GA"
2,2,California,23857.0,37.888611,-121.716242,"Brentwood, CA"
3,3,California,38102.0,34.668547,-118.630272,"Lake Hudhes, CA"
4,4,Colorado,21973.0,39.797676,-105.015468,"Denver, CO"
5,5,New York,1121562.0,41.265104,-77.100797,"Williamsport, PA"
6,6,Florida,53051.0,27.647716,-81.567934,"FortMeade, FL"
7,7,Indiana,16509.0,39.713088,-85.393127,"Rushville, IN"
8,8,Louisiana,65917.0,30.041266,-92.228488,"Kaplan, LA"
9,9,Tennessee,9037.0,36.074062,-86.099106,"Watertown, TN"


In [183]:
#Split dataframe based on clusters
# Creates one module-level frame per label: COVIDoutlier for label 0 and
# COVIDg<label> for every other label. globals() is used because writing
# through locals() is not a supported way to create variables.
for i in UniqueLabels_1:
    frame_name = 'COVIDg{}'.format(i) if i != 0 else 'COVIDoutlier'
    globals()[frame_name] = COVIDWeek_1[COVIDWeek_1['Clust_Db1'] == i].reset_index(drop=True)

In [184]:
#Create graphic (map of COVID case in the US during the week of March 30th to April 5th
# One colour, one frame and one label per cluster, all in label order
# (outliers first).
colors = ['plum', 'darkorchid', 'deepskyblue', 'mediumspringgreen', 'aqua', 'cadetblue', 'cornflowerblue',
          'mediumslateblue', 'mediumturquoise', 'darkmagenta', 'lightskyblue', 'mediumseagreen']
array = [COVIDoutlier, COVIDg1, COVIDg2, COVIDg3, COVIDg4, COVIDg5, COVIDg6, COVIDg7, COVIDg8, COVIDg9, COVIDg10, COVIDg11]
text = ['Outliers', 'Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6', 'Cluster7', 'Cluster8', 'Cluster9', 'Cluster10', 'Cluster11']

basemap1 = folium.Map(location=[37, -95], zoom_start=4)

# One circle per center, coloured by the cluster it belongs to.
for cluster_frame, cluster_color in zip(array, colors):
    for lat, lng, infected in zip(cluster_frame['Lat'], cluster_frame['Long_'], cluster_frame['Infected_Week_1']):
        # Counts below 10000 are scaled up by 15; larger counts are used as-is.
        if infected < 10000:
            circle_radius = infected * 15
        else:
            circle_radius = infected
        folium.Circle(
            location=[lat, lng],
            radius=circle_radius,
            color=cluster_color,
            fill=True,
            fill_color=cluster_color,
        ).add_to(basemap1)

# One text label per summary row, placed at the cluster's mean position.
for (_, summary_row), label in zip(COVIDWeek_1_Summary.iterrows(), text):
    folium.Marker(
        location=[summary_row['Average_Lat'], summary_row['Average_Long']],
        icon=folium.DivIcon(html=f"""<div style="font-family: lucida bright; font-weight: bold; font-size: 100; color: {'black'}">{label}</div>"""),
    ).add_to(basemap1)

basemap1

In [210]:
#Only account for centers with over 500 cases in the week of July 13th to July 19th
above_threshold = COVIDClust['Infected_Week_2'] > 500
COVIDWeek_2 = COVIDClust[above_threshold].reset_index(drop=True)

#Data pre-processing: standardize latitude and longitude before clustering
coordinate_scaler = StandardScaler()
COVID_Week_2_DB = coordinate_scaler.fit_transform(COVIDWeek_2[['Lat','Long_']])

# Compute DBSCAN and create labels
DB2 = DBSCAN(eps=0.15, min_samples=5)
DB2.fit(COVID_Week_2_DB)
Labels_2 = DB2.labels_

# Every label is shifted by +1 before being stored and printed.
COVIDWeek_2["Clust_Db2"] = Labels_2 + 1
UniqueLabels_2 = np.unique(Labels_2) + 1
print(UniqueLabels_2)

[ 0  1  2  3  4  5  6  7  8  9 10]


In [211]:
#Summary statistics for each cluster
#Clust_Db2 == 0 denotes outliers
# All statistics come from a single groupby so every value is tied to its own
# cluster label. Assigning separately grouped Series onto a frame with a fresh
# 0..n-1 index only lines up when the labels happen to be exactly 0..n-1.
COVIDWeek_2_Summary = (
    COVIDWeek_2.groupby('Clust_Db2')
    .agg(
        # State contributing the most centers to the cluster.
        Most_Centers_State=('Province_State', lambda states: states.value_counts().index[0]),
        Total_Infected=('Infected_Week_2', 'sum'),
        # Unweighted mean position of the cluster's centers.
        Average_Lat=('Lat', 'mean'),
        Average_Long=('Long_', 'mean'),
    )
    .reset_index()
)

In [212]:
#Use Google API to reverse geocode and find city of DB SCAN cluster centers
# NOTE: a function named getAddress is also defined in an earlier cell; running
# this cell replaces that definition for the rest of the session.
def getAddress(lat, long, api_key=None, timeout=10):
    """Reverse-geocode a coordinate pair with the Google Geocoding API.

    Parameters
    ----------
    lat, long : float
        Latitude and longitude of the point to look up.
    api_key : str, optional
        Google API key. When omitted, the GOOGLE_MAPS_API_KEY environment
        variable is used so the key does not have to live in the notebook.
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    str or None
        The 'formatted_address' of the first result, or None when the service
        reports no result for the point.

    Raises
    ------
    RuntimeError
        If no API key is available, or the service answers with a status other
        than 'OK' / 'ZERO_RESULTS' (for example a rejected key).
    """
    import os  # local import keeps this cell runnable without touching the import cell

    key = api_key if api_key is not None else os.environ.get('GOOGLE_MAPS_API_KEY')
    if not key:
        raise RuntimeError('No Google API key: pass api_key or set GOOGLE_MAPS_API_KEY')

    r = requests.get(
        'https://maps.googleapis.com/maps/api/geocode/json',
        params={'latlng': f'{lat},{long}', 'key': key},
        timeout=timeout,
    )
    r.raise_for_status()
    payload = r.json()

    status = payload.get('status')
    if status not in ('OK', 'ZERO_RESULTS'):
        # Surface the service's own explanation instead of an opaque IndexError.
        raise RuntimeError(f"Geocoding failed with status {status!r}: {payload.get('error_message', '')}")

    results = payload.get('results') or []
    if not results:
        return None
    return results[0]['formatted_address']

# Reverse-geocode every cluster center, in the same order as the summary rows.
addresses = [
    getAddress(center_lat, center_long)
    for center_lat, center_long in zip(COVIDWeek_2_Summary['Average_Lat'],
                                       COVIDWeek_2_Summary['Average_Long'])
]
addresses

['Unnamed Road, Meeker, CO 81641, USA',
 'Unnamed Road, Orocovis, 00720, Puerto Rico',
 '13075 Franklin Rd, Franklin, KY 42134, USA',
 '14950 S Airport Rd, Buckeye, AZ 85326, USA',
 '2900 E Peltier Rd, Acampo, CA 95220, USA',
 'San Diego County, CA, USA',
 'US Hwy 285, Jefferson, CO 80456, USA',
 'Unnamed Road, Shoshone, ID 83352, USA',
 'Forest Service Rd 194, New Mexico, USA',
 'Unnamed Road, Washington, USA',
 'Forest Rd 900, Kamas, UT 84036, USA']

In [213]:
#Add cities to COVIDWeek_2_Summary
# Hand-shortened place labels taken from the reverse-geocoded addresses, one
# per summary row and in the same order. The list length must equal the number
# of summary rows or the assignment raises.
# Spelling corrected: 'Meeker, CO' (state code case) and 'Franklin, KY'.
copy_address_truncate = ['Meeker, CO', 'Orocovis, Puerto Rico', 'Franklin, KY', 'Buckeye, AZ', 'Acampo, CA', 'San Diego, CA', 'Jefferson, CO', 'Shoshone, ID', 'NM (no city)', 'WA (no city)', 'Kamas, UT']
COVIDWeek_2_Summary['Center_City'] = copy_address_truncate
COVIDWeek_2_Summary

Unnamed: 0,Clust_Db2,Most_Centers_State,Total_Infected,Average_Lat,Average_Long,Center_City
0,0,Arizona,1940708.0,40.167088,-108.57849,"Meeker, Co"
1,1,Puerto Rico,62127.0,18.256803,-66.361651,"Orocovis, Puerto Rico"
2,2,Texas,19560954.0,36.701771,-86.412862,"Frankin, KY"
3,3,Arizona,734338.0,33.310603,-112.507542,"Buckeye, AZ"
4,4,California,641931.0,38.187896,-121.279854,"Acampo, CA"
5,5,California,597696.0,33.379695,-116.465349,"San Diego, CA"
6,6,Colorado,265697.0,39.417716,-105.761098,"Jefferson, CO"
7,7,Idaho,98641.0,43.18957,-114.434097,"Shoshone, ID"
8,8,New Mexico,82179.0,35.420102,-107.346313,NM (no city)
9,9,Washington,392293.0,46.181099,-121.202705,WA (no city)


In [214]:
#Split dataframe based on clusters
# Creates one module-level frame per label: COVID2outlier for label 0 and
# COVID2g<label> for every other label. globals() is used because writing
# through locals() is not a supported way to create variables.
for i in UniqueLabels_2:
    frame_name = 'COVID2g{}'.format(i) if i != 0 else 'COVID2outlier'
    globals()[frame_name] = COVIDWeek_2[COVIDWeek_2['Clust_Db2'] == i].reset_index(drop=True)

In [215]:
#Create graphic (map of COVID case in the US during the week of July 13th to July 19th )
# One colour, one frame and one label per cluster, all in label order
# (outliers first).
colors2 = ['plum', 'darkorchid', 'cadetblue', 'mediumspringgreen', 'aqua', 'mediumseagreen', 'cornflowerblue',
          'mediumslateblue', 'mediumturquoise', 'darkmagenta', 'lightskyblue']
array2 = [COVID2outlier, COVID2g1, COVID2g2, COVID2g3, COVID2g4, COVID2g5, COVID2g6, COVID2g7, COVID2g8, COVID2g9, COVID2g10]
text2 = ['Outliers', 'Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5', 'Cluster6', 'Cluster7', 'Cluster8', 'Cluster9', 'Cluster10']

basemap2 = folium.Map(location=[37, -95], zoom_start=4)

# One circle per center, coloured by cluster; the radius is the weekly figure.
for x, col in zip(array2, colors2):
    for lat, lng, infected in zip(x['Lat'], x['Long_'], x['Infected_Week_2']):
        folium.Circle(
            location=[lat, lng],
            radius=infected,
            color=col,
            fill=True,
            fill_color=col,
        ).add_to(basemap2)

# The 'Outliers' label is pinned to a fixed point instead of the outliers'
# mean position; every other label sits at its cluster's mean position.
outlier_label_location = [46.8797, -110.3626]

# Labels are paired with text2. The loop used to be sized from `text`, the
# week-1 label list, so it depended on another cell's variable and only worked
# because that list happened to be longer.
for (_, summary_row), label in zip(COVIDWeek_2_Summary.iterrows(), text2):
    if summary_row['Clust_Db2'] != 0:
        label_location = [summary_row['Average_Lat'], summary_row['Average_Long']]
    else:
        label_location = outlier_label_location
    folium.Marker(
        location=label_location,
        icon=folium.DivIcon(html=f"""<div style="font-family: lucida bright; font-weight: bold; font-size: 100; color: {'black'}">{label}</div>"""),
    ).add_to(basemap2)
basemap2