In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize
import seaborn as sns

In [None]:
# from sklearn import cluster
# from sklearn import metrics
# from sklearn.metrics import 
# matplotlib.style.use('ggplot') 

In [6]:
cfe = pd.read_csv('../data/covid_fire_earthquake_summary.csv')
cfe.head()

Unnamed: 0,fips,county,province_state,covid_last_update,county_latitude,county_longitudue,covid_confirmed,covid_death,covid_recoverd,covid_active,...,covid_case_fatality_ratio,county_population,covid_death_per_capita,covid_confirmed_per_capita,covid_active_cases_per_capita,fires_per_county_in_2020,active_fires_per_county,fire_score,earthquakes_per_county_in_2020,earthquakes_score
0,6001.0,Alameda,California,2020-10-17 04:24:12,37.646294,-121.892927,22408,439,0,21969.0,...,1.959122,1671329,0.000263,0.013407,0.013145,3,0,4,0,0
1,6003.0,Alpine,California,2020-10-17 04:24:12,38.596786,-119.822359,3,0,0,3.0,...,0.0,1129,0.0,0.002657,0.002657,0,0,0,0,0
2,6005.0,Amador,California,2020-10-17 04:24:12,38.445831,-120.65696,309,16,0,293.0,...,5.177994,39752,0.000402,0.007773,0.007371,2,0,2,0,0
3,6007.0,Butte,California,2020-10-17 04:24:12,39.667278,-121.600525,3000,50,0,2950.0,...,1.666667,219186,0.000228,0.013687,0.013459,9,1,11,0,0
4,6009.0,Calaveras,California,2020-10-17 04:24:12,38.205371,-120.552913,331,16,0,315.0,...,4.833837,45905,0.000349,0.007211,0.006862,3,0,3,0,0


In [7]:
cfe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   fips                            58 non-null     float64
 1   county                          58 non-null     object 
 2   province_state                  58 non-null     object 
 3   covid_last_update               58 non-null     object 
 4   county_latitude                 58 non-null     float64
 5   county_longitudue               58 non-null     float64
 6   covid_confirmed                 58 non-null     int64  
 7   covid_death                     58 non-null     int64  
 8   covid_recoverd                  58 non-null     int64  
 9   covid_active                    58 non-null     float64
 10  covid_indidence_rate            58 non-null     float64
 11  covid_case_fatality_ratio       58 non-null     float64
 12  county_population               58 non

In [8]:
cfe.columns

Index(['fips', 'county', 'province_state', 'covid_last_update',
       'county_latitude', 'county_longitudue', 'covid_confirmed',
       'covid_death', 'covid_recoverd', 'covid_active', 'covid_indidence_rate',
       'covid_case_fatality_ratio', 'county_population',
       'covid_death_per_capita', 'covid_confirmed_per_capita',
       'covid_active_cases_per_capita', 'fires_per_county_in_2020',
       'active_fires_per_county', 'fire_score',
       'earthquakes_per_county_in_2020', 'earthquakes_score'],
      dtype='object')

## \#Keeping only numeric columns

In [9]:
X = cfe.drop(columns = ['fips', 'county', 'province_state', 'covid_last_update'])

## \#Scaling

In [10]:
ss = StandardScaler()
X_sc = ss.fit_transform(X)

## \#KMeans

In [13]:
def optimize_kmeans(krange, scaled_X):
    
    max_score= -1
    
    for i in range(2,krange):
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(scaled_X)
        score = silhouette_score(scaled_X, kmeans.labels_)
        if score > max_score:
            max_score = score
            k = i    
    print(f'StandardScaler silhouette score: {max_score} for optimal k of {k}')

In [14]:
optimize_kmeans(10, X_sc)

StandardScaler silhouette score: 0.42945415594366265 for optimal k of 2


In [15]:
def kmeans_range(krange, scaled_X):
    
    for i in range(2,krange):
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(scaled_X)
        score = silhouette_score(scaled_X, kmeans.labels_)
        k=i
        print(f'StandardScaler silhouette score: {score} for optimal k of {k}')

In [17]:
kmeans_range(10, X_sc)

StandardScaler silhouette score: 0.42945415594366265 for optimal k of 2
StandardScaler silhouette score: 0.34145168294328876 for optimal k of 3
StandardScaler silhouette score: 0.2916211486466191 for optimal k of 4
StandardScaler silhouette score: 0.27796020085640616 for optimal k of 5
StandardScaler silhouette score: 0.17114562258838356 for optimal k of 6
StandardScaler silhouette score: 0.19488073716278653 for optimal k of 7
StandardScaler silhouette score: 0.21173223514611286 for optimal k of 8
StandardScaler silhouette score: 0.19127984137207238 for optimal k of 9


## \#DBSCAN

In [21]:
def DBSCAN_optimize(X_scaled, min_sample_range): 
   
    max_score=-1     

    for eps in np.linspace(.2, 5, 50):
        for minsamples in range(2, min_sample_range):
            
            dbscan = DBSCAN(eps=eps, min_samples=minsamples)
            dbscan.fit(X_scaled)
            
            if len(set(dbscan.labels_)) > 1:
               
                score = silhouette_score(X_scaled, dbscan.labels_)
                nclusters = len(set(dbscan.labels_))
                
                if score > max_score:
                    max_score = score
                    best_eps = eps
                    best_minsamples = minsamples
                    best_clusters = nclusters
    
    print(f'Opitimal silhoute score: {round(max_score, 2)} with eps of: {round(best_eps, 2)}')
    print(f'min_samples of: {best_minsamples} and {best_clusters} clusters.')
    return 

In [22]:
DBSCAN_optimize(X_sc, 10)

Opitimal silhoute score: 0.61 with eps of: 4.71
min_samples of: 2 and 2 clusters.
