In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import Counter

In [None]:
# Read the cleaned dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data_cleaned.csv')

In [None]:
# Build one row per Entity: summed daily counts plus the entity's population.
# A single groupby keeps every column aligned on the same Entity key, instead of
# attaching Population from a second, independently computed groupby that only
# lines up through implicit positional index alignment.
clusteringDf = (
    df.groupby('Entity')
    .agg({
        'Daily tests': 'sum',
        'Daily cases': 'sum',
        'Daily deaths': 'sum',
        # 'first' takes the first non-null Population value found for each entity.
        'Population': 'first',
    })
    .reset_index()
)

# Derived rates, all expressed as percentages.
# Share of tests that came back as a case.
clusteringDf['positivity_rate'] = (clusteringDf['Daily cases'] / clusteringDf['Daily tests']) * 100
# Deaths relative to recorded cases.
clusteringDf['death_rate'] = (clusteringDf['Daily deaths'] / clusteringDf['Daily cases']) * 100
# NOTE(review): this column is computed from *cases* per population, not deaths per
# population, so the name 'mortality_rate' may be misleading - confirm the intent.
# The computation is left unchanged because downstream cells cluster on it.
clusteringDf['mortality_rate'] = (clusteringDf['Daily cases'] / clusteringDf['Population']) * 100
clusteringDf

In [None]:
# Elbow analysis: fit K-Means for a range of cluster counts and plot the
# inertia (sum of squared distances to the closest centroid) against k.
elbowFeatures = clusteringDf[['positivity_rate', 'death_rate', 'mortality_rate']]

# KMeans raises a ValueError when n_clusters exceeds the number of samples, so the
# sweep is capped at the number of rows; with enough rows it still runs k = 2..99.
clusterCounts = range(2, min(99, len(elbowFeatures)) + 1)

distortion = []
for k in clusterCounts:
    # A fixed random_state makes the curve reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(elbowFeatures)
    distortion.append(kmeans.inertia_)

# Reuse the same range for the x axis so it can never drift from the fitted ks.
plt.plot(clusterCounts, distortion, marker='x')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.show()

In [None]:
# Cluster the entities with K-Means for k = 2..5. For every k: draw a 3D scatter of
# the three rate features coloured by cluster, then show min/mean/max summary
# statistics for the entities in each cluster.

# These options apply to the whole kernel session, not only to this cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

featureColumns = ['positivity_rate', 'death_rate', 'mortality_rate']
dailyColumns = ['Daily tests', 'Daily cases', 'Daily deaths']
# One colour per cluster label ('magenda' was a typo and is not a valid matplotlib colour).
colors = ['red', 'green', 'blue', 'yellow', 'black', 'magenta']

# Copy once so that adding 'Graph_Color' never writes into a view of clusteringDf.
resultDf = clusteringDf[featureColumns].copy()

# Per-entity descriptive statistics do not depend on k or on the cluster, so they are
# computed once here instead of once per cluster of every k.
entityStats = df.groupby('Entity').describe().loc[:, (slice(None), ['min', 'max', 'mean'])]
# Mean of every remaining described column (count columns are reported separately below).
otherColumnMeans = (
    entityStats
    .drop(['Cases', 'Deaths'] + dailyColumns, axis=1, level=0)
    .xs('mean', level=1, axis=1)
)
# min / max / mean of the daily count columns.
dailyStats = entityStats[dailyColumns]
# Rate features indexed by Entity so they align with the describe() output.
entityRates = clusteringDf.set_index('Entity')[featureColumns]

# KMeans cannot fit more clusters than samples, so cap the upper bound (normally 5).
for k in range(2, min(5, len(resultDf)) + 1):
    # A fixed random_state makes the clustering reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(clusteringDf[featureColumns])
    centroids = kmeans.cluster_centers_
    cluster_count = Counter(labels)

    resultDf['Graph_Color'] = [colors[clusterId] for clusterId in labels]

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.scatter(resultDf['positivity_rate'], resultDf['death_rate'], resultDf['mortality_rate'], c=resultDf['Graph_Color'])
    # Mark the cluster centres on top of the data points.
    ax.scatter(centroids[:, 0], centroids[:, 1], centroids[:, 2], marker='x', s=200, linewidths=1, color='black')

    # Add labels to the plot
    ax.set_xlabel('Positivity rate')
    ax.set_ylabel('Death rate')
    ax.set_zlabel('Mortality rate')
    ax.set_title(f'{k} clusters with K-Means')

    # Show the plot, then release the figure so repeated iterations do not accumulate figures.
    plt.show()
    plt.close(fig)

    for i in range(k):
        print('cluster', i+1, 'with:', cluster_count[i], 'entities')
        # Positional mask: relies on clusteringDf and the describe() output both being
        # ordered by Entity, which holds because both come from groupby('Entity').
        inCluster = labels == i
        displayDataframe = pd.concat([otherColumnMeans.loc[inCluster], dailyStats.loc[inCluster]], axis=1)
        displayDataframe = pd.concat([displayDataframe, entityRates.loc[inCluster]], axis=1)
        # display(displayDataframe) # if we want to display statistics for each country in the cluster
        display(displayDataframe.describe().loc[['min', 'mean', 'max']])