## Heart disease clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
def load_heart_disease():
    '''Load and pre-process the UCI Hungarian heart disease data.

    If the file processed.hungarian.data is not present in the current
    working directory, it is downloaded from
    https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data
    and stored under that name, so later calls read the local copy.

    Missing values (written as '?' in the file) are parsed as NaN. Only the
    columns age, trestbps, chol and thalach are kept, and every row with a
    missing value in any of them is dropped. The row index is not reset,
    so it may contain gaps where rows were dropped.

    return: data(DataFrame)

    raises: requests.HTTPError if the download answers with an error status;
            nothing is written to disk in that case.
    '''

    import os

    file_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data'
    file_name = file_url.split('/')[-1]

    if not os.path.isfile(file_name):
        # Imported lazily: requests is only needed when there is no local copy.
        import requests

        print('Downloading from {}'.format(file_url))
        # A timeout keeps the notebook from hanging on an unresponsive server.
        r = requests.get(file_url, timeout=30)
        # Fail loudly instead of caching an HTML error page as if it were data.
        r.raise_for_status()
        with open(file_name, 'wb') as output_file:
            output_file.write(r.content)

    data = pd.read_csv(file_name,
                       na_values='?',
                       names=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
                              'restecg', 'thalach', 'exang', 'oldpeak', 'slope',
                              'ca', 'thal', 'num'])

    # Keep the four features used for clustering and drop incomplete rows.
    data = data[['age', 'trestbps', 'chol', 'thalach']].dropna()

    return data

In [None]:
# Load the four selected features and print the frame summary
# (column dtypes and non-null counts).
df = load_heart_disease()
df.info()

In [None]:
# Summary statistics for each feature.
df.describe()

In [None]:
# Pairwise relationships between the features.
sns.pairplot(df)

In [None]:
from sklearn.decomposition import PCA

# Reduce the features to two principal components for plotting.
# Note that df is passed as loaded, i.e. without any scaling.
pca = PCA(n_components=2).fit(df)
X_2D = pca.transform(df)

In [None]:
# X_2D has exactly two columns (n_components=2), so its transpose unpacks
# into the x and y coordinates of the scatter plot.
plt.scatter(*X_2D.T)
plt.xlabel('PCA0')
plt.ylabel('PCA1')

## K-means clustering

There are a few questions that we must answer before we start clustering:

1. Do we need to process the data first?
1. How many clusters should we use?
1. Which method should we use?

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Reload the data so that re-running this cell always starts from the plain
# feature columns (a later cell adds a 'clusters' column to df).
df = load_heart_disease()

# Standardize every feature (zero mean, unit variance) so that no single
# feature dominates the Euclidean distances used by k-means.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    # Keep the row labels of df: load_heart_disease drops rows without
    # resetting the index, so a fresh RangeIndex would no longer line up
    # with df on label-based operations (joins, .loc, concat).
    index=df.index,
)

In [None]:
from sklearn.cluster import KMeans

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for k-means++ initialisation), so relying on the
# default gives different clusters depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=54)
kmeans.fit(df_scaled)

# labels_ is a plain array and is assigned by position, one label per row.
# The categorical dtype makes seaborn use a qualitative color palette.
df['clusters'] = pd.Categorical(kmeans.labels_)

In [None]:
# Check the column dtypes; 'clusters' was cast to category in the previous cell.
df.dtypes

In [None]:
# Cluster centers exactly as fitted, i.e. on the standardized scale.
centers_scaled = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=df_scaled.columns,
)

# The same centers mapped back to the original units of each feature.
centers = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=df_scaled.columns,
)
centers

In [None]:
# Cluster centers on the standardized scale.
centers_scaled

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))

# Patients in the age / resting blood pressure plane, colored by cluster.
ax = sns.scatterplot(data=df, x='age', y='trestbps', hue='clusters', ax=ax)

# Overlay the cluster centers (original scale) as black crosses.
centers.plot.scatter(
    x='age',
    y='trestbps',
    marker='x',
    s=80,
    color='black',
    ax=ax,
);

In [None]:
from collections import Counter

# Number of patients assigned to each cluster label.
counts = Counter(kmeans.labels_)
print(counts)

fig1, ax1 = plt.subplots()
ax1.pie(
    counts.values(),
    labels=[f'Cluster {label}' for label in counts],
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.set_title('Cluster size distribution');
ax1.axis('equal');  # Equal aspect ratio ensures that pie is drawn as a circle.

**Note:** The clusters are not well balanced: one cluster contains half of the patients.

In [None]:
# One row per cluster; the feature names on the x axis are shared.
f, axes = plt.subplots(kmeans.n_clusters, 1, figsize=(6, 6), sharex=True)

for i, ax in enumerate(axes):
    if i == 0:
        ax.set_title('Cluster centers per feature on standard scale')

    center = centers_scaled.loc[i]
    # Symmetric y-limits, just above the largest absolute value of this center.
    maxPC = 1.01 * center.abs().max()
    # Positive deviations from the mean in one color, the rest in another.
    colors = ['C0' if value > 0 else 'C1' for value in center]

    ax.axhline(color='#888888')
    center.plot.bar(ax=ax, color=colors)
    ax.set_ylim(-maxPC, maxPC)
    ax.set_ylabel(f'Cluster{i}')


**Note:** Typically, it is advised to back-transform the centers to the original scale. It seems to me, however, that leaving them on the standardized scale helps interpretation.

For example, in the plot above, _Cluster 0_ contains patients with age well below the average -> young individuals; _Cluster 1_, on the other hand, has patients with age well above the average -> older individuals. Finally, _Cluster 2_ has patients of average age with high cholesterol and high blood pressure.

In [None]:
# One column per cluster; the y axis is shared so bar heights are comparable.
f, axes = plt.subplots(1, kmeans.n_clusters, figsize=(9, 3), sharey=True)

# Largest center value over all clusters and features: common upper y-limit.
overall_max = centers.max().max()

for i, ax in enumerate(axes):
    if i == 0:
        ax.set_ylabel('Center, original scale')

    center = centers.loc[i]
    colors = ['C0' if value > 0 else 'C1' for value in center]

    ax.axhline(color='#888888')
    center.plot.bar(ax=ax, color=colors)
    ax.set_ylim(0, overall_max)
    ax.set_title(f'Cluster{i}')

plt.tight_layout()

## Check Number of Clusters

We can use the elbow method to check the number of clusters.

In [None]:
from yellowbrick.cluster.elbow import kelbow_visualizer

# Elbow plot using the distortion score.
# n_init is pinned because scikit-learn's default changed between versions
# (10 -> 'auto'), which would otherwise change the curve.
kelbow_visualizer(
    KMeans(n_init=10, random_state=54),
    df_scaled,
    k=(2, 10),
    metric='distortion',
    timings=False,
)

In [None]:
# Elbow plot using the silhouette score.
# n_init is pinned because scikit-learn's default changed between versions
# (10 -> 'auto'), which would otherwise change the curve.
kelbow_visualizer(
    KMeans(n_init=10, random_state=54),
    df_scaled,
    k=(2, 10),
    metric='silhouette',
    timings=False,
)

In [None]:
# Elbow plot using the Calinski-Harabasz score.
# n_init is pinned because scikit-learn's default changed between versions
# (10 -> 'auto'), which would otherwise change the curve.
kelbow_visualizer(
    KMeans(n_init=10, random_state=54),
    df_scaled,
    k=(2, 10),
    metric='calinski_harabasz',
    timings=False,
)