In [None]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
from scipy.stats import ttest_ind

from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics

In [None]:
# Load dataset
# Two separate CSV files: one is later encoded to 1/0 values ("binary"),
# the other holds numeric measurements ("floats").
diabetes_data_binary = pd.read_csv(filepath_or_buffer='diabetes_data_upload.csv')
diabetes_data_floats = pd.read_csv(filepath_or_buffer='diabetes-dataset.csv')

In [None]:
# Display original binary dataset
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
diabetes_data_binary.info()
diabetes_data_binary.head(10)

In [None]:
# Map string values to int for the binary dataset

# Encode Yes/No and Positive/Negative answers as 1/0 in a single pass over the
# frame. Anything that is not in the lookup (numeric Age, the Gender strings,
# cells that are already encoded) is passed through untouched.
# Series.map is used column by column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases.
answer_codes = {'Yes': 1, 'No': 0, 'Positive': 1, 'Negative': 0}
diabetes_data_binary = diabetes_data_binary.apply(
    lambda column: column.map(lambda value: answer_codes.get(value, value))
)

# Rename Gender column to Male
diabetes_data_binary = diabetes_data_binary.rename(columns={'Gender': 'Male'})

# Convert Male/Female values to 1/0 values.
# Values that are already encoded pass through unchanged, so re-running this
# cell no longer turns the whole column into NaN.
gender_codes = {'Male': 1, 'Female': 0}
diabetes_data_binary['Male'] = diabetes_data_binary['Male'].map(
    lambda value: gender_codes.get(value, value)
)

In [None]:
# Display binary dataset after data preparation
# info() prints on its own (print() around it only added a "None" line).
diabetes_data_binary.info()
# Preview the first rows, like the other display cells, instead of dumping the whole frame
diabetes_data_binary.head(10)

In [None]:
# Display original integer/float dataset
# info() prints its report itself and returns None; calling it directly avoids
# the stray "None" line that print(df.info()) produced.
diabetes_data_floats.info()
diabetes_data_floats.head(10)

In [None]:
# Count missing values (recorded as 0) in the integer/float dataset

In [None]:
diabetes_data_floats[diabetes_data_floats['Glucose']==0].shape[0]

In [None]:
diabetes_data_floats[diabetes_data_floats['BloodPressure']==0].shape[0]

In [None]:
diabetes_data_floats[diabetes_data_floats['SkinThickness']==0].shape[0]

In [None]:
diabetes_data_floats[diabetes_data_floats['Insulin']==0].shape[0]

In [None]:
# Number of rows whose BMI value is 0.
# shape[0] counts rows; .size would return rows * columns and inflate the count.
diabetes_data_floats[diabetes_data_floats['BMI']==0].shape[0]

In [None]:
# Remove all observations with missing values
# A 0 in any of these columns is treated as a missing measurement, so the row
# is dropped; the columns are filtered one after another.
for column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    diabetes_data_floats = diabetes_data_floats[diabetes_data_floats[column] != 0]

In [None]:
# Display integer/float dataset after data cleaning
# info() prints on its own and returns None, so no print() wrapper is needed.
diabetes_data_floats.info()
diabetes_data_floats.head(10)

In [None]:
# Compare means and standard deviations of diabetic and non-diabetic patients (with Welch's t-test) using the integer/float dataset

In [None]:
print("Pregnancies")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Pregnancies'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Pregnancies'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Pregnancies'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Pregnancies'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Pregnancies'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Pregnancies'], equal_var=False)

In [None]:
print("Glucose")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Glucose'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Glucose'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Glucose'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Glucose'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Glucose'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Glucose'], equal_var=False)

In [None]:
print("BloodPressure ")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['BloodPressure'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['BloodPressure'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['BloodPressure'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['BloodPressure'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['BloodPressure'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['BloodPressure'], equal_var=False)

In [None]:
print("SkinThickness ")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['SkinThickness'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['SkinThickness'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['SkinThickness'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['SkinThickness'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['SkinThickness'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['SkinThickness'], equal_var=False)

In [None]:
print("Insulin ")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Insulin'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Insulin'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Insulin'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Insulin'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['Insulin'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['Insulin'], equal_var=False)

In [None]:
print("BMI ")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['BMI'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['BMI'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['BMI'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['BMI'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['BMI'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['BMI'], equal_var=False)

In [None]:
print("Diabetes Pedigree Function")
print("Non-Diabetic Mean:", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['DiabetesPedigreeFunction'].mean())
print("Diabetic Mean    :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['DiabetesPedigreeFunction'].mean())
print()
print("Non-Diabetic std :", diabetes_data_floats[diabetes_data_floats['Outcome']==0]['DiabetesPedigreeFunction'].std())
print("Diabetic std     :", diabetes_data_floats[diabetes_data_floats['Outcome']==1]['DiabetesPedigreeFunction'].std())
print()
ttest_ind(diabetes_data_floats[diabetes_data_floats['Outcome']==1]['DiabetesPedigreeFunction'], diabetes_data_floats[diabetes_data_floats['Outcome']==0]['DiabetesPedigreeFunction'], equal_var=False)

In [None]:
# Visualize feature statistics for non-diabetic and diabetic patients using integer/floats dataset

In [None]:
sns.boxplot(x='Outcome',y='Pregnancies',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='Glucose',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='BloodPressure',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='SkinThickness',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='Insulin',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='BMI',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='DiabetesPedigreeFunction',data=diabetes_data_floats)

In [None]:
sns.boxplot(x='Outcome',y='Age',data=diabetes_data_floats)

In [None]:
# Visualize feature statistics for non-diabetic and diabetic patients using binary dataset

In [None]:
sns.boxplot(x='class',y='Age',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Polyuria',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Polydipsia',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='sudden weight loss',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='weakness',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Polyphagia',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Genital thrush',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='visual blurring',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Itching',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Irritability',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='delayed healing',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='partial paresis',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='muscle stiffness',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Alopecia',data=diabetes_data_binary)

In [None]:
sns.barplot(x='class',y='Obesity',data=diabetes_data_binary)

In [None]:
# Clustering using all variables of the integer/float dataset

In [None]:
# Drop the outcome for X and our Y will be outcome
X1_total = diabetes_data_floats.drop("Outcome",axis = 1)
Y1_total = diabetes_data_floats['Outcome']

In [None]:
#Scale data in X
scaler = StandardScaler() 
scaler.fit(X1_total)
X1_total_scaled = scaler.transform(X1_total) 

In [None]:
#Hierarchical Clustering with single linkage

In [None]:
clustering = linkage(X1_total_scaled,method = 'single',metric = 'euclidean')
clusters = fcluster(clustering, 2,criterion = 'maxclust')

In [None]:
#Plot contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silhouette coefficient
adjusted_rand_index = metrics.adjusted_rand_score(Y1_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the data using the clusters formed by Hierarchical clustering
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BMI', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Pregnancies', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BloodPressure', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'DiabetesPedigreeFunction', c = clusters, colormap = plt.cm.brg)

In [None]:
#Hierarchical Clustering with complete linkage

In [None]:
#Complete linkage clustering
clustering = linkage(X1_total_scaled, method = "complete", metric = "euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate adjusted rand index and silhouette coefficinet
adjusted_rand_index = metrics.adjusted_rand_score(Y1_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the data using the clusters formed by Hierarchical clustering
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BMI', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Pregnancies', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BloodPressure', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'DiabetesPedigreeFunction', c = clusters, colormap = plt.cm.brg)

In [None]:
# KMeans clustering with randomly chosen initial centroids and n_init = 2 (the algorithm is run twice with different centroid seeds and the best run is kept)

In [None]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 2, random_state = 0).fit(X1_total_scaled)
clusters = clustering.labels_

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#calculate the adjusted rand index and silohouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by KMeans clustering
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BMI', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Pregnancies', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BloodPressure', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'DiabetesPedigreeFunction', c = clusters, colormap = plt.cm.brg)

In [None]:
# KMeans clustering with k-means++ initialization and n_init = 20 (20 runs with different centroid seeds; the best run is kept)

In [None]:
clustering = KMeans(n_clusters = 2, init = 'k-means++', n_init = 20,random_state = 0).fit(X1_total_scaled)
clusters = clustering.labels_

In [None]:
#Plot the concingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silhouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by KMeans clustering 
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BMI', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Pregnancies', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BloodPressure', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'DiabetesPedigreeFunction', c = clusters, colormap = plt.cm.brg)

In [None]:
# DBSCAN with eps = 1 and min_samples = 10

In [None]:
clustering = DBSCAN(eps = 1, min_samples = 10, metric = "euclidean").fit(X1_total_scaled)
clusters = clustering.labels_

In [None]:
#Plot the contingency matrix and we have 3 clusters forms because our min_ sample is small for the eps value
cont_matrix = metrics.cluster.contingency_matrix(Y1_total, clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the rand index
adjusted_rand_index = metrics.adjusted_rand_score(Y1_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by 
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BMI', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Pregnancies', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BloodPressure', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'DiabetesPedigreeFunction', c = clusters, colormap = plt.cm.brg)

In [None]:
# DBSCAN with larger eps (3) and min_samples (20)

In [None]:
clustering = DBSCAN(eps = 3, min_samples = 20, metric = "euclidean").fit(X1_total_scaled)
clusters = clustering.labels_

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1_total, clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the values of adjusted rand index and  silhouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by DBSCAN
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BMI', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Pregnancies', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'BloodPressure', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'DiabetesPedigreeFunction', c = clusters, colormap = plt.cm.brg)

In [None]:
# Evaluation metric for the true classes of the data

In [None]:
# Silhouette coefficient of the true Outcome labels.
# It is computed on the standardized features, the same matrix every
# clustering above was scored on, so the values are directly comparable
# (the unscaled frame would put the distances on a different scale).
silhouette_coefficient = metrics.silhouette_score(X1_total_scaled, Y1_total, metric = "euclidean")
print(silhouette_coefficient)

In [None]:
# Scatter every feature against Age, coloured by the true Outcome
for feature in ['Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Pregnancies', 'BloodPressure', 'DiabetesPedigreeFunction']:
    ax = diabetes_data_floats.plot(kind='scatter', x='Age', y=feature, c=Y1_total, colormap=plt.cm.brg)

In [None]:
# Clustering using only the most influential variables of the integer/float dataset

In [None]:
X1 = diabetes_data_floats[['Glucose','Age','Insulin','SkinThickness']]
Y1 = diabetes_data_floats['Outcome']

In [None]:
scaler = StandardScaler() 
scaler.fit(X1)
X1_scaled = scaler.transform(X1) 

In [None]:
#Hierarchical Clustering with single linkage

In [None]:
clustering = linkage(X1_scaled,method = 'single',metric = 'euclidean')
clusters = fcluster(clustering, 2,criterion = 'maxclust')

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silohouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot clusters formed by hierarchical clustering
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)

In [None]:
#Hierarchical Clustering with complete linkage

In [None]:
clustering = linkage(X1_scaled, method = "complete", metric = "euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the values for adjusted rand index and silhouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by Hierarchical clusters
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)

In [None]:
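# KMeans clustering with randomly chosen initial centroids and n_init = 2 (two runs with different centroid seeds; the best run is kept)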

In [None]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 2, random_state = 0).fit(X1_scaled)
clusters = clustering.labels_

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
adjusted_rand_index = metrics.adjusted_rand_score(Y1, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by KMeans clustering
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)

In [None]:
# KMeans clustering with k-means++ initialization and n_init = 20

In [None]:
clustering = KMeans(n_clusters = 2, init = 'k-means++', n_init = 20,random_state = 0).fit(X1_scaled)
# clustering = KMeans(n_clusters = 4, init = 'random', n_init = 20, random_state = 0).fit(X)
clusters = clustering.labels_

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silhouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by KMeans clustering
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)

In [None]:
# DBSCAN clustering with eps = 1 and min_samples = 20

In [None]:
clustering = DBSCAN(eps = 1, min_samples = 20, metric = "euclidean").fit(X1_scaled)
clusters = clustering.labels_

In [None]:
# Plot contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1, clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silohouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by DBSCAN
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)

In [None]:
# DBSCAN with eps = 2 and min_samples = 10

In [None]:
clustering = DBSCAN(eps = 2, min_samples = 10, metric = "euclidean").fit(X1_scaled)
clusters = clustering.labels_

In [None]:
# Plot contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y1, clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silohouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y1, clusters)
silhouette_coefficient = metrics.silhouette_score(X1_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by DBSCAN
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Glucose', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'SkinThickness', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_floats.plot(kind = 'scatter', x = 'Age', y = 'Insulin', c = clusters, colormap = plt.cm.brg)

In [None]:
# Evaluation metric for the true classes of the data

In [None]:
# Silhouette coefficient of the true Outcome labels on the selected features.
# It is computed on the standardized matrix, the same one the clusterings
# above were scored on, so the values are directly comparable.
silhouette_coefficient = metrics.silhouette_score(X1_scaled, Y1, metric = "euclidean")
print(silhouette_coefficient)

In [None]:
# Scatter the selected features against Age, coloured by the true Outcome
for feature in ['Glucose', 'SkinThickness', 'Insulin']:
    ax = diabetes_data_floats.plot(kind='scatter', x='Age', y=feature, c=Y1, colormap=plt.cm.brg)

In [None]:
# Clustering using all variables of the binary dataset

In [None]:
X2_total = diabetes_data_binary.drop('class',axis = 1)

In [None]:
Y2_total = diabetes_data_binary["class"]

In [None]:
scaler = StandardScaler() 
scaler.fit(X2_total)
X2_total_scaled = scaler.transform(X2_total) 

In [None]:
#Hierarchical Clustering with single linkage

In [None]:
clustering = linkage(X2_total_scaled,method = 'single',metric = 'euclidean')
clusters = fcluster(clustering, 2,criterion = 'maxclust')

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y2_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silhouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y2_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X2_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
#Plot the clusters formed by Hierarchical clustering
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = 'Polyuria', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = "sudden weight loss", c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = 'Irritability', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = 'partial paresis', c = clusters, colormap = plt.cm.brg)

In [None]:
#Hierarchical Clustering with complete linkage

In [None]:
clustering = linkage(X2_total_scaled, method = "complete", metric = "euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y2_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
#Calculate the adjusted rand index and silhouette coeff
adjusted_rand_index = metrics.adjusted_rand_score(Y2_total, clusters)
silhouette_coefficient = metrics.silhouette_score(X2_total_scaled, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = 'Polyuria', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = "sudden weight loss", c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = 'Irritability', c = clusters, colormap = plt.cm.brg)
ax = diabetes_data_binary.plot(kind = 'scatter', x = 'Polydipsia', y = 'partial paresis', c = clusters, colormap = plt.cm.brg)

In [None]:
# KMeans clustering with randomly chosen initial centroids and n_init = 1 (a single run)

In [None]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(X2_total_scaled)
clusters = clustering.labels_

In [None]:
#Plot the contingency matrix
cont_matrix = metrics.cluster.contingency_matrix(Y2_total,clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index (agreement with the true classes) and silhouette
# coefficient (computed on the scaled features with Euclidean distance).
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2_total, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_total_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters found by KMeans: Polydipsia against four other symptoms,
# coloured by cluster label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# KMeans clustering with k-means++ centroid initialisation and 10 initialisation runs (n_init = 10)

In [None]:
# Two-cluster KMeans: k-means++ seeding, 10 initialisation runs,
# fixed seed for repeatability.
clustering = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0)
clustering.fit(X2_total_scaled)
clusters = clustering.labels_

In [None]:
# Contingency matrix: true classes on the rows, cluster labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2_total, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index (agreement with the true classes) and silhouette
# coefficient (computed on the scaled features with Euclidean distance).
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2_total, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_total_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters found by KMeans: Polydipsia against four other symptoms,
# coloured by cluster label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# DBSCAN using eps = 1 and min_samples = 10

In [None]:
# Density-based clustering on the scaled features (eps = 1, min_samples = 10),
# then show the raw label vector.
clustering = DBSCAN(eps=1, min_samples=10, metric="euclidean")
clustering.fit(X2_total_scaled)
clusters = clustering.labels_
print(clusters)

In [None]:
# Plot the contingency matrix. Three predicted labels were observed here, which
# the author attributes to min_samples being low relative to eps.
# NOTE(review): one of the columns may be the DBSCAN noise label (-1) - confirm.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2_total, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index and silhouette coefficient for the DBSCAN labelling.
# NOTE(review): DBSCAN marks noise points with label -1; both scores treat that
# label as just another cluster - confirm this is intended.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2_total, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_total_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Scatter Polydipsia against four other symptoms, coloured by DBSCAN label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# DBSCAN using eps = 2 and min_samples = 15, with which we achieve 2 clusters

In [None]:
# Density-based clustering on the scaled features (eps = 2, min_samples = 15),
# then show the raw label vector.
clustering = DBSCAN(eps=2, min_samples=15, metric="euclidean")
clustering.fit(X2_total_scaled)
clusters = clustering.labels_
print(clusters)

In [None]:
# Contingency matrix: true classes on the rows, DBSCAN labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2_total, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index and silhouette coefficient for the DBSCAN labelling.
# NOTE(review): DBSCAN marks noise points with label -1; both scores treat that
# label as just another cluster - confirm this is intended.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2_total, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_total_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Scatter Polydipsia against four other symptoms, coloured by DBSCAN label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
#Evaluation metrics

In [None]:
# Silhouette coefficient of the true class labels on the scaled features,
# as a reference value for the clustering results.
silhouette_coefficient = metrics.silhouette_score(
    X=X2_total_scaled, labels=Y2_total, metric="euclidean"
)
print(silhouette_coefficient)

In [None]:
# Reference plots: the same symptom pairs, coloured by the true class labels.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=Y2_total, colormap=plt.cm.brg
    )

In [None]:
#Clustering for Binary Data

In [None]:
#Choosing the variables that have higher impact on diabetic vs non-diabetic for binary values

In [None]:
# Symptom columns kept as clustering features for the binary dataset.
X2 = diabetes_data_binary[
    ["Polyuria", "Polydipsia", "sudden weight loss", "Irritability", "partial paresis"]
]

In [None]:
# Target labels: the "class" column of the prepared binary dataset.
Y2 = diabetes_data_binary.loc[:, "class"]

In [None]:
# Standardise the selected features; fitting and transforming in one call
# gives the same result as fit() followed by transform().
scaler = StandardScaler()
X2_scaled = scaler.fit_transform(X2)

In [None]:
#Hierarchical Clustering with single linkage

In [None]:
# Build the single-linkage dendrogram on the scaled features, then cut it
# so that at most two flat clusters remain.
clustering = linkage(X2_scaled, "single", "euclidean")
clusters = fcluster(clustering, t=2, criterion='maxclust')

In [None]:
# Contingency matrix: true classes on the rows, cluster labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index (agreement with the true classes) and silhouette
# coefficient (computed on the scaled features with Euclidean distance).
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters found by hierarchical clustering: Polydipsia against the
# four other selected symptoms, coloured by cluster label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
#Hierarchical Clustering with complete linkage

In [None]:
# Build the complete-linkage dendrogram on the scaled features, then cut it
# so that at most two flat clusters remain.
clustering = linkage(X2_scaled, "complete", "euclidean")
clusters = fcluster(clustering, t=2, criterion='maxclust')

In [None]:
# Contingency matrix: true classes on the rows, cluster labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index (agreement with the true classes) and silhouette
# coefficient (computed on the scaled features with Euclidean distance).
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters found by hierarchical clustering: Polydipsia against the
# four other selected symptoms, coloured by cluster label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# KMeans clustering with random centroid initialisation and a single initialisation run (n_init = 1)

In [None]:
# Two-cluster KMeans: random starting centroids, one initialisation run,
# fixed seed for repeatability.
clustering = KMeans(n_clusters=2, init='random', n_init=1, random_state=0)
clustering.fit(X2_scaled)
clusters = clustering.labels_

In [None]:
# Contingency matrix: true classes on the rows, cluster labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index (agreement with the true classes) and silhouette
# coefficient (computed on the scaled features with Euclidean distance).
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters found by KMeans: Polydipsia against the four other
# selected symptoms, coloured by cluster label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# KMeans with k-means++ centroid initialisation and 20 initialisation runs (n_init = 20)

In [None]:
# Two-cluster KMeans: k-means++ seeding, 20 initialisation runs,
# fixed seed for repeatability.
clustering = KMeans(n_clusters=2, init='k-means++', n_init=20, random_state=0)
clustering.fit(X2_scaled)
clusters = clustering.labels_

In [None]:
# Contingency matrix: true classes on the rows, cluster labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index (agreement with the true classes) and silhouette
# coefficient (computed on the scaled features with Euclidean distance).
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters formed by KMeans: Polydipsia against the four other
# selected symptoms. The points are coloured by the KMeans labels in
# `clusters`; colouring by Y2 would show the true classes instead of the
# clustering result this cell is meant to visualise.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
#DBSCAN for eps = 1 and min_Sample = 30

In [None]:
# Density-based clustering on the scaled features (eps = 1, min_samples = 30),
# then show the raw label vector.
clustering = DBSCAN(eps=1, min_samples=30, metric="euclidean")
clustering.fit(X2_scaled)
clusters = clustering.labels_
print(clusters)

In [None]:
# Plot the contingency matrix. Six predicted labels were observed for these
# eps and min_samples values.
# NOTE(review): one of the columns may be the DBSCAN noise label (-1) - confirm.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index and silhouette coefficient for the DBSCAN labelling.
# NOTE(review): DBSCAN marks noise points with label -1; both scores treat that
# label as just another cluster - confirm this is intended.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters found by DBSCAN: Polydipsia against the four other
# selected symptoms, coloured by cluster label.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# DBSCAN modified to form two clusters

In [None]:
# Density-based clustering on the scaled features (eps = 2, min_samples = 70),
# then show the raw label vector.
clustering = DBSCAN(eps=2, min_samples=70, metric="euclidean")
clustering.fit(X2_scaled)
clusters = clustering.labels_
print(clusters)

In [None]:
# Contingency matrix: true classes on the rows, DBSCAN labels on the columns,
# drawn as an annotated heatmap.
cont_matrix = metrics.cluster.contingency_matrix(labels_true=Y2, labels_pred=clusters)
sns.heatmap(cont_matrix, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
plt.gca().set(ylabel='Actual', xlabel='Predicted', title='Contingency matrix')
plt.tight_layout()

In [None]:
# Adjusted Rand index and silhouette coefficient for the DBSCAN labelling.
# NOTE(review): DBSCAN marks noise points with label -1; both scores treat that
# label as just another cluster - confirm this is intended.
adjusted_rand_index = metrics.adjusted_rand_score(labels_true=Y2, labels_pred=clusters)
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=clusters, metric="euclidean"
)
print([adjusted_rand_index, silhouette_coefficient])

In [None]:
# Plot the clusters formed by DBSCAN: Polydipsia against the four other
# selected symptoms. Every panel is coloured by the DBSCAN labels in
# `clusters`; the 'partial paresis' panel previously used the true classes
# (Y2), which made it inconsistent with the other three panels.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=clusters, colormap=plt.cm.brg
    )

In [None]:
# Evaluation metrics for the true clusters (binary data)

In [None]:
# Silhouette coefficient of the true class labels on the scaled features,
# as a reference value for the clustering results.
silhouette_coefficient = metrics.silhouette_score(
    X=X2_scaled, labels=Y2, metric="euclidean"
)
print(silhouette_coefficient)

In [None]:
# Reference plots: the same symptom pairs, coloured by the true class labels.
for y_column in ('Polyuria', 'sudden weight loss', 'Irritability', 'partial paresis'):
    ax = diabetes_data_binary.plot(
        kind='scatter', x='Polydipsia', y=y_column, c=Y2, colormap=plt.cm.brg
    )