In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [None]:
# Load the myopia dataset from disk and preview its first rows
csv_path = "Resources/myopia.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Remove the MYOPIC label column so the remaining columns can be used for
# unsupervised learning.
# errors="ignore" keeps this cell idempotent: because `df` is reassigned from
# itself, a second run would otherwise raise KeyError (the column is already gone).
df = df.drop(columns=["MYOPIC"], errors="ignore")
df.shape

In [None]:
# Standardize every column (zero mean, unit variance) so that no single
# feature dominates the analysis because of its scale
scaler = StandardScaler()
scaler.fit(df)
scaled_features = scaler.transform(df)

In [None]:
# Dimensionality reduction with PCA: a fractional n_components asks PCA to
# keep as many components as are needed to explain 90% of the variance
pca = PCA(n_components=0.90)
myopia_pca = pca.fit_transform(scaled_features)
myopia_pca.shape

In [None]:
# Further reduce the PCA output with t-SNE.
# t-SNE is stochastic: without a fixed random_state the embedding (and the
# scatter plot built from it) changes on every run. The seed matches the one
# used for KMeans so the whole notebook is reproducible.
tsne = TSNE(learning_rate=250, random_state=0)
tsne_features = tsne.fit_transform(myopia_pca)
tsne_features.shape

In [None]:
# Scatter the two t-SNE components to check visually for distinct clusters.
# Axis labels and a title are set so the figure can be read on its own.
plt.scatter(tsne_features[:, 0], tsne_features[:, 1])
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.title("t-SNE projection of the PCA-reduced data")
plt.show()

In [None]:
# Cluster analysis using K-Means: fit one model per candidate k on the
# PCA-reduced data and record its inertia (within-cluster sum of squares)
inertia = []
k = list(range(1, 11))

# Looking for the best k
for i in k:
    # n_init is pinned explicitly because its default changed between
    # scikit-learn releases; random_state fixes the centroid initialisation.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(myopia_pca)
    inertia.append(km.inertia_)

# Collect the results in a DataFrame for plotting the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Plot the elbow curve with matplotlib to determine the best k value;
# the ticks are derived from k so they always match the values tried above
plt.plot(df_elbow["k"], df_elbow["inertia"])
plt.xticks(k)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow curve')
plt.show()

### Analysis
Looking at the data as a whole and the elbow curve above, the patients can be grouped into 3 clusters, since the curve shows a distinct elbow at k = 3.