# UNSUPERVISED MACHINE LEARNING

In [None]:
# pip install -U scikit-learn-extra

In [11]:
import pandas as pd
import numpy as np

# preprocessing and split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

# plotting
import matplotlib.pyplot as plt
import plotly.express as px

# clustering
from sklearn.cluster import KMeans


In [2]:
# read csv
df = pd.read_csv("myopia.csv")
df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
# check for NA
df.isnull().sum()

AGE          0
SPHEQ        0
AL           0
ACD          0
LT           0
VCD          0
SPORTHR      0
READHR       0
COMPHR       0
STUDYHR      0
TVHR         0
DIOPTERHR    0
MOMMY        0
DADMY        0
MYOPIC       0
dtype: int64

# Part 1: Preparing The Data

In [4]:
# features
df_features = df.drop(columns="MYOPIC", axis=1)

# target
target = df['MYOPIC']

# fit Standard scaler
scaler = MinMaxScaler()
scaler.fit(df_features)

# transform train and test
features_scaled = scaler.transform(df_features)

# Part 2: Dimensionality Reduction

In [5]:
# PCA
pca = PCA(n_components=0.90) # preserving 90%

# fit with scaled xtrain data
feature_pca = pca.fit_transform(features_scaled)

# explained variance ratio
print(f'Explained Variance Ratio: {pca.explained_variance_ratio_}')

Explained Variance Ratio: [0.32040946 0.31759715 0.09261042 0.06636212 0.04495379 0.03569882
 0.0321611 ]


In [6]:
# TSNE

# intialize
tsne = TSNE(learning_rate=100, init='random')

# fit transform xtrain_pca
feature_tsne = tsne.fit_transform(feature_pca)

# Visualize Data

In [7]:
# add transformed features to df_features
df_features['x']  = feature_tsne[:,0]
df_features['y'] = feature_tsne[:,1]
df_features['class'] = df['MYOPIC']

In [26]:
# visualize data
fig = px.scatter(df_features, x='x', y='y', color='class')
fig.update_layout(
    title={
        'text': "t-SNE Distribution",
        'font': {'size': 24},
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
)

fig.show()


# Part 3: Perform a Cluster Analysis with K-means

In [9]:
df_pca = pd.DataFrame(feature_tsne, columns=["pc_1", "pc_2"])
df_pca.head()

Unnamed: 0,pc_1,pc_2
0,16.392826,-3.088469
1,5.104334,0.445038
2,8.307448,-42.904236
3,4.94313,39.823002
4,-35.215527,5.740087


In [23]:
# using a for loop to deterime best k value, and calculate the inertia for the range of k values


pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_features)

inertia = []

k = list(range(1, 10))

# loop
for i in k:
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df_pca)
    inertia.append(kmeans.inertia_)
    
# ELBOW CURVE

# create dataframe for elbow curve
curve_data = {"k": k, "inertia": inertia}

df_curve = pd.DataFrame(curve_data)


# PLOT
fig = px.line(df_curve, x='k', y='inertia', title='Elbow Curve for K-means Clustering')
fig.update_layout(xaxis_title="K (number of clusters)", yaxis_title="Inertia")
fig.show()



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Recommendation: The elbow curve above indicates patients can be clustered into 3 or 4 groups.
Based on the scatter plot, 5 clusters are distinct.