In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [3]:
# Relative path to the CSV file that holds the iris measurements
filepath = "Resources/new_iris_data.csv"

# Read the CSV into a DataFrame and show it as the cell's output
df = pd.read_csv(filepath_or_buffer=filepath)
df

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
...,...,...,...,...
145,6.7,5.2,3.0,2.3
146,6.3,5.0,2.5,1.9
147,6.5,5.2,3.0,2.0
148,6.2,5.4,3.4,2.3


In [6]:
# Standardize the data by scaling it.
# The scaler is created first; fit_transform then fits it to the data and
# returns the scaled values in a single call.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(df)

# Display the scaled array
iris_scaled

array([[-9.00681170e-01, -1.34127240e+00,  1.03205722e+00,
        -1.31297673e+00],
       [-1.14301691e+00, -1.34127240e+00, -1.24957601e-01,
        -1.31297673e+00],
       [-1.38535265e+00, -1.39813811e+00,  3.37848329e-01,
        -1.31297673e+00],
       [-1.50652052e+00, -1.28440670e+00,  1.06445364e-01,
        -1.31297673e+00],
       [-1.02184904e+00, -1.34127240e+00,  1.26346019e+00,
        -1.31297673e+00],
       [-5.37177559e-01, -1.17067529e+00,  1.95766909e+00,
        -1.05003079e+00],
       [-1.50652052e+00, -1.34127240e+00,  8.00654259e-01,
        -1.18150376e+00],
       [-1.02184904e+00, -1.28440670e+00,  8.00654259e-01,
        -1.31297673e+00],
       [-1.74885626e+00, -1.34127240e+00, -3.56360566e-01,
        -1.31297673e+00],
       [-1.14301691e+00, -1.28440670e+00,  1.06445364e-01,
        -1.44444970e+00],
       [-5.37177559e-01, -1.28440670e+00,  1.49486315e+00,
        -1.31297673e+00],
       [-1.26418478e+00, -1.22754100e+00,  8.00654259e-01,
      

In [18]:
# Dimensionality reduction with PCA.
#
# The model keeps 2 principal components: the two main directions of
# variation, which together carry most of the information in the
# original (scaled) feature set.
pca = PCA(n_components=2)

# Fit the model to the scaled data and project the data onto the components
iris_pca = pca.fit_transform(iris_scaled)

# Wrap the projected values in a DataFrame with one labeled column per component
df_iris_pca = pd.DataFrame(
    data=iris_pca,
    columns=["principal component 1", "principal component 2"],
)
df_iris_pca



Unnamed: 0,principal component 1,principal component 2
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.367950,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767
...,...,...
145,1.870522,0.382822
146,1.558492,-0.905314
147,1.520845,0.266795
148,1.376391,1.016362


In [19]:
# Explained variance ratio: how much of the information (variance) can be
# attributed to each principal component.
explained_variance = pca.explained_variance_ratio_

# Reading the result: the first principal component holds 72.77% of the
# variance and the second holds 23.03%, so together they retain 95.80%
# of the information.
explained_variance


array([0.72770452, 0.23030523])

In [20]:
# Find the best k value with the elbow method: fit KMeans for a range of
# cluster counts and compare the inertia of each fit.

inertia = []
k = list(range(1, 11))

# Cluster on the two principal-component columns only. Selecting them
# explicitly keeps this cell re-runnable: once a "class" label column has been
# added to df_iris_pca, fitting on the whole frame would silently treat the
# labels as a third feature and distort the curve.
pca_features = df_iris_pca[["principal component 1", "principal component 2"]]

# Calculate the inertia for the range of K values
for i in k:
    # n_init is set explicitly so the result does not depend on the default of
    # the installed scikit-learn version; random_state makes each fit repeatable.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(pca_features)
    inertia.append(km.inertia_)

elbow_data = pd.DataFrame({
    "K": k,
    "Inertia": inertia
})

# Plot k and inertia, with one x-axis tick per tested K value
elbow_data.hvplot.line(x="K", y="Inertia", xticks=k, title="Elbow Curve")

# Reading of the plot: the elbow is at K=3, so 3 clusters is taken as the best value

In [21]:
# Initialize the KMeans model with 3 clusters.
# random_state is fixed so the cluster assignment (and the label numbering) is
# the same on every run; n_init is set explicitly so the result does not
# depend on the default of the installed scikit-learn version.
model = KMeans(n_clusters=3, n_init=10, random_state=0)

# Cluster on the two principal-component columns only. This keeps the cell
# idempotent: on a re-run, the "class" column added below is not fed back
# into the model as an extra feature.
pca_features = df_iris_pca[["principal component 1", "principal component 2"]]

# Fit the model and get the cluster label of every row in one step
predictions = model.fit_predict(pca_features)

# Add the predicted class labels as a column
df_iris_pca["class"] = predictions
df_iris_pca

Unnamed: 0,principal component 1,principal component 2,class
0,-2.264542,0.505704,1
1,-2.086426,-0.655405,1
2,-2.367950,-0.318477,1
3,-2.304197,-0.575368,1
4,-2.388777,0.674767,1
...,...,...,...
145,1.870522,0.382822,0
146,1.558492,-0.905314,2
147,1.520845,0.266795,0
148,1.376391,1.016362,0


In [23]:
# Plot the clusters in 2D (only two features remain after PCA).
# Points are grouped and colored by predicted class; the class is also listed
# in the hover tooltip. hover_cols is given as a list of column names, and the
# figure carries a title so it can be read on its own.
df_iris_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    by="class",
    hover_cols=["class"],
    title="Iris Clusters (PCA components)",
)