In [32]:
#Load required packages
import pandas as pd 
from scipy import stats 
from sklearn.cluster import KMeans

In [33]:
# Load the white-wine training split into wine_train and preview its first five rows.
filePath = "../datasets/white_wine_training"
wine_train = pd.read_csv(filepath_or_buffer=filePath)
wine_train.head()

Unnamed: 0,alcohol,quality,sugar
0,8.4,4,5.9
1,8.5,6,5.9
2,8.5,6,18.0
3,8.5,6,18.0
4,8.5,5,9.1


In [34]:
# Keep only the two predictor variables used for clustering.
X = wine_train.loc[:, ["alcohol", "sugar"]]

# Convert each predictor to z-scores with stats.zscore, then wrap the result
# in a DataFrame that carries the predictor names as column labels.
Xz = pd.DataFrame(data=stats.zscore(X), columns=["alcohol", "sugar"])
Xz.head(5)

Unnamed: 0,alcohol,sugar
0,-1.826971,0.011679
1,-1.743463,0.011679
2,-1.743463,2.601695
3,-1.743463,2.601695
4,-1.743463,0.696642


In [35]:
# Run k-means clustering (k = 2) on the standardized training predictors.
# random_state pins the centroid initialisation so the fit -- and therefore which
# group is labelled 0 and which is labelled 1 -- is the same on every fresh run.
# n_init is stated explicitly because scikit-learn's default for it differs
# between releases; 10 restarts matches the long-standing default.
kmeans01 = KMeans(n_clusters=2, n_init=10, random_state=42).fit(Xz)
# Save cluster membership (one label per row of Xz) to investigate the clustering results
cluster = kmeans01.labels_
# Separate records into two groups based on cluster membership
Cluster1 = Xz.loc[cluster == 0]
Cluster2 = Xz.loc[cluster == 1]

In [36]:
# Summary statistics of the z-scored alcohol and sugar values for the training records with cluster label 0.
Cluster1.describe() 

Unnamed: 0,alcohol,sugar
count,1094.0,1094.0
mean,0.494675,-0.623812
std,0.902787,0.47604
min,-1.576448,-1.122791
25%,-0.156821,-0.951551
50%,0.427732,-0.844525
75%,1.179299,-0.352208
max,2.891203,1.477928


In [37]:
# Summary statistics of the z-scored alcohol and sugar values for the training records with cluster label 1.
Cluster2.describe()

Unnamed: 0,alcohol,sugar
count,715.0,715.0
mean,-0.756887,0.954476
std,0.580815,0.823506
min,-1.826971,-0.972956
25%,-1.158911,0.35416
50%,-0.908388,0.867883
75%,-0.407343,1.48863
max,2.014374,5.512788


#### To validate the clustering results, we run k-means on the test set and compare the resulting cluster summaries with those from the training set

In [38]:
# Load the white-wine test split into wine_test.
filePath = "../datasets/white_wine_test"
wine_test = pd.read_csv(filepath_or_buffer=filePath)

# Take the same two predictors and z-score them; stats.zscore is applied to the
# test split itself, so the scaling uses the test split's own statistics.
X_test = wine_test.loc[:, ["alcohol", "sugar"]]
Xz_test = pd.DataFrame(data=stats.zscore(X_test), columns=["alcohol", "sugar"])

In [39]:
# Run k-means clustering (k = 2) on the standardized test predictors.
# random_state makes the fit and the 0/1 label numbering repeatable across runs;
# n_init is stated explicitly because scikit-learn's default for it differs between releases.
# NOTE(review): label numbers from separate fits are not guaranteed to refer to the
# same kind of cluster -- match clusters by their summaries, not by label number.
kmeans_test = KMeans(n_clusters=2, n_init=10, random_state=42).fit(Xz_test)
cluster_test = kmeans_test.labels_  # Cluster membership, one label per row of Xz_test
# Split the test records by cluster label.
Cluster1_test = Xz_test.loc[cluster_test == 0]
Cluster2_test = Xz_test.loc[cluster_test == 1]
# Summary statistics for the test records with cluster label 0.
Cluster1_test.describe()

Unnamed: 0,alcohol,sugar
count,1120.0,1120.0
mean,0.457458,-0.60731
std,0.903744,0.458724
min,-1.675754,-1.089453
25%,-0.218729,-0.945241
50%,0.395111,-0.821632
75%,1.157351,-0.293714
max,2.776268,1.423949


In [40]:
# Summary statistics of the z-scored alcohol and sugar values for the test records with cluster label 1.
Cluster2_test.describe()

Unnamed: 0,alcohol,sugar
count,640.0,640.0
mean,-0.800552,1.062792
std,0.561557,0.779781
min,-2.080483,-1.037949
25%,-1.190079,0.393866
50%,-0.947241,1.032518
75%,-0.542512,1.573311
max,1.56208,3.2987


Source: Data Science Using Python and R
Author: Chantal D. Larose, Daniel T. Larose
Date: 2019
CHAPTER 10 CLUSTERING

https://eds.p.ebscohost.com/eds/ebookviewer/ebook/bmxlYmtfXzIwOTEzNzFfX0FO0?sid=5e1cab63-4b57-49a8-8824-a6913b37cfc3@redis&vid=0&format=EB&rid=1

