In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn import metrics
import scipy
from sklearn.datasets import load_wine

##Problem 1


In [2]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
auto = pd.read_csv(url, delimiter=r"\s+")
auto.columns = ['mpg','cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name' ]
auto = auto.drop(columns = ['car_name', 'cylinders', 'model_year'])

In [3]:
auto.head(5)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin
0,15.0,350.0,165.0,3693.0,11.5,1
1,18.0,318.0,150.0,3436.0,11.0,1
2,16.0,304.0,150.0,3433.0,12.0,1
3,17.0,302.0,140.0,3449.0,10.5,1
4,15.0,429.0,198.0,4341.0,10.0,1


In [4]:
auto.isna().values.any()

False

In [5]:
auto.isin(['?']).sum()

mpg             0
displacement    0
horsepower      6
weight          0
acceleration    0
origin          0
dtype: int64

In [6]:
#Replacing '?' with NaN from NumPy in column 'horsepower'
imp = SimpleImputer(strategy = 'constant', fill_value=np.nan, missing_values='?')
imp = imp.fit(auto[['horsepower']])
auto['horsepower'] = imp.transform(auto[['horsepower']])

In [7]:
#Replacing NaN with mean from NumPy in column 'horsepower'
imp = SimpleImputer(strategy = 'mean', missing_values=np.nan)
imp = imp.fit(auto[['horsepower']])
auto['horsepower'] = imp.transform(auto[['horsepower']])

In [8]:
auto.isin(['?']).sum()

mpg             0
displacement    0
horsepower      0
weight          0
acceleration    0
origin          0
dtype: int64

In [9]:
X, y = auto.iloc[:, 0:-1], auto['origin']

In [10]:
X = StandardScaler().fit_transform(X)

In [11]:
avg_cluster = AgglomerativeClustering(n_clusters=3, linkage='average', affinity='euclidean', compute_full_tree=False)

In [12]:
avg_lables = avg_cluster.fit_predict(X, y)

In [13]:
y.unique()

array([1, 3, 2])

In [14]:
class_labels = y.astype('int32') - 1

In [15]:
auto.iloc[:, 0:-1] = StandardScaler().fit_transform(auto.iloc[:, 0:-1])
auto['cluster'] = avg_lables
auto['class'] = class_labels

In [16]:
auto.head(5)

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
0,-1.091843,1.506627,1.58721,0.855275,-1.481575,1,1,0
1,-0.707773,1.19927,1.194309,0.551642,-1.663271,1,1,0
2,-0.96382,1.064801,1.194309,0.548098,-1.299879,1,1,0
3,-0.835796,1.045591,0.932376,0.567001,-1.844967,1,1,0
4,-1.091843,2.265414,2.45159,1.620855,-2.026662,1,1,0


In [17]:
C0 = auto.loc[auto['cluster'] == 0]
C0.describe()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,0.339131,-0.469056,-0.469239,-0.437924,0.308342,1.754209,0.0,0.754209
std,0.822777,0.569165,0.450056,0.646156,0.802365,0.848403,0.0,0.848403
min,-1.091843,-1.201955,-1.529799,-1.602142,-1.554253,1.0,0.0,0.0
25%,-0.323703,-0.918611,-0.848772,-0.97361,-0.246044,1.0,0.0,0.0
50%,0.29081,-0.692896,-0.429678,-0.504575,0.190025,1.0,0.0,0.0
75%,0.956532,-0.097392,-0.115358,-0.004821,0.807791,3.0,0.0,2.0
max,2.953697,1.506627,0.749022,1.288867,3.351531,3.0,0.0,2.0


In [18]:
L0 = auto.loc[auto['class'] == 0]
L0.describe()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
count,248.0,248.0,248.0,248.0,248.0,248.0,248.0,248.0
mean,-0.439956,0.504406,0.376255,0.463457,-0.193001,1.0,0.387097,0.0
std,0.821202,0.947275,1.039651,0.940846,0.999283,0.0,0.488071,0.0
min,-1.859984,-1.038672,-1.372638,-1.381211,-2.753445,1.0,0.0,0.0
25%,-1.091843,-0.404749,-0.429678,-0.300184,-0.936488,1.0,0.0,0.0
50%,-0.63736,0.546137,0.015609,0.464806,-0.209705,1.0,0.0,0.0
75%,0.069969,1.19927,1.194309,1.282074,0.489823,1.0,1.0,0.0
max,1.980719,2.515141,3.289777,2.564834,2.406713,1.0,1.0,0.0


In [19]:
C1 = auto.loc[auto['cluster'] == 1]
C1.describe()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,-1.156788,1.491719,1.512177,1.395949,-1.064432,1.0,1.0,0.0
std,0.277345,0.43948,0.678207,0.517054,0.651997,0.0,0.0,0.0
min,-1.859984,0.363643,0.408509,0.138134,-2.753445,1.0,1.0,0.0
25%,-1.34789,1.093616,1.063343,1.037809,-1.481575,1.0,1.0,0.0
50%,-1.219867,1.506627,1.194309,1.427688,-0.936488,1.0,1.0,0.0
75%,-0.96382,1.823588,1.849143,1.724527,-0.663944,1.0,1.0,0.0
max,-0.426121,2.515141,3.289777,2.564834,1.062165,1.0,1.0,0.0


In [20]:
L1 = auto.loc[auto['class'] == 1]
L1.describe()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,0.558562,-0.806782,-0.606742,-0.644813,0.439727,2.0,0.114286,1.0
std,0.860821,0.216899,0.530745,0.578962,1.106776,0.0,0.467583,0.0
min,-0.938215,-1.201955,-1.529799,-1.351675,-1.227201,2.0,0.0,1.0
25%,0.060368,-0.969036,-0.901158,-1.065468,-0.391401,2.0,0.0,1.0
50%,0.380426,-0.851376,-0.704708,-0.861373,0.044669,2.0,0.0,1.0
75%,0.911724,-0.692896,-0.357646,-0.235499,1.207521,2.0,0.0,1.0
max,2.659243,-0.097392,0.749022,1.005319,3.351531,2.0,2.0,1.0


In [21]:
C2 = auto.loc[auto['cluster'] == 2]
C2.describe()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,2.582429,-0.973839,-1.451219,-0.986902,2.652002,2.0,2.0,1.0
std,0.070121,0.033617,0.052387,0.17393,0.552207,0.0,0.0,0.0
min,2.505615,-0.990647,-1.477412,-1.162643,2.152339,2.0,2.0,1.0
25%,2.534421,-0.990647,-1.477412,-1.074034,2.206848,2.0,2.0,1.0
50%,2.582429,-0.990647,-1.477412,-1.017915,2.588409,2.0,2.0,1.0
75%,2.630438,-0.973839,-1.451219,-0.930783,3.033563,2.0,2.0,1.0
max,2.659243,-0.923413,-1.372638,-0.749135,3.278852,2.0,2.0,1.0


In [22]:
L2 = auto.loc[auto['class'] == 2]
L2.describe()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cluster,class
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,0.8862,-0.86858,-0.643535,-0.883551,0.216245,3.0,0.0,2.0
std,0.779669,0.222259,0.466744,0.378652,0.710407,0.0,0.0,0.0
min,-0.707773,-1.182745,-1.372638,-1.602142,-1.517914,3.0,0.0,2.0
25%,0.278008,-1.029067,-0.979738,-1.162643,-0.355062,3.0,0.0,2.0
50%,1.033346,-0.923413,-0.770192,-0.961796,0.299043,3.0,0.0,2.0
75%,1.347003,-0.712105,-0.246325,-0.657572,0.716943,3.0,0.0,2.0
max,2.953697,-0.241465,0.722829,-0.046172,1.970643,3.0,0.0,2.0


There is no clear relationship between cluster assignment and class labels.


## Problem 2


In [23]:
dataset = load_boston()
boston = pd.DataFrame(data=dataset.data,
                  columns=dataset.feature_names)
boston_scaled = pd.DataFrame(data=preprocessing.scale(dataset.data),
                         columns=dataset.feature_names)

In [32]:
for i in range(2,7):
  clust_model = KMeans(n_clusters=i,
                     init='k-means++')
  clust_labels = clust_model.fit_predict(boston_scaled)
  silhouette_avg = metrics.silhouette_score(boston_scaled,
                                          clust_labels)
  print("For K =", i, "the Silhouette score is: ", silhouette_avg)

For K = 2 the Silhouette score is:  0.38694918786902555
For K = 3 the Silhouette score is:  0.2936276161771098
For K = 4 the Silhouette score is:  0.2830326613627112
For K = 5 the Silhouette score is:  0.30706446362603446
For K = 6 the Silhouette score is:  0.30321550900048433


The strongest Silhouette score is for k = 2, hence the number of 2 clusters is optimal.


In [33]:
clust_model = KMeans(n_clusters=2,
                     init='k-means++')
clust_model = clust_model.fit(boston_scaled)
clust_labels = clust_model.fit_predict(boston_scaled)

In [34]:
boston_scaled['cluster'] = clust_labels

In [43]:
i=0
print("\nMean Values for Cluster", i, ":")
print(boston_scaled.loc[boston_scaled['cluster'] == i].mean())
print("\nCluster Centroid for Cluster", i, ":")
for j in range(len(clust_model.cluster_centers_[i])):
  print(boston_scaled.columns[j]," ",clust_model.cluster_centers_[i][j])


Mean Values for Cluster 0 :
CRIM      -0.387086
ZN         0.244585
INDUS     -0.568511
CHAS      -0.003628
NOX       -0.562325
RM         0.226847
AGE       -0.403960
DIS        0.428455
RAD       -0.585428
TAX       -0.628576
PTRATIO   -0.302134
B          0.308279
LSTAT     -0.427835
cluster    0.000000
dtype: float64

Cluster Centroid for Cluster 0 :
CRIM   -0.3870856530541483
ZN   0.24458480602151983
INDUS   -0.5685114795069619
CHAS   -0.0036284845298841803
NOX   -0.562325424116676
RM   0.2268471389364735
AGE   -0.4039598342002781
DIS   0.4284554440110399
RAD   -0.585427987845243
TAX   -0.6285756497572984
PTRATIO   -0.3021337511469771
B   0.3082794092263455
LSTAT   -0.4278346416001986
cluster   0.296735905044507


In [44]:
i=1
print("\nMean Values for Cluster", i, ":")
print(boston_scaled.loc[boston_scaled['cluster'] == i].mean())
print("\nCluster Centroid for Cluster", i, ":")
for j in range(len(clust_model.cluster_centers_[i])):
  print(boston_scaled.columns[j]," ",clust_model.cluster_centers_[i][j])


Mean Values for Cluster 1 :
CRIM       0.771881
ZN        -0.487722
INDUS      1.133659
CHAS       0.007235
NOX        1.121323
RM        -0.452352
AGE        0.805529
DIS       -0.854376
RAD        1.167392
TAX        1.253432
PTRATIO    0.602480
B         -0.614735
LSTAT      0.853138
cluster    1.000000
dtype: float64

Cluster Centroid for Cluster 1 :
CRIM   0.7718808584570883
ZN   -0.48772236467013313
INDUS   1.133658985762395
CHAS   0.007235498737104778
NOX   1.1213234788598876
RM   -0.45235198711004343
AGE   0.8055293735236276
DIS   -0.8543756487084024
RAD   1.1673919047564931
TAX   1.25343191697165
PTRATIO   0.602479728618496
B   -0.6147346799365319
LSTAT   0.8531377172737665
cluster   1.9289940828402345


The centroid coordinates and mean values are identical.

## Problem 3


In [29]:
dataset = load_wine()
wine = pd.DataFrame(data=dataset.data,
                  columns=dataset.feature_names)
wine_scaled = pd.DataFrame(data=preprocessing.scale(dataset.data),
                         columns=dataset.feature_names)

In [30]:
wine_model = KMeans(n_clusters=3,
                     init='k-means++')
wine_labels = wine_model.fit_predict(wine_scaled)

In [31]:
h_score = metrics.homogeneity_score(dataset.target,
                                    wine_labels)
c_score = metrics.completeness_score(dataset.target,
                                     wine_labels)
print("h_score:", h_score)
print("c_score:", c_score)

h_score: 0.8788432003662368
c_score: 0.8729636016078732


A clustering result satisfies **homogeneity** if all of its clusters contain only data points which are members of a single class. A clustering result satisfies **completeness** if all the data points that are members of a given class are elements of the same cluster. Given the homogeneity score of 0.88 and completeness score of 0.87, we can conclude that 88% of the data points in a cluster are data points which are members of a single class, and 87% of the the data points that are members of a given class are elements of the same cluster. These are very good results. 