In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data

--2021-04-25 12:44:10--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4551 (4.4K) [application/x-httpd-php]
Saving to: ‘iris.data’


2021-04-25 12:44:10 (84.5 MB/s) - ‘iris.data’ saved [4551/4551]

--2021-04-25 12:44:10--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10782 (11K) [application/x-httpd-php]
Saving to: ‘wine.data’


2021-04-25 12:44:11 (89.5 MB/s) - ‘wine.data’ saved [10782/10782]

--2021-04-25 12:44:11--  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data


In [3]:
from scipy.spatial import distance
class Particle:
  def __init__(self, n_clusters, data):
    self.cur_pos = data[np.random.choice(list(range(len(data))), n_clusters)]
    self.pb_pos = self.cur_pos.copy()
    self.velocity = np.zeros_like(self.cur_pos)
    self.pb_val = np.inf

def initParticles(data, n_clusters, n_particles):
  particles = []
  for i in range(n_particles):
    particles.append(Particle(n_clusters, data))
  return particles

def getDistances(data, particle):
  distances = []
  for centroid in particle.cur_pos:
    d = np.linalg.norm(data - centroid, axis = 1)
    distances.append(d)
  distances = np.array(distances)
  return distances

def fitnessFunction(clusters, n_clusters, distances):
  J = 0.0
  for i in range(n_clusters):
    p = np.where(clusters == i)[0]
    if len(p):
      d = sum(distances[i][p])
      d /= len(p)
      J += d
  J /= n_clusters
  return J

In [4]:
def updatePbAndGb(data, particle, n_clusters, gb_val, gb_pos, gb_clusters):
  distances = getDistances(data, particle)
  clusters = np.argmin(distances, axis=0)  # shape: (len(data),)
  clusters_ids = np.unique(clusters)

  while len(clusters_ids) != n_clusters:
    deleted_clusters = np.where(np.isin(np.arange(n_clusters), clusters_ids) == False)[0]
    particle.cur_pos[deleted_clusters] = data[np.random.choice(list(range(len(data))), len(deleted_clusters))]
    distances = getDistances(data, particle)
    clusters = np.argmin(distances, axis=0)
    clusters_ids = np.unique(clusters)
  
  fitnessVal = fitnessFunction(clusters, n_clusters, distances)
  if fitnessVal < particle.pb_val:
    particle.pb_val = fitnessVal
    particle.pb_pos = particle.cur_pos.copy()
  
  if particle.pb_val < gb_val:
    gb_val = particle.pb_val
    gb_pos = particle.pb_pos
    gb_clusters = clusters.copy()
  return gb_val, gb_pos, gb_clusters

def updateVelocity(particle, gb_pos, w=0.72, c1=1.49, c2=1.49):
  previousVelFrac = w * particle.velocity
  cognitiveComp = c1 * np.random.random() * (particle.pb_pos - particle.cur_pos)
  socialComp = c2 * np.random.random() * (gb_pos - particle.cur_pos)
  particle.velocity = previousVelFrac + cognitiveComp + socialComp

def updateCentroids(particle, gb_pos):
  updateVelocity(particle, gb_pos)
  particle.cur_pos = particle.cur_pos + particle.velocity

In [5]:
# Get Clustering given by gbest PSO algorithm
def gbestPSOClustering(n_iters, data, n_clusters, n_particles, w=0.72, c1=1.49, c2=1.49): 
  
  particles = initParticles(data, n_clusters, n_particles)
  
  gb_val = np.inf
  gb_pos = None
  gb_clusters = None

  for i in range(n_iters):
    for particle in particles:
      gb_val, gb_pos, gb_clusters = updatePbAndGb(data, particle, n_clusters, gb_val, gb_pos, gb_clusters)
  
    for particle in particles:
      updateCentroids(particle, gb_pos)
  return gb_clusters, gb_pos, gb_val

def getIntraClusterDistance(data):
  data = data[~np.isnan(data)]
  d = 0
  ln = len(data)
  if ln <= 1:
    return 0
  total = (ln * (ln - 1)) // 2
  for i in range(ln):
    for j in range(ln):
      if j <= i:
        continue
      d += distance.euclidean(data[j], data[i])
  d = d / total
  return d

In [6]:
data_points = pd.read_csv('iris.data', sep=',', header=None)
clusters, centroids, quantError = gbestPSOClustering(10000, data_points.iloc[:, :4].values, 3, 10)

In [7]:
print("Quantization Error : ", quantError)

Quantization Error :  0.5308110415892776


In [8]:
class0 = []
class1 = []
class2 = []
for i in range(len(clusters)):
  if clusters[i] == 0:
    class0.append(i)
  elif clusters[i] == 1:
    class1.append(i)
  elif clusters[i] == 2:
    class2.append(i)
dis0 = getIntraClusterDistance(data_points.iloc[class0].iloc[:, :4].values)
dis1 = getIntraClusterDistance(data_points.iloc[class1].iloc[:, :4].values)
dis2 = getIntraClusterDistance(data_points.iloc[class2].iloc[:, :4].values)
print("Intra Cluster Distances")
print("Class 0 : ", dis0)
print("Class 1 : ", dis1)
print("Class 2 : ", dis2)

Intra Cluster Distances
Class 0 :  2.107974874371836
Class 1 :  2.266666666666667
Class 2 :  2.1516289477049693


In [9]:
print("Inter Cluster Distances")
for i in range(len(centroids)):
  for j in range(len(centroids)):
    if j <= i:
      continue
    print("Class ", i, "-", j, " : ", distance.euclidean(centroids[i], centroids[j]))

Inter Cluster Distances
Class  0 - 1  :  1.0548223105810384
Class  0 - 2  :  3.8624245942857196
Class  1 - 2  :  4.15130645980796


In [10]:
data_points = pd.read_csv('wine.data', sep=',', header=None)
clusters, centroids, quantError = gbestPSOClustering(1000, data_points[[1, 2, 3]].values, 3, 10)

In [11]:
print("Quantization Error : ", quantError)

Quantization Error :  0.706252344223874


In [12]:
class0 = []
class1 = []
class2 = []
for i in range(len(clusters)):
  if clusters[i] == 0:
    class0.append(i)
  elif clusters[i] == 1:
    class1.append(i)
  elif clusters[i] == 2:
    class2.append(i)
dis0 = getIntraClusterDistance(data_points.iloc[class0].iloc[:, 1:].values)
dis1 = getIntraClusterDistance(data_points.iloc[class1].iloc[:, 1:].values)
dis2 = getIntraClusterDistance(data_points.iloc[class2].iloc[:, 1:].values)
print("Intra Cluster Distances")
print("Class 0 : ", dis0)
print("Class 1 : ", dis1)
print("Class 2 : ", dis2)

Intra Cluster Distances
Class 0 :  104.49960609510711
Class 1 :  170.86538864780604
Class 2 :  99.48179393699853


In [13]:
print("Inter Cluster Distances")
for i in range(len(centroids)):
  for j in range(len(centroids)):
    if j <= i:
      continue
    print("Class ", i, "-", j, " : ", distance.euclidean(centroids[i], centroids[j]))

Inter Cluster Distances
Class  0 - 1  :  1.7164659290589859
Class  0 - 2  :  1.9877125522722632
Class  1 - 2  :  1.480726265293889


In [14]:
data_points = pd.read_csv('breast-cancer.data', sep=',', header=None)
data_points.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [15]:
# replace 2nd, 4rd, 5th column with the avg. values
data_points[1] = data_points[1].str.split('-').apply(lambda x: (float(x[0]) + float(x[1]))/2)
data_points[3] = data_points[3].str.split('-').apply(lambda x: (float(x[0]) + float(x[1]))/2)
data_points[4] = data_points[4].str.split('-').apply(lambda x: (float(x[0]) + float(x[1]))/2)

# map columns with string with appropriate numbers
map2 = {'lt40' : 0, 'ge40' : 1, 'premeno' : 2}
data_points[2] = data_points[2].map(map2)

map5_9 = {'yes' : 1, 'no' : 0}
data_points[5] = data_points[5].map(map5_9)
data_points[9] = data_points[9].map(map5_9)

map7 = {'left' : 0, 'right' : 1}
data_points[7] = data_points[7].map(map7)

map8 = {'left_up' : 0, 'left_low' : 1, 'right_up' : 2, 'right_low' : 3, 'central' : 4}
data_points[8] = data_points[8].map(map8)

data_points.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,no-recurrence-events,34.5,2,32.0,1.0,0.0,3,0,1.0,0
1,no-recurrence-events,44.5,2,22.0,1.0,0.0,2,1,2.0,0
2,no-recurrence-events,44.5,2,22.0,1.0,0.0,2,0,1.0,0
3,no-recurrence-events,64.5,1,17.0,1.0,0.0,2,1,0.0,0
4,no-recurrence-events,44.5,2,2.0,1.0,0.0,2,1,3.0,0


In [16]:
clusters, centroids, quantError = gbestPSOClustering(1000, data_points[[1, 2, 3]].values, 3, 10)

In [17]:
print("Quantization Error : ", quantError)

Quantization Error :  8.486893901417854


In [18]:
class0 = []
class1 = []
for i in range(len(clusters)):
  if clusters[i] == 0:
    class0.append(i)
  elif clusters[i] == 1:
    class1.append(i)
dis0 = getIntraClusterDistance(data_points.iloc[class0].iloc[:, 1:].values)
dis1 = getIntraClusterDistance(data_points.iloc[class1].iloc[:, 1:].values)
print("Intra Cluster Distances")
print("Class 0 : ", dis0)
print("Class 1 : ", dis1)

Intra Cluster Distances
Class 0 :  15.709882937946752
Class 1 :  13.642235914318126


In [19]:
print("Inter Cluster Distances")
for i in range(len(centroids)):
  for j in range(len(centroids)):
    if j <= i:
      continue
    print("Class ", i, "-", j, " : ", distance.euclidean(centroids[i], centroids[j]))

Inter Cluster Distances
Class  0 - 1  :  10.006654701515737
Class  0 - 2  :  22.844933151952347
Class  1 - 2  :  14.207853171355922
