# Unsupervised Clustering Techniques

## SAMBHAV AGRAWAL 19264 DATA SCIENCE AND ENGINEERING

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

In [None]:
# Load the header-less two-column CSV and label its columns in one step.
df = pd.read_csv('/content/data.csv', header=None).set_axis(
    ['Feature1', 'Feature2'], axis='columns'
)

df

In [None]:
# Scatter of the raw, unlabelled points.
plt.figure(figsize=(10, 6))

ax = sns.scatterplot(
    x='Feature1',
    y='Feature2',
    data=df,
    s=20,
)
ax.set_title('Points to be Clustered')

## The number of clusters is 2 (as given in the question). If we did not know the number of clusters, we could use an elbow plot to estimate it.

In [None]:
# Standardize both features and keep the result as a DataFrame with the
# original column names.
scaler = StandardScaler()

dfs = pd.DataFrame(
    scaler.fit_transform(df[['Feature1', 'Feature2']]),
    columns=['Feature1', 'Feature2'],
)

dfs.head()

In [None]:
# Elbow search: fit KMeans for k = 1..10 and record each fit's inertia
# (sum of squared distances of the samples to their closest centroid).
feature_cols = ['Feature1', 'Feature2']
inertias = []

for i in range(1, 11):
    # Fixed seed so the curve is reproducible across runs. The feature columns
    # are selected explicitly so a 'Cluster' column added to `df` by a later
    # cell can never leak into the fit when this cell is re-run.
    km = KMeans(n_clusters=i, random_state=1).fit(df[feature_cols])
    inertias.append(km.inertia_)

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(range(1, 11)), y=inertias, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')

# Annotation arrows are hand-placed in data coordinates.
# NOTE(review): re-check these positions if the data or the inertia scale changes.
arrow_style = dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2)

ax.annotate('Possible Elbow Point', xy=(3, 14), xytext=(3, 5), xycoords='data',
            arrowprops=arrow_style)

ax.annotate('Possible Elbow Point', xy=(5, 8), xytext=(5, 15), xycoords='data',
            arrowprops=arrow_style)

plt.show()

## Different Types of Clustering Techniques

## 1) Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering, KMeans

# Two-cluster KMeans on the raw features; `km` is reused by the following cells.
km = KMeans(n_clusters=2).fit(df)

# Colour each point by the cluster KMeans predicts for it.
plt.scatter(df['Feature1'], df['Feature2'], c=km.predict(df))

In [None]:
# Spectral clustering with the LOBPCG eigen-solver.
clustering = SpectralClustering(n_clusters=2, assign_labels='discretize',
                                eigen_solver='lobpcg')
clustering.fit(df)

# Colour by this model's own labels. SpectralClustering has no predict(), and
# colouring by `km.predict(df)` would just redraw the KMeans result.
plt.title('Spectral Clustering (eigen_solver = lobpcg)')
plt.scatter(df['Feature1'], df['Feature2'], c=clustering.labels_)

In [None]:
# Spectral clustering with the ARPACK eigen-solver.
clustering = SpectralClustering(n_clusters=2, assign_labels='discretize',
                                eigen_solver='arpack')
clustering.fit(df)

# Colour by this model's own labels. SpectralClustering has no predict(), and
# colouring by `km.predict(df)` would just redraw the KMeans result.
plt.title('Spectral Clustering (eigen_solver = arpack)')
plt.scatter(df['Feature1'], df['Feature2'], c=clustering.labels_)

## 2) Hierarchical (Agglomerative) Clustering


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage on the standardized features. Ward only works with Euclidean
# distances, which is already the estimator's default, so no distance keyword
# is passed. The old `affinity` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4; omitting it keeps this cell working on every version.
clusters_Agglomerative = AgglomerativeClustering(
    n_clusters=2, linkage='ward'
).fit_predict(dfs)


In [None]:
# Standardized points coloured by their agglomerative cluster label.
plt.scatter(
    x=dfs['Feature1'],
    y=dfs['Feature2'],
    s=60,
    c=clusters_Agglomerative,
)

plt.show()

## Tuning the Parameters

In [None]:
# Complete linkage with L1 (Manhattan) distance on the raw features.
# `metric` replaces the `affinity` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (so this needs scikit-learn >= 1.2).
clusters_Agglomerative = AgglomerativeClustering(
    n_clusters=2, metric='l1', linkage='complete'
).fit_predict(df)


In [None]:
# Standardized points coloured by the latest agglomerative labels.
plt.scatter(
    x=dfs['Feature1'],
    y=dfs['Feature2'],
    s=60,
    c=clusters_Agglomerative,
)

plt.show()

In [None]:
# Average linkage with L1 (Manhattan) distance on the raw features.
# `metric` replaces the `affinity` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4 (so this needs scikit-learn >= 1.2).
clusters_Agglomerative = AgglomerativeClustering(
    n_clusters=2, metric='l1', linkage='average'
).fit_predict(df)


In [None]:
# Standardized points coloured by the latest agglomerative labels.
plt.scatter(
    x=dfs['Feature1'],
    y=dfs['Feature2'],
    s=60,
    c=clusters_Agglomerative,
)

plt.show()

## 4) DBSCAN

In [None]:
# DBSCAN on the standardized features with hand-set eps / min_samples.
db = DBSCAN(eps=0.3088, min_samples=81).fit(dfs)

labels = db.labels_

print('Labels:', np.unique(labels), '\n')

# DBSCAN assigns the label -1 to noise points.
print('Outliers:', labels.tolist().count(-1), '\n')

fig, ax = plt.subplots(figsize=(10, 6))

ax.set_title('Clusters')

# x / y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. `ax` is passed explicitly so the plot lands on the figure
# created above.
sns.scatterplot(x=dfs['Feature1'], y=dfs['Feature2'],
                hue=[f'cluster : {i}' for i in labels], ax=ax)

plt.show()

Here, the green dots are the noise points.

## kDistance Graph

In [None]:
from sklearn.neighbors import NearestNeighbors

# Fit and query on the SAME standardized data. Querying with the unscaled `df`
# against a model fitted on `dfs` would measure distances between two
# different coordinate systems.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(dfs[['Feature1', 'Feature2']])
distances, indices = nbrs.kneighbors(dfs[['Feature1', 'Feature2']])

# `distances` has one row per point and one column per neighbour. Because the
# query set is the training set, column 0 is normally each point's distance to
# itself (0), and column 1 the distance to its nearest other point.

# Plotting K-distance Graph: sort the nearest-neighbour distances ascending;
# the "knee" of the curve is a candidate value for DBSCAN's eps.
distances = np.sort(distances, axis=0)
distances = distances[:, 1]
plt.figure(figsize=(8, 4))
plt.plot(distances)
plt.title('K-distance Graph', fontsize=10)
plt.xlabel('Data Points sorted by distance', fontsize=10)
plt.ylabel('Epsilon', fontsize=10)
plt.show()

## 5) kMeans

In [None]:
# Over-segment with 8 seeded KMeans clusters and store the label on `df`.
kms = KMeans(n_clusters=8, random_state=1)
kms.fit(df[['Feature1', 'Feature2']])
df['Cluster'] = kms.labels_

df

In [None]:
# Raw points coloured by the KMeans label stored in df['Cluster'].
plt.figure(figsize=(10, 6))

ax = sns.scatterplot(
    x="Feature1",
    y="Feature2",
    hue='Cluster',
    data=df,
    palette='tab10',
    legend=True,
    s=20,
)

ax.legend(loc='lower right', title='Cluster')
ax.set_title("Final Clustered Points")

In [None]:
# Collapse the 8 KMeans sub-clusters into the 2 final clusters.
#
# Work on a copy: `new_df = df` would only alias `df`, so the relabelling would
# overwrite df['Cluster'] and re-running the cell would re-map the already
# merged 0/1 labels (everything ends up as 1).
new_df = df.copy()

# Hand-picked grouping of the sub-cluster ids produced by
# KMeans(n_clusters=8, random_state=1).
to_cluster_0 = [2, 3, 5, 7]
to_cluster_1 = [0, 1, 4, 6]
merge_map = {**dict.fromkeys(to_cluster_0, 0), **dict.fromkeys(to_cluster_1, 1)}

# Single simultaneous mapping, so no temporary placeholder labels are needed.
new_df['Cluster'] = df['Cluster'].map(merge_map)

assert new_df['Cluster'].notna().all(), "df['Cluster'] contains ids outside 0..7"

In [None]:

# Raw points coloured by the merged two-cluster label.
plt.figure(figsize=(10, 6))

ax = sns.scatterplot(
    x="Feature1",
    y="Feature2",
    hue='Cluster',
    data=new_df,
    palette='tab10',
    legend=True,
    s=20,
)

ax.legend(loc='lower right', title='Clusters')
ax.set_title("Clustered Points for n_clusters equal to 8")

In [None]:
# Mean of the 0/1 labels = fraction of points assigned to cluster 1.
new_df.loc[:, 'Cluster'].mean()

In [None]:
# Write one label per line, with no header row and no index column.
new_df['Cluster'].to_csv("labels.csv", header=False, index=False)



In [None]:
# Read back the file written by the previous cell ("labels.csv").
# "labels.txt" is only created further down the notebook, so reading it here
# fails on a fresh run or silently picks up a stale file.
df_t = pd.read_csv("labels.csv", header=None)
df_t

In [None]:
# Column-wise mean of the labels read back from disk.
df_t.mean(axis=0)

In [None]:
# Repeat the over-segmentation with 10 seeded KMeans clusters; this
# overwrites df['Cluster'].
kms = KMeans(n_clusters=10, random_state=1)
kms.fit(df[['Feature1', 'Feature2']])
df['Cluster'] = kms.labels_

df

In [None]:
# Raw points coloured by the KMeans label stored in df['Cluster'].
plt.figure(figsize=(10, 6))

ax = sns.scatterplot(
    x="Feature1",
    y="Feature2",
    hue='Cluster',
    data=df,
    palette='tab10',
    legend=True,
    s=20,
)

ax.legend(loc='lower right', title='Cluster')
ax.set_title("Final Clustered Points")

In [None]:
# Collapse the 10 KMeans sub-clusters into the 2 final clusters.
#
# Work on a copy: `new_df = df` would only alias `df`, so the relabelling would
# overwrite df['Cluster'] and re-running the cell would re-map the already
# merged 0/1 labels.
new_df = df.copy()

# Hand-picked grouping of the sub-cluster ids produced by
# KMeans(n_clusters=10, random_state=1).
to_cluster_0 = [2, 4, 6, 8, 9]
to_cluster_1 = [0, 1, 3, 5, 7]
merge_map = {**dict.fromkeys(to_cluster_0, 0), **dict.fromkeys(to_cluster_1, 1)}

# Single simultaneous mapping, so no temporary placeholder labels are needed.
new_df['Cluster'] = df['Cluster'].map(merge_map)

assert new_df['Cluster'].notna().all(), "df['Cluster'] contains ids outside 0..9"

In [None]:
# Preview the first rows of the relabelled frame.
new_df.head(n=5)

In [None]:

# Raw points coloured by the merged two-cluster label.
plt.figure(figsize=(10, 6))

ax = sns.scatterplot(
    x="Feature1",
    y="Feature2",
    hue='Cluster',
    data=new_df,
    palette='tab10',
    legend=True,
    s=20,
)

ax.legend(loc='lower right', title='Clusters')
ax.set_title("Clustered Points")

In [None]:
# Mean of the 0/1 labels = fraction of points assigned to cluster 1.
new_df.loc[:, 'Cluster'].mean()

As we can see, kMeans is able to separate the 2 given clusters.

In [None]:
# Write one label per line with no header row. The file is read back with
# header=None, so a "Cluster" header line would be parsed as an extra data row
# and turn the column into strings.
new_df['Cluster'].to_csv("labels.txt", header=False, index=False)

In [None]:
# Reload the saved label file (no header row expected) and report its shape.
df_t = pd.read_csv(
    "labels.txt",
    header=None,
)
df_t.shape

For a perfect 50-50 split of the points between the two clusters, the mean of the labels should be 0.5, because each cluster contains 1500 points. Ours is very close to that, at approximately 0.51.

## We got mean of the cluster label values as follows:

For n_clusters = 8 in kMeans Mean came out to be 0.5063333333333333 
For n_clusters = 10 in kMeans Mean came out to be 0.5136666666666667

Since the mean for n_clusters = 8 is closer to 0.5, kMeans performs better with n_clusters = 8.

## I already saved the labels file above for kMeans with n_clusters = 8.

In [None]:
# First rows of the labels read back from disk.
df_t.head(n=5)

In [None]:
# Summary statistics of the saved labels (default percentiles).
df_t.describe(percentiles=None)