In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
%matplotlib inline

In [None]:
# Synthetic dataset: 1000 two-feature samples around 3 blob centers, seeded so it is repeatable.
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=3,
    random_state=23,
)



In [None]:
# First look at the generated points (figure number 0, grid enabled).
fig = plt.figure(num=0)
plt.grid(True)
plt.scatter(x=X[:, 0], y=X[:, 1])
plt.show()

In [None]:
# Hold out 33% of the samples; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method (manual): fit K-Means for k = 1..10 on the training data and
# record the within-cluster sum of squares (`inertia_`) for each k.
wcss = []
for k in range(1, 11):
    # random_state makes the centroid initialisation - and therefore the WCSS
    # curve - repeatable across runs. n_init is pinned explicitly because the
    # library default changed between scikit-learn releases.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X_train)
    wcss.append(kmeans.inertia_)

In [None]:
# Inspect the recorded WCSS values (one entry per k from 1 to 10).
wcss

In [None]:
# Elbow curve: WCSS plotted against each candidate number of clusters.
k_values = range(1, 11)
plt.plot(k_values, wcss)
plt.xticks(k_values)
plt.xlabel('number of clusterers')
plt.ylabel('wcss')
plt.show()

In [None]:
# Final model with k = 3. random_state and n_init are fixed so the cluster
# assignments (and their label numbering) are identical on every run.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)

In [None]:
# Fit the 3-cluster model on the training data and display the cluster label of each sample.
kmeans.fit_predict(X_train)

In [None]:
# The model was already fitted by the fit_predict call in the preceding cell;
# reuse its stored labels instead of refitting, so the labels plotted below are
# exactly the ones displayed above (a second unseeded fit may renumber clusters).
y_labels = kmeans.labels_

In [None]:
# Training points coloured by their K-Means cluster label.
plt.scatter(x=X_train[:, 0], y=X_train[:, 1], c=y_labels)

In [None]:
# knee locator
from kneed import KneeLocator

In [None]:
# Locate the elbow of the WCSS curve programmatically (curve is convex and decreasing).
kl = KneeLocator(
    x=range(1, 11),
    y=wcss,
    curve='convex',
    direction='decreasing',
)
kl.elbow

In [None]:
# performance metrics
# silhouette score

from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score for k = 2..10 (the score is undefined for a single cluster,
# hence the range starts at 2).
silhouette_coefficients = []
for k in range(2, 11):
    # Seeded so the scores are repeatable; n_init pinned because the library
    # default changed between scikit-learn releases.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X_train)
    score = silhouette_score(X_train, kmeans.labels_)
    silhouette_coefficients.append(score)

In [None]:
# Inspect the silhouette scores (one entry per k from 2 to 10).
silhouette_coefficients

In [None]:
# Silhouette score against each candidate number of clusters (K-Means).
candidate_ks = range(2, 11)
plt.plot(candidate_ks, silhouette_coefficients)
plt.xticks(candidate_ks)
plt.xlabel('number of clusterers')
plt.ylabel('silhouette_coefficients')
plt.show()

Hierarchical Clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [None]:
# Load the iris dataset bundled with scikit-learn.
iris=datasets.load_iris()

In [None]:
# Wrap the raw feature matrix in a DataFrame (column names are assigned in the next cell).
iris_data=pd.DataFrame(iris.data)

In [None]:
# Label the columns with the dataset's feature names.
iris_data.columns=iris.feature_names

In [None]:
# Display the iris feature table.
iris_data

In [None]:
# Scaler used to standardize the iris features before PCA and clustering.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on the iris features and return the standardized array.
X_scaled = scaler.fit_transform(iris_data)

In [None]:
# Display the standardized iris features.
X_scaled

In [None]:
# Check the (n_samples, n_features) shape of the standardized array.
X_scaled.shape

Dimensionality Reduction 

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA estimator configured to keep two components.
pca=PCA(n_components=2)

In [None]:
# Display the (not yet fitted) PCA estimator and its parameters.
pca

In [None]:
# Fit PCA on the standardized iris features and project them onto the two retained components.
pca_scaled = pca.fit_transform(X_scaled)

In [None]:
# Display the two-component projection of the iris samples.
pca_scaled

In [None]:
# Iris samples in the plane of the two retained principal components.
plt.scatter(x=pca_scaled[:, 0], y=pca_scaled[:, 1])

In [None]:
# Agglomerative clustering: draw a dendrogram of the PCA-projected iris samples.
import scipy.cluster.hierarchy as sc

plt.figure(figsize=(20, 7))

# Build the hierarchical linkage (Ward method) once, then render it.
linkage_matrix = sc.linkage(pca_scaled, method='ward')
sc.dendrogram(linkage_matrix)

# Title is set once (the original set it twice, the second call overwriting the
# first) and the spelling of the title / y-axis label is corrected.
plt.title('Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `metric`), so passing it raises TypeError on current releases.
# Ward linkage only works with Euclidean distance, which is already the
# default, so the argument is simply omitted - this runs on old and new versions.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
cluster.fit(pca_scaled)

In [None]:
# Cluster label assigned to each PCA-projected sample by the fitted agglomerative model.
cluster.labels_

In [None]:
# PCA projection coloured by agglomerative cluster label.
plt.scatter(x=pca_scaled[:, 0], y=pca_scaled[:, 1], c=cluster.labels_)

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score of Ward agglomerative clustering for k = 2..10.
# NOTE(review): this loop clusters and scores the standardized features
# (X_scaled), whereas the model fitted earlier used the PCA projection
# (pca_scaled) - confirm which representation is intended.
silhouette_coefficients = []

for k in range(2, 11):
    # `affinity` was removed in scikit-learn 1.4; Ward linkage uses Euclidean
    # distance by default, so the argument is omitted.
    agglo = AgglomerativeClustering(n_clusters=k, linkage='ward')
    agglo.fit(X_scaled)
    score = silhouette_score(X_scaled, agglo.labels_)
    silhouette_coefficients.append(score)
    

In [None]:
# Silhouette score against each candidate number of clusters (agglomerative).
cluster_counts = range(2, 11)
plt.plot(cluster_counts, silhouette_coefficients)
plt.xticks(cluster_counts)
plt.xlabel('number of clusterers')
plt.ylabel('silhouette_coefficients')
plt.show()

DBSCAN Clustering

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Two interleaving half-moons with a little noise. Seeded so the dataset - and
# every DBSCAN result derived from it below - is identical on each run.
X, y = make_moons(n_samples=250, noise=0.05, random_state=42)

In [None]:
# Display the generated two-moons coordinates.
X

In [None]:
# Raw two-moons points, before scaling or clustering.
plt.scatter(x=X[:, 0], y=X[:, 1])

In [None]:
# Feature scaling (standardization) for the two-moons data.
# Note: this rebinds `scaler`, replacing the instance fitted on the iris data earlier in the notebook.

from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
# Standardize the two-moons features (rebinds `X_scaled`, previously the scaled iris data).
X_scaled=scaler.fit_transform(X)

In [None]:
# DBSCAN estimator with neighbourhood radius eps=0.5 (all other parameters left at their defaults).
# The model is fitted on X_scaled in the next cell, so eps is measured in standardized units.
from sklearn.cluster import DBSCAN
dbscan=DBSCAN(eps=0.5)

In [None]:
# Run DBSCAN on the standardized two-moons data.
dbscan.fit(X_scaled)

In [None]:
# Cluster label per sample; DBSCAN uses -1 for points it treats as noise.
dbscan.labels_

In [None]:
# Original (unscaled) coordinates coloured by the DBSCAN cluster labels.
plt.scatter(x=X[:, 0], y=X[:, 1], c=dbscan.labels_)

In [None]:
# Same points coloured by the generator's own labels, for comparison with DBSCAN.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y)

PCA

In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the sampled points are repeatable.
np.random.seed(23)

# Class 1: 20 three-feature points drawn from a normal distribution with
# zero mean and identity covariance; labelled target = 1.
mu_vec1 = np.array([0,0,0])
cov_mat1 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class1_sample = np.random.multivariate_normal(mu_vec1, cov_mat1, 20)

df = pd.DataFrame(class1_sample,columns=['feature1','feature2','feature3'])
df['target'] = 1

# Class 2: same covariance, mean shifted to (1, 1, 1); labelled target = 0.
mu_vec2 = np.array([1,1,1])
cov_mat2 = np.array([[1,0,0],[0,1,0],[0,0,1]])
class2_sample = np.random.multivariate_normal(mu_vec2, cov_mat2, 20)

df1 = pd.DataFrame(class2_sample,columns=['feature1','feature2','feature3'])

df1['target'] = 0

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and produces the same 40-row frame with a fresh 0..39 index.
df = pd.concat([df, df1], ignore_index=True)

# Shuffle the rows so the two classes are interleaved (sampling all 40 rows
# without replacement is a permutation).
df = df.sample(40)

In [None]:
# Preview the first rows of the shuffled two-class dataset.
df.head()

In [None]:
# Interactive 3-D scatter of the three features, coloured by class.

import plotly.express as px

# Cast the target to text so plotly treats it as a discrete category.
target_as_text = df['target'].astype('str')
fig = px.scatter_3d(
    df,
    x=df['feature1'],
    y=df['feature2'],
    z=df['feature3'],
    color=target_as_text,
)

# Larger markers with a dark outline so individual points stay distinguishable.
marker_style = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(marker=marker_style, selector=dict(mode='markers'))

fig.show()

In [None]:

# Step 1 - Standardize the three feature columns in place (the target column is left untouched).

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

feature_block = df.iloc[:, 0:3]
df.iloc[:, 0:3] = scaler.fit_transform(feature_block)
     

In [None]:

# Step 2 - Covariance matrix of the three standardized features.
# np.cov treats each entry of the list as one variable, giving a 3x3 matrix.
feature_rows = [df.iloc[:, col] for col in range(3)]
covariance_matrix = np.cov(feature_rows)
print('Covariance Matrix:\n', covariance_matrix)

In [None]:
# Step 3 - Eigen-decomposition of the covariance matrix.
# np.linalg.eig returns the eigenvectors as the COLUMNS of `eigen_vectors`
# (eigen_vectors[:, i] pairs with eigen_values[i]) and does not guarantee any
# ordering of the eigenvalues.

eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)


In [None]:
# Display the eigenvalues of the covariance matrix.
eigen_values

In [None]:
# Display the eigenvector matrix (each column is one eigenvector).
eigen_vectors

In [None]:
# Select the two principal components.
# np.linalg.eig returns eigenvectors as the COLUMNS of `eigen_vectors`, in no
# guaranteed order, so slicing the first two ROWS (eigen_vectors[0:2]) does not
# yield principal components. Rank the eigenvectors by descending eigenvalue,
# keep the top two columns, and transpose so that each ROW of `pc` is one
# component - shape (2, 3), which is what the later `pc.T` projection expects.
top_two = np.argsort(eigen_values)[::-1][:2]
pc = eigen_vectors[:, top_two].T
pc

In [None]:


# Project the three feature columns onto the two components held in `pc`:
# (40, 3) features  x  (3, 2) transposed components  ->  (40, 2) scores.
feature_values = df.iloc[:, 0:3]
transformed_df = np.dot(feature_values, pc.T)

# Wrap the scores in a DataFrame and carry the class label across positionally.
new_df = pd.DataFrame(transformed_df, columns=['PC1', 'PC2'])
new_df['target'] = df['target'].values
new_df.head()

In [None]:

# 2-D scatter of the projected data, coloured by class.
# The target is cast to text so plotly treats it as a discrete category.
new_df['target'] = new_df['target'].astype('str')

fig = px.scatter(
    x=new_df['PC1'],
    y=new_df['PC2'],
    color=new_df['target'],
    color_discrete_sequence=px.colors.qualitative.G10,
)

# Larger markers with a dark outline so individual points stay distinguishable.
outlined_marker = dict(size=12, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(marker=outlined_marker, selector=dict(mode='markers'))
fig.show()
     