# Alexine Studios

## Download Dataset

In [None]:
!mkdir datasets
!wget -qq https://raw.githubusercontent.com/chiruharshith/Alexine_Studios/main/datasets/Iris.csv -P datasets
!wget -qq https://raw.githubusercontent.com/chiruharshith/Alexine_Studios/main/datasets/datapoints.csv -P datasets

In [None]:
# importing required packages
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

#### Calculate the Euclidean distance between two given points

In [None]:
# Two points in the plane, each written as [x, y]
p1, p2 = [1, -5], [2, 4]

# Split each point into its x and y coordinates
(x1, y1), (x2, y2) = p1, p2

# Euclidean distance: square root of the summed squared coordinate differences
np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

In [None]:
# A second pair of points, each written as [x, y]
p1, p2 = [2, 3], [5, 6]

# Split each point into its x and y coordinates
(x1, y1), (x2, y2) = p1, p2

# Euclidean distance: square root of the summed squared coordinate differences
np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

In [None]:
from scipy.spatial import distance
# Same distance computed with SciPy's helper, using the p1 and p2 currently defined in the notebook
distance.euclidean(p1,p2)

In [None]:
# Turn both points into NumPy arrays so they can be subtracted element-wise
p1, p2 = np.array(p1), np.array(p2)

# The norm of the difference vector is the Euclidean distance between the points
np.linalg.norm(p1 - p2)

In [None]:
# Location of the datapoints CSV inside the local datasets folder
dataset = "datasets/datapoints.csv"

# Read the CSV into a DataFrame and display it
df = pd.read_csv(dataset)
df

In [None]:
# Scatter all data points (V1 on the x-axis, V2 on the y-axis) as small red dots
fig, ax = plt.subplots()
ax.scatter(df['V1'], df['V2'], c='red', s=7);

#### Randomly choose the k cluster centers from the data loaded above (k is 3)



1.   Draw 3 random values (size = 3) between 0 and the maximum of V1 and use them as the X coordinates
2.   Draw 3 random values (size = 3) between 0 and the maximum of V2 and use them as the Y coordinates
3.   Combine X and Y with zip to form the cluster centers 'C' (the datatype should be float).


In [None]:
# Pull both feature columns out of the DataFrame as plain NumPy arrays
V1, V2 = (df[column].values for column in ('V1', 'V2'))

# Both arrays should have the same number of entries
len(V1), len(V2)

In [None]:
# Largest V1 and V2 values; they are used as the upper bounds when drawing the random centroids
max(V1), max(V2)

In [None]:
# Number of clusters to initialise
k = 3

# Draw k random integer coordinates per axis, each in [0, max of that feature)
x, y = (np.random.randint(0, max(feature), size=k) for feature in (V1, V2))

# Pair the coordinates up into a (k, 2) float32 array of cluster centers
centroids = np.column_stack((x, y)).astype(np.float32)
centroids

In [None]:
# Data points as faint black dots, with the random centroids drawn on top as blue stars
fig, ax = plt.subplots()
ax.scatter(V1, V2, c='black', marker='.', s=5, alpha=0.3)
ax.scatter(x, y, marker='*', c='blue', s=200)
plt.show()

#### Find the nearest cluster for one data point

1.   Randomly select an initial data point (by index) from the dataset loaded above
2.   Get the V1 and V2 values of that data point (say f1 and f2)
3.   Calculate the distance from the chosen point (f1, f2) to each of the k selected cluster centers
4.   Find which cluster center is nearest to the chosen point (f1, f2)





In [None]:
# Number of rows (data points) in the dataset
df.shape[0]

In [None]:
# Preview the first few rows of the dataset
df.head()

In [None]:
# Pick one row position uniformly at random. The upper bound comes from the actual
# number of rows rather than a hard-coded 3000, so every row can be chosen and the
# lookup cannot go out of range if the dataset size differs.
random_row = np.random.randint(0, len(df))

# Select the two feature columns explicitly so the unpacking always yields exactly (V1, V2)
f1, f2 = df[['V1', 'V2']].iloc[random_row]
f1, f2

In [None]:
# Show the randomly initialised cluster centers
centroids

In [None]:
# The sampled data point written as a [V1, V2] pair
[f1,f2]

In [None]:
from scipy.spatial import distance

def euclidian_distance(p1, p2):
    """Return the Euclidean (straight-line) distance between two points.

    Parameters
    ----------
    p1, p2 : array-like
        Coordinates of the two points; both are handed to
        ``scipy.spatial.distance.euclidean`` unchanged.

    Returns
    -------
    float
        The distance computed by SciPy.
    """
    separation = distance.euclidean(p1, p2)
    return separation

In [None]:
# Sample one data point at random. The bound comes from the real row count instead of a
# hard-coded 3000, and the V1/V2 columns are selected explicitly for the two-value unpack.
f1, f2 = df[['V1', 'V2']].iloc[np.random.randint(0, len(df))]

# Track the best (smallest) distance seen so far and which centroid produced it
closest_centroid_index = 0
nearest_distance = float("inf")

for index, each_point in enumerate(centroids):
    actual_distance = euclidian_distance([f1, f2], each_point)
    # Strict "<" keeps the first centroid when two are equally close
    if actual_distance < nearest_distance:
        nearest_distance = actual_distance
        closest_centroid_index = index

print(closest_centroid_index, nearest_distance)

# One colour per centroid; the sampled point is drawn in the colour of its nearest centroid
colors = ['red','green','blue']
plt.scatter(V1, V2, c='black', marker='.',s=5, alpha=0.1)
plt.scatter(x,y, marker='*', c=colors, s=200)
plt.scatter(f1,f2, c=colors[closest_centroid_index])
plt.show()

#### Apply the K-means algorithm

In [None]:
from sklearn.cluster import KMeans

# Fit K-means with 2 clusters on the V1/V2 columns and get one cluster label per row
clf = KMeans(n_clusters=2)
labels = clf.fit_predict(df[['V1','V2']])

In [None]:
# Cluster label assigned to each row by the fitted K-means model
labels

In [None]:
# Number of labels returned by fit_predict
len(labels)

In [None]:
# Distinct cluster ids that appear in the labels
set(labels)

#### Elbow method


1. For each k from 1 to 10, fit a K-means model to the data
2. Store the distortion of each fitted model (`model.inertia_`)
3. Plot the distortions against k and look for the elbow point

In [None]:
# Candidate cluster counts to evaluate: 1 through 10
k_range = range(1, 11)

# Features used for clustering
features = df[['V1', 'V2']]

# Fit one K-means model per candidate and record its inertia (within-cluster sum of squares).
# The comprehension keeps its loop variable local, so the notebook-level `k` (number of
# centroids chosen earlier) and `clf` are not silently overwritten by this sweep.
cluster_variation = [
    KMeans(n_clusters=n_clusters).fit(features).inertia_
    for n_clusters in k_range
]

In [None]:
# Inertia values recorded for each k in k_range, in order
cluster_variation

In [None]:
# Inertia against the number of clusters; the "elbow" marks where adding clusters stops helping much
fig, ax = plt.subplots()
ax.plot(k_range, cluster_variation, 'bx-')
ax.set_xlabel('k value')
ax.set_ylabel('Inertia')
ax.set_title('The Elbow method showing the optimal K')
plt.show()

#### Perform the K-means algorithm


1.   Pick the best k value using the elbow plot above
2.   Fit K-means with that k and predict the cluster label of each data point

In [None]:
# Refit K-means with 3 clusters on the V1/V2 columns and get one cluster label per row
clf = KMeans(n_clusters=3)
labels = clf.fit_predict(df[['V1','V2']])

In [None]:
# Distinct cluster ids that appear in the labels
set(labels)

In [None]:
# Fitted cluster centers, one row per cluster, with columns in the order (V1, V2)
centers = clf.cluster_centers_

# Separate the x (V1) and y (V2) coordinates for plotting
cx, cy = centers[:, 0], centers[:, 1]

In [None]:
# Data points coloured by their cluster label, with the fitted centers drawn as red stars
fig, ax = plt.subplots()
ax.scatter(V1, V2, c=labels, cmap='viridis', marker='.', s=5, alpha=0.5)
ax.scatter(cx, cy, marker='*', c='red', s=200, alpha=0.7)
plt.show()

#### Plot the Dendrogram using V1 and V2 values from the given data

In [None]:
import scipy.cluster.hierarchy as sch 

# Large canvas so the dendrogram is easier to read
plt.figure(figsize=(20, 15))

# Build the hierarchical merge tree with Ward linkage, then draw it
ward_links = sch.linkage(df[['V1','V2']], method='ward')
sch.dendrogram(ward_links)
plt.show()

#### Agglomerative clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Ward linkage only works with Euclidean distances, which is also the estimator's default,
# so no distance argument is passed. The former `affinity='euclidean'` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
model = AgglomerativeClustering(n_clusters=3, linkage='ward')

# Fit on the V1/V2 columns and get one cluster label per row
labels = model.fit_predict(df[['V1','V2']])

In [None]:
# Cluster label assigned to each row by the agglomerative model
labels

In [None]:
# Merge structure of the fitted hierarchy: the children of each non-leaf node
model.children_