### Hierarchical clustering in SciPy

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
from matplotlib import pyplot as plt
import seaborn as sns, pandas as pd

# Sample 2-D observations, kept as two parallel coordinate lists.
x_coordinates = [
    80.1, 93.1, 86.6, 98.5, 86.4,
    9.5, 15.2, 3.4, 10.4, 20.3,
    44.2, 56.8, 49.2, 62.5, 44.0,
]
y_coordinates = [
    87.2, 96.1, 95.6, 92.4, 92.4,
    57.7, 49.4, 47.3, 59.1, 55.5,
    25.6, 2.1, 10.9, 24.1, 10.3,
]

# One row per observation, one column per coordinate.
df = pd.DataFrame({
    'x_coordinate': x_coordinates,
    'y_coordinate': y_coordinates,
})

# Build the linkage matrix with Ward's method, then cut the hierarchy
# into at most three flat clusters and store the label of each row.
Z = linkage(df, method='ward')
df['cluster_labels'] = fcluster(Z, t=3, criterion='maxclust')

# Colour every point by the cluster it was assigned to.
sns.scatterplot(data=df, x='x_coordinate', y='y_coordinate',
                hue='cluster_labels')
plt.show()

### K-means clustering in SciPy

In [None]:
from scipy.cluster.vq import kmeans, vq
from matplotlib import pyplot as plt
import seaborn as sns, pandas as pd
import random
import numpy as np

# Seed NumPy's global random state so the k-means run is repeatable.
# SciPy's kmeans() picks its initial centroids through NumPy, not through the
# standard-library `random` module, so random.seed() had no effect on it (and
# a tuple seed is rejected by random.seed() on Python 3.11+).
np.random.seed((1000, 2000))

x_coordinates = [80.1, 93.1, 86.6, 98.5, 86.4, 9.5, 15.2, 3.4,
                 10.4, 20.3, 44.2, 56.8, 49.2, 62.5, 44.0]
y_coordinates = [87.2, 96.1, 95.6, 92.4, 92.4, 57.7, 49.4,
                 47.3, 59.1, 55.5, 25.6, 2.1, 10.9, 24.1, 10.3]
df = pd.DataFrame({'x_coordinate': x_coordinates, 'y_coordinate': y_coordinates})

# Step 1: compute three cluster centres (the returned distortion is unused).
centroids, _ = kmeans(df, 3)
# Step 2: assign every observation to its nearest centre.
df['cluster_labels'], _ = vq(df, centroids)

sns.scatterplot(x='x_coordinate', y='y_coordinate',
                hue='cluster_labels', data=df)
plt.show()

### Normalization of data before clustering

In [None]:
from scipy.cluster.vq import whiten
# Raw one-dimensional observations to be standardised.
data = [
    5, 1, 3, 3, 2, 3, 3,
    8, 1, 2, 2, 3, 5,
]

# whiten() rescales the observations by their standard deviation.
scaled_data = whiten(data)

## Hierarchical Clustering
### Linkage Function
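- 'single' : based on the distance between the two closest objects of two clusters
- 'complete' : based on the distance between the two farthest objects of two clusters
- 'average' : based on the arithmetic mean of the distances between all objects of two clusters
- 'centroid' : based on the distance between the centroids (geometric centers) of two clusters
- 'median' : based on the median of all objects
- 'ward' : based on the sum of squares (merges the clusters that give the smallest increase in within-cluster variance)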

In [None]:
# Call signature of linkage(), shown for reference with explicit keyword
# arguments; `observations` stands for the data to be clustered.
# NOTE(review): this cell neither imports `scipy` nor defines `observations`,
# so it is presumably illustrative rather than runnable - confirm.
scipy.cluster.hierarchy.linkage(observations,
                                method='single',
                                metric='euclidean',
                                optimal_ordering=False
)

#### Create cluster labels with fcluster

In [None]:
# Call shape of fcluster(), shown for reference. Elsewhere in these notes the
# first argument is the matrix returned by linkage(), e.g.
# fcluster(Z, 3, criterion='maxclust'), so `distance_matrix` here stands for
# that linkage output.
# NOTE(review): the names used below are placeholders and are not defined in
# this cell - presumably illustrative, confirm before running.
scipy.cluster.hierarchy.fcluster(distance_matrix, num_clusters, criterion)

### Visualize clusters
    - with matplotlib

In [None]:
from matplotlib import pyplot as plt
# Five labelled sample points.
df = pd.DataFrame({
    'x': [2, 3, 5, 6, 2],
    'y': [1, 1, 5, 5, 2],
    'labels': ['A', 'A', 'B', 'B', 'A'],
})

# Translate each label into the colour it should be drawn with.
colors = {'A': 'red', 'B': 'blue'}
point_colors = df['labels'].map(colors)

df.plot.scatter(x='x', y='y', c=point_colors)
plt.show()

    - with seaborn

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Five labelled sample points.
df = pd.DataFrame({
    'x': [2, 3, 5, 6, 2],
    'y': [1, 1, 5, 5, 2],
    'labels': ['A', 'A', 'B', 'B', 'A'],
})

# seaborn picks one colour per distinct value of the `hue` column.
sns.scatterplot(data=df, x='x', y='y', hue='labels')
plt.show()

### Dendrogram in SciPy

In [None]:
from scipy.cluster.hierarchy import dendrogram
# Build the linkage matrix from the two whitened columns using Ward's method
# with Euclidean distance, then draw the hierarchy as a dendrogram.
# NOTE(review): assumes `df` already has 'x_whiten' and 'y_whiten' columns
# (presumably created with whiten()); the DataFrames built in the cells shown
# in these notes do not define them - confirm before running.
Z = linkage(df[['x_whiten', 'y_whiten']],
            method='ward',
            metric='euclidean')
# dendrogram() draws the tree; its return value is kept in `dn`.
dn = dendrogram(Z)
plt.show()

## K-Means Clustering
    - Step 1: Generate cluster centers
        - kmeans(obs, k_or_guess, iter, thresh, check_finite)
        - obs : standardized observations
        - k_or_guess : number of clusters
        - iter : number of iterations (default: 20)
        - thresh : threshold (default: 1e-05)
        - check_finite : whether to check if observations contain only finite numbers (default: True)
    - Step 2: Generate cluster labels
        - vq(obs, code_book, check_finite=True)
        - obs : standardized observations
        - code_book : cluster centers
        - check_finite : whether to check if observations contain only finite numbers (default: True)

In [None]:
# Import kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# The two standardised columns are the only features used for clustering.
# NOTE(review): assumes `df` already has 'scaled_x' and 'scaled_y' columns.
features = df[['scaled_x', 'scaled_y']]

# Step 1: find three cluster centres (the distortion is not needed here).
cluster_centers, _ = kmeans(features, 3)

# Step 2: label each observation with the index of its closest centre.
df['cluster_labels'], _ = vq(features, cluster_centers)

# Scatter plot of the scaled data, coloured by assigned cluster.
sns.scatterplot(data=df, x='scaled_x', y='scaled_y', hue='cluster_labels')
plt.show()

### Elbow plot

In [None]:
# One distortion value is collected per candidate number of clusters.
distortions = []
num_clusters = range(2, 7)

# Run k-means once for every candidate k and keep the distortion that
# kmeans() returns as its second value. The loop body must be indented;
# without the indentation the cell fails with an IndentationError.
for i in num_clusters:
    centroids, distortion = kmeans(df[['scaled_x', 'scaled_y']], i)
    distortions.append(distortion)

# Plot distortion against the number of clusters; the "elbow" of the curve
# suggests a reasonable k.
elbow_plot_data = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot_data)
plt.show()

## Example 1: find dominant colors
- Convert image to pixels: matplotlib.image.imread
- Display colors of cluster centers: matplotlib.pyplot.imshow

#### 1. Convert image to RGB matrix

In [None]:
import matplotlib.image as img
# Read the image into an array of pixels.
image = img.imread('sea.jpg')
image.shape   # (475, 764, 3)

# One flat list per colour channel, filled in row-major pixel order.
r = []
g = []
b = []

for row in image:
    for pixel in row:
        # A pixel contains RGB values. These lines must sit inside the inner
        # loop; at the outer loop's level the cell does not even parse.
        temp_r, temp_g, temp_b = pixel
        r.append(temp_r)
        g.append(temp_g)
        b.append(temp_b)

#### 2. DataFrame with RGB values

In [None]:
from scipy.cluster.vq import whiten

# One row per pixel, one column per colour channel.
pixels = pd.DataFrame({'red': r, 'blue': b, 'green': g})
pixels.head()

# Sample output of pixels.head(), kept as a comment so the cell stays
# valid Python:
#   red  blue  green
#   252   255    252
#    75   103     81

# Standardise each channel. The later cells cluster on the 'scaled_*'
# columns and multiply the cluster centres by the channel standard
# deviations to get back to RGB values, so the columns are created here.
for channel in ('red', 'blue', 'green'):
    pixels['scaled_' + channel] = whiten(pixels[channel].to_numpy())

#### 3. Create an elbow plot

In [None]:
distortions = []
num_clusters = range(1, 11)

# Create a list of distortions from the kmeans method.
# The second return value of kmeans() is the distortion for this k; it must
# be captured here, otherwise the append below uses an undefined (or stale)
# `distortion` left over from another cell.
for i in num_clusters:
    cluster_centers, distortion = kmeans(pixels[['scaled_red', 'scaled_blue', 'scaled_green']], i)
    distortions.append(distortion)

# Create a DataFrame with two lists - number of clusters and distortions
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

# Create a line plot of num_clusters and distortions
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot)
# Show one tick per candidate number of clusters.
plt.xticks(num_clusters)
plt.show()

#### 4. Find dominant colors

In [None]:
# Cluster on the standardised channels. The columns are selected in
# red, green, blue order so that every cluster centre unpacks as (r, g, b);
# selecting them as red, blue, green would silently swap the green and blue
# channels of the displayed colours.
cluster_centers, _ = kmeans(pixels[['scaled_red', 'scaled_green', 'scaled_blue']], 2)

colors = []

# Find Standard Deviations (same red, green, blue order as above)
r_std, g_std, b_std = pixels[['red', 'green', 'blue']].std()

# Undo the standardisation (multiply by the channel's standard deviation) and
# divide by 255 so each component lies in the 0-1 range.
for cluster_center in cluster_centers:
    scaled_r, scaled_g, scaled_b = cluster_center
    colors.append((
        scaled_r * r_std / 255,
        scaled_g * g_std / 255,
        scaled_b * b_std / 255,
    ))

#### 5. Display dominant colors

In [None]:
# `colors` holds one 3-component tuple per cluster centre.
# Dimensions: 2 x 3 (N x 3 matrix)
print(colors)

# Sample output from one run:
#[(0.08192923122023911, 0.34205845943857993, 0.2824002984155429),
#(0.893281510956742, 0.899818770315129, 0.8979114272960784)]

# Wrapping the list once more gives imshow a single row of N colours.
# Dimensions: 1 x 2 x 3 (1 x N x 3 matrix)
plt.imshow([colors])
plt.show()

### Example 2: Document clustering
1. Clean data before processing
2. Determine the importance of the terms in a document (in TF-IDF matrix)
3. Cluster the TF-IDF matrix
4. Find top terms, documents in each cluster

#### 1. Clean and tokenize data

In [None]:
from nltk.tokenize import word_tokenize
import re

def remove_noise(text, stop_words=None):
    """Tokenize *text* and return its cleaned, lower-cased tokens.

    Every token produced by ``word_tokenize`` has all non-alphanumeric
    characters removed. Tokens left with fewer than two characters, and
    tokens whose lower-case form is in *stop_words*, are dropped.

    Args:
        text: The string to tokenize.
        stop_words: Optional container of lower-case words to discard.
            Defaults to no stop words.

    Returns:
        A list of the remaining tokens, lower-cased, in their original order.
    """
    # None replaces the mutable ``[]`` default; an omitted argument still
    # means "no stop words".
    if stop_words is None:
        stop_words = ()

    cleaned_tokens = []
    for token in word_tokenize(text):
        # Strip punctuation and other non-alphanumeric characters, then
        # normalise case once so the checks below share one form.
        token = re.sub(r'[^A-Za-z0-9]+', '', token).lower()
        if len(token) > 1 and token not in stop_words:
            cleaned_tokens.append(token)
    return cleaned_tokens


# Example: the stop words are passed explicitly, and the sentences are
# separated by a space so that "having." and "I" become separate tokens.
remove_noise(
    "It is lovely weather we are having. I hope the weather continues.",
    stop_words=['it', 'is', 'we', 'are', 'having', 'the'],
)

# ['lovely', 'weather', 'hope', 'weather', 'continues']

#### 2. TF-IDF (Term Frequency - Inverse Document Frequency)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Configure the vectorizer to tokenize with remove_noise() and to keep at most
# 50 features, with document-frequency bounds of 0.2 (min_df) and 0.8 (max_df).
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=50, min_df=0.2, tokenizer=remove_noise)
# Learn the vocabulary from `data` and build the TF-IDF matrix in one step.
# NOTE(review): `data` is presumably a list of text documents defined
# elsewhere - confirm; it is not created in this cell.
tfidf_matrix = tfidf_vectorizer.fit_transform(data)

#### 3. Clustering with sparse matrix

In [None]:
# kmeans() in SciPy does not support sparse matrices, so the TF-IDF matrix is
# converted to a dense array first. .toarray() returns a regular ndarray,
# whereas .todense() returns the discouraged np.matrix type.
# NOTE(review): `num_clusters` must be an integer here - confirm it is set
# before this cell runs.
cluster_centers, distortion = kmeans(tfidf_matrix.toarray(), num_clusters)

#### 4. Top terms per cluster
- Cluster centers: lists with a size equal to the number of terms
- Each value in the cluster center is its importance
- Create a dictionary and print top terms

In [None]:
# Vocabulary learned by the vectorizer, in the same order as the entries of
# each cluster centre.
terms = tfidf_vectorizer.get_feature_names_out()

for cluster_index in range(num_clusters):
    # Pair every term with its weight in this cluster centre.
    term_weights = dict(zip(terms, cluster_centers[cluster_index]))
    # Rank the terms from heaviest to lightest and show the top three.
    ranked_terms = sorted(
        term_weights,
        key=lambda term: term_weights[term],
        reverse=True,
    )
    print(ranked_terms[:3])

# Sample output:
#['room', 'hotel', 'staff']
#['bad', 'location', 'breakfast']