___

# Machine Learning in Geosciences
Department of Applied Geoinformatics and Cartography, Charles University

Lukas Brodsky lukas.brodsky@natur.cuni.cz


## Unsupervised learning


This notebook covers only an example of K-Means clustering. 


# Setup

In [None]:
# Common imports
import numpy as np
import os

# model
from sklearn.cluster import KMeans

# data  
from sklearn.datasets import make_blobs 

# Seed NumPy's global random generator so that a fresh "Restart & Run All"
# reproduces the same output on every run.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib as mpl
import seaborn as sns

# Apply the seaborn theme FIRST. sns.set() installs seaborn's default plotting
# context, which overwrites the axes/xtick/ytick label-size rcParams; calling
# it after mpl.rc(...) would silently discard the sizes chosen below.
sns.set()
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Project dir: report whether the configured directory exists before going on.
PROJECT_DIR = "./"
print(
    'Ok continue.'
    if os.path.isdir(PROJECT_DIR)
    else 'Nok, set correct path to your project directory!'
)

# Simulated data

In [None]:
# Simulate 500 samples grouped around 4 blob centers with a small spread.
# random_state pins the draw, so the same points are generated on every run.
X, y_true = make_blobs(
    n_samples=500,
    centers=4,
    cluster_std=0.40,
    random_state=0,
)

In [None]:
# Alternative data set (disabled): three blobs at hand-picked centers with a
# wider spread. Uncomment both lines to use it instead of the data generated above.
# centers = [[2,2],[4,5],[3,10]]
# X, _ = make_blobs(n_samples = 500, centers = centers, cluster_std = 1)

In [None]:
# Raw, unlabelled samples: this is everything the clustering algorithm gets to see.
plt.scatter(X[:, 0], X[:, 1], s=50)
# Title and axis labels so the figure can be read on its own.
plt.title("Simulated data")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()


# K-Means

In [None]:
# Fixed random_state (same seed as the silhouette-analysis model further down)
# so that re-running this cell on its own always gives the same centroids and
# the same label numbering, instead of depending on the global NumPy RNG state.
kmeans = KMeans(n_clusters=4, random_state=42)

In [None]:
# Fit the model: estimate the cluster centroids from the unlabelled samples X.
kmeans.fit(X)

In [None]:
# Label every sample with the index of the cluster the fitted model assigns it to.
y_kmeans = kmeans.predict(X)

# Colour the samples by their predicted cluster.
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y_kmeans,
    s=50,
    cmap='viridis',
)

# Centroid coordinates, kept for the inspection and overlay cells that follow.
centers = kmeans.cluster_centers_

In [None]:
# Show the fitted centroid coordinates (one row per cluster).
centers

In [None]:
# Clustered samples, coloured by predicted label, with the fitted centroids
# overlaid as large semi-transparent black discs.
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5,
            label='Cluster centers')
# Title, axis labels and legend so the figure can be read on its own.
plt.title(f"K-Means clustering (k = {len(centers)})")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend()
plt.show()

### Measuring the Clustering Performance

#### Silhouette analysis

The silhouette score measures how close each point in one cluster is to the points in the neighboring clusters, i.e. how well separated the resulting clusters are.

#### Analysis of the silhouette score

The score lies in the range [-1, 1] and is interpreted as follows:

- **Score near +1** − the sample is far away from the neighboring clusters.

- **Score near 0** − the sample is on or very close to the decision boundary between two neighboring clusters.

- **Negative score (towards -1)** − the sample may have been assigned to the wrong cluster.


In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# To repeat the analysis for several k values, loop over a list like this one
# (left disabled; the cells below evaluate a single k):
# range_n_clusters = [2, 3, 4, 5, 6]

In [None]:
# Number of clusters (k) evaluated by the silhouette analysis in the cells below.
n_clusters = 2

In [None]:
# Build a K-Means model for the chosen k (seeded for reproducibility), then
# fit it and keep the cluster label assigned to each sample.
clusterer = KMeans(
    n_clusters=n_clusters,
    random_state=42,
)
cluster_labels = clusterer.fit_predict(X)

In [None]:
# Average silhouette coefficient over all samples: a single number that
# summarises the quality of the clustering for this k.
silhouette_avg = silhouette_score(X, cluster_labels)
print(
    "For n_clusters =",
    n_clusters,
    "The average silhouette_score is :",
    silhouette_avg,
)

In [None]:
# Silhouette coefficient of every individual sample (one value per row of X),
# as opposed to the single average computed with silhouette_score.
sample_silhouette_values = silhouette_samples(X, cluster_labels)

In [None]:
# Sanity check: the mean of the per-sample values should match silhouette_avg.
sample_silhouette_values.mean()

In [None]:
# Two-panel figure: silhouette plot (left) next to the clustered samples (right).
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

# The x-axis of the silhouette plot is restricted to [-0.1, 1]; coefficients
# below -0.1 would fall outside the visible range.
ax1.set_xlim([-0.1, 1])
# Reserve a 10-row gap before, between and after the per-cluster bands.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

# --- Left panel: one horizontal band of sorted coefficients per cluster ------
y_lower = 10
for i in range(n_clusters):
    # Silhouette coefficients of the samples in cluster i, in ascending order.
    ith_cluster_silhouette_values = np.sort(
        sample_silhouette_values[cluster_labels == i]
    )

    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        0,
        ith_cluster_silhouette_values,
        facecolor=color,
        edgecolor=color,
        alpha=0.7,
    )

    # Write the cluster number at the vertical middle of its band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # Start of the next band: leave 10 empty rows between clusters.
    y_lower = y_upper + 10

# --- Per-figure decoration: done ONCE, after the loop -------------------------
# (Previously these statements ran on every iteration, which overdrew the
# average line and stacked identical semi-transparent scatter layers.)
ax1.set_title("Silhouette plot for the individual clusters")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# Vertical line marking the average silhouette score over all samples.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])  # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# --- Right panel: the samples, coloured with the same per-cluster colours -----
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(
    X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
)
ax2.set_title("Clustered data")
ax2.set_xlabel("Feature 1")
ax2.set_ylabel("Feature 2")

plt.show()
    