<a href="https://colab.research.google.com/github/ge43jef/GEEHYDRO/blob/block4/Kmeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clustering : K-means
The KMeans algorithm clusters data by trying to separate the samples into $n$ groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares, $\sum_{i} \min_{\mu_j \in C} \lVert x_i - \mu_j \rVert^2$, where $C$ is the set of cluster centers $\mu_j$. This algorithm requires the number of clusters to be specified. It scales well to large numbers of samples and has been used across a large range of application areas in many different fields.

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## K-means with example dataset

In [None]:
# Generate sample data: Gaussian blobs scattered around `n_components` centers
n_samples = 4000
n_components = 4

X, y_true = make_blobs(
    n_samples=n_samples, centers=n_components, cluster_std=0.60, random_state=0
)
X = X[:, ::-1]  # reverse the column order of the generated features

# Fit the model.
# K-means needs the number of clusters up front. Use the number of blobs that
# were actually generated so the clustering can recover them (the previous
# hard-coded value of 6 split the 4 blobs into extra pieces). Change
# `n_clusters` to see how K-means behaves when K does not match the data.
n_clusters = n_components
kmeans = KMeans(init="random", n_clusters=n_clusters, n_init=6, random_state=0)
kmeans.fit(X)

# Plot the samples coloured by their assigned cluster and overlay the fitted
# cluster centers in red. An explicit figure/axes pair is used so that the
# plot never draws into a figure left over from another cell.
centers = kmeans.cluster_centers_
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)
ax.scatter(centers[:, 0], centers[:, 1], c="r", label="cluster centers")
ax.set_title("K-Means")
# The synthetic features carry no units, so the ticks are hidden.
ax.set_xticks([])
ax.set_yticks([])
ax.legend()
plt.show()

## K-means with camels dataset
In this lab, we will cluster the catchments into several groups according to their location.

### Load the data

In [None]:
# Number of leading numeric fields kept from every data row.
N_COLUMNS = 4

with open('camels_topo.txt') as f:
    lines = f.readlines()

num_of_rows = len(lines)

# Parse every data row (the first row is the header and is not needed) into
# its first N_COLUMNS numeric values.
rows = []
for line_no, line in enumerate(lines[1:], start=2):
    if not line.strip():
        # Tolerate blank lines (e.g. at the end of the file) instead of
        # failing with an opaque numpy broadcasting error.
        continue

    values = []
    for token in line.split(';'):
        try:
            values.append(float(token))
        except ValueError:
            # Fields that are not numbers are skipped on purpose; only the
            # numeric fields of a row are kept.
            continue

    if len(values) < N_COLUMNS:
        # A row with a single numeric value used to be silently broadcast
        # into all four columns; report the malformed row instead.
        raise ValueError(
            f"camels_topo.txt line {line_no}: expected at least {N_COLUMNS} "
            f"numeric fields, found {len(values)}"
        )
    rows.append(values[:N_COLUMNS])

# One row per catchment, first four numeric variables of the file.
# reshape keeps the (n, 4) layout even when no data rows were found.
var = np.array(rows, dtype=float).reshape(-1, N_COLUMNS)

# Scatter column 2 against column 1, coloured by column 3 (the elevation
# named in the title).
fig, ax = plt.subplots(1, 1)
points = ax.scatter(
    var[:, 2], var[:, 1], c=var[:, 3], cmap=plt.cm.coolwarm, s=20, edgecolors="k"
)
ax.set_title("catchments with elevation(m)")
fig.colorbar(points, ax=ax, label="elevation (m)")

# Axis labels are taken from the header row.
# NOTE(review): this assumes the first header names line up with the numeric
# fields kept above (i.e. no non-numeric column comes first) -- confirm
# against the file.
header = [name.strip() for name in lines[0].split(';')] if lines else []
if len(header) >= N_COLUMNS:
    ax.set_xlabel(header[2])
    ax.set_ylabel(header[1])

plt.show()

### Fit the model

In [None]:
# Cluster the catchments on columns 1 and 2 of `var`, the same two columns
# that serve as plot coordinates below.
coords = var[:, 1:3]
kmeans = KMeans(init="random", n_clusters=6, n_init=6, random_state=0).fit(coords)
centers = kmeans.cluster_centers_

# Draw the catchments coloured by cluster label and overlay the fitted cluster
# centers in red. The model was fitted on column order (1, 2) while the plot
# puts column 2 on the x axis and column 1 on the y axis, hence the swapped
# indices when plotting the centers.
fig = plt.figure(1)
ax = fig.gca()
ax.scatter(coords[:, 1], coords[:, 0], c=kmeans.labels_)
ax.scatter(centers[:, 1], centers[:, 0], c="r")
ax.set_title("K-Means")
ax.set_xticks([])
ax.set_yticks([])
plt.show()