# Clustering analysis of ground motion time series

In this notebook, I will analyse a test area of Norway using the Norway InSAR ground motion data.

The data include a time series for each of the points. The idea is that time series with similar patterns can be grouped together, so we want to find out how well such a grouping works.
This notebook follows the tutorial [Introduction to Time Series Clustering](https://www.kaggle.com/izzettunc/introduction-to-time-series-clustering).

In [None]:
# Native libraries
import os
import math
# Essential Libraries
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import *
# Preprocessing
from sklearn.preprocessing import MinMaxScaler
# Algorithms
from minisom import MiniSom
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

Select a test area to download data from - currently in the `./tmp/` folder

In [None]:
# Read the ground motion CSV for the test area into a DataFrame.
gm_csv = pd.read_csv(filepath_or_buffer='./tmp/160-IW1-414-s1-asc1-v2020.csv')

In [None]:
# Strip leading and trailing whitespace from every column header, in place.
gm_csv.rename(str.strip, axis="columns", inplace=True)

In [None]:
# Transpose the table: rows become columns and vice versa.
gm_csv_t = gm_csv.transpose()

In [None]:
# Relabel every column with the value found in the first row of the
# transposed table (the 'pid' row), in place.
gm_csv_t.rename(mapper=gm_csv_t.iloc[0], axis="columns", inplace=True)

In [None]:
# Keep only the date rows: everything from row position 21 onwards
# (the first date sits at entry #21); all columns are retained.
gm_csv_t_dates = gm_csv_t.iloc[21:]

In [None]:
# Cast every column label to str.
gm_csv_t_dates.columns = gm_csv_t_dates.columns.astype(str)

Check for outliers or anomalous time series

In [None]:
# Show only the columns (time series) that contain at least one value
# above the 4000 threshold.
gm_csv_t_dates[gm_csv_t_dates.columns[gm_csv_t_dates.gt(4000).any(axis=0)]]


In [None]:

### WIDGET to visualise the timeseries
plt.rcParams.update({'figure.figsize': [7, 4]})

column_name = gm_csv_t_dates.columns


def plot_data(column_number):
    """Plot the values of the column at position ``column_number``."""
    selected_column = gm_csv_t_dates.iloc[:, column_number]
    plt.plot(selected_column.values)


# Slider over every valid column position: (min, max, step).
interact(plot_data, column_number=(0, len(column_name) - 1, 1))

In [None]:
# convert each of the columns into a pandas series 
# for now let's assume that the data has no gaps and that all columns are the same length

In [None]:
# Check every column (one time series per column) for NaN values and record
# its length, so gaps and ragged series are detected before clustering.
all_column_lengths = []
columns_with_nan_values = []
# Iterate over the columns themselves: the number of rows is unrelated to the
# number of columns, so a row-count based range would either run past the
# last column or stop before reaching it.
for column_label, column in gm_csv_t_dates.items():
    all_column_lengths.append(len(column))
    if column.isnull().any():
        columns_with_nan_values.append(column_label)

In [None]:
# A single value in the output means every column has the same length.
np.unique(all_column_lengths)

In [None]:
# Show the labels of the columns in which the NaN check found at least one
# missing value; an empty list means no NaN values were found.
columns_with_nan_values

In [None]:
# Work on an independent (deep) copy so the source dataframe stays untouched.
gm_copy = gm_csv_t_dates.copy(deep=True)

In [None]:
# Second independent (deep) copy, kept as the original, unscaled data.
gm_copy_of_copy = gm_copy.copy(deep=True)

#### Change each column to its own series
Keeping every time series as a separate object is more convenient for the classification tasks below.

In [None]:
# Build a list holding one single-column DataFrame per column of gm_copy,
# in column order.
mySeries = [
    pd.DataFrame(gm_copy.iloc[:, column_position])
    for column_position in range(gm_copy.shape[1])
]
    

In [None]:
# Single-column DataFrame built from the second column (position 1) of gm_copy.
df = pd.DataFrame(data=gm_copy.iloc[:, 1])

In [None]:
# Preview the second entry of mySeries (index 1).
mySeries[1]

In [None]:
# Normalise every time series independently to the range [-1, 1] before
# clustering, and replace each list entry with the scaled 1-D array.
# NOTE: the entries must still be the single-column DataFrames built above
# when this cell runs; the entries are overwritten in place.
for i, series_frame in enumerate(mySeries):
    # A fresh scaler per series, so each one is scaled by its own min and max.
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_column = scaler.fit_transform(series_frame)
    # Flatten the scaled output to a 1-D array with one value per time step.
    mySeries[i] = scaled_column.reshape(len(scaled_column))

In [None]:
# Sanity check on the first series: its extrema and its first five values.
first_series_max = max(mySeries[0])
first_series_min = min(mySeries[0])
print(f"max: {first_series_max!s}\tmin: {first_series_min!s}")
print(mySeries[0][:5])

In [None]:
# Side length of the square SOM grid: the fourth root of the number of
# series, rounded up. For example, a side of 2 gives a 2 x 2 grid (4 clusters).
som_x = math.ceil(math.sqrt(math.sqrt(len(mySeries))))
som_y = som_x


In [None]:
# ATTENTION: these parameters have not been optimised!!
# The SOM grid is som_x by som_y and each input vector has the length of
# one time series.
som = MiniSom(
    x=som_x,
    y=som_y,
    input_len=len(mySeries[0]),
    sigma=0.3,
    learning_rate=0.1,
)

# Initialise the weights from the data, then train for 50000 iterations.
som.random_weights_init(mySeries)
som.train(mySeries, num_iteration=50000)

In [None]:
# Little handy function to plot series
def plot_som_series_averaged_center(som_x, som_y, win_map):
    """Plot each SOM node's series together with their point-wise average.

    Draws a ``som_x`` by ``som_y`` grid of subplots. For every grid position
    ``(x, y)`` that is a key of ``win_map``, each series stored under that
    key is drawn in gray and their average along axis 0 is drawn in red.
    Every subplot, populated or not, is titled ``Cluster <n>`` with ``n``
    counted row by row starting at 1.

    Args:
        som_x: Number of subplot rows (range of the first key coordinate).
        som_y: Number of subplot columns (range of the second key coordinate).
        win_map: Mapping from ``(x, y)`` tuples to a collection of
            equal-length series.
    """
    # squeeze=False keeps ``axs`` two-dimensional even when the grid has a
    # single row or column, so indexing with an (x, y) tuple always works.
    fig, axs = plt.subplots(som_x, som_y, figsize=(25, 25), squeeze=False)
    fig.suptitle('Clusters')
    for x in range(som_x):
        for y in range(som_y):
            cluster = (x, y)
            ax = axs[cluster]
            # Membership test instead of indexing, so a mapping that creates
            # missing keys on access is not modified by plotting.
            if cluster in win_map:
                for series in win_map[cluster]:
                    ax.plot(series, c="gray", alpha=0.5)
                ax.plot(np.average(np.vstack(win_map[cluster]), axis=0), c="red")
            cluster_number = x * som_y + y + 1
            ax.set_title(f"Cluster {cluster_number}")

    plt.show()

In [None]:
# Map every winning SOM node to the input series assigned to it.
win_map = som.win_map(data=mySeries)

# Plot the grid of clusters with their averaged centres.
plot_som_series_averaged_center(som_x=som_x, som_y=som_y, win_map=win_map)

In [None]:
# Recompute the node-to-series mapping for the trained SOM.
win_map = som.win_map(data=mySeries)

# Disabled call, kept for reference:
#plot_som_series_dba_center(som_x, som_y, win_map)

In [None]:
# Visit the SOM nodes row by row; count the series assigned to each node
# (0 when the node is absent from win_map) and build the matching labels.
node_positions = [(x, y) for x in range(som_x) for y in range(som_y)]
cluster_c = [
    len(win_map[node]) if node in win_map else 0
    for node in node_positions
]
cluster_n = [f"Cluster {x*som_y+y+1}" for x, y in node_positions]

# Bar chart of the number of series per SOM cluster.
plt.figure(figsize=(25,5))
plt.title("Cluster Distribution for SOM")
plt.bar(cluster_n,cluster_c)
plt.show()

#### K-means clustering

In [None]:

# Number of clusters: the square root of the number of series, rounded up.
# (Rule of thumb borrowed from kNN: choose k as the square root of the
# number of points in the training data set.)
n_series = len(mySeries)
cluster_count = math.ceil(math.sqrt(n_series))

# Time series k-means with DTW as the distance metric.
km = TimeSeriesKMeans(metric="dtw", n_clusters=cluster_count)

# Fit the model and get one cluster label per series.
labels = km.fit_predict(mySeries)



In [None]:
# Side length of the square subplot grid needed to show every cluster.
plot_count = math.ceil(math.sqrt(cluster_count))

# squeeze=False keeps ``axs`` two-dimensional even when plot_count == 1,
# so ``axs[row, column]`` indexing always works.
fig, axs = plt.subplots(plot_count, plot_count, figsize=(25, 25), squeeze=False)
fig.suptitle('Clusters')
# For each label that occurs (ascending order), plot every series carrying
# that label in gray and the point-wise average of those series in red.
for position, label in enumerate(np.unique(labels)):
    # Fill the grid row by row.
    row_i, column_j = divmod(position, plot_count)
    ax = axs[row_i, column_j]
    cluster = [
        series
        for series, series_label in zip(mySeries, labels)
        if series_label == label
    ]
    for series in cluster:
        ax.plot(series, c="gray", alpha=0.4)
    ax.plot(np.average(np.vstack(cluster), axis=0), c="red")
    # Number the panel by its position in this grid, whose width is
    # plot_count, so the numbering depends only on this figure's layout.
    ax.set_title("Cluster " + str(row_i * plot_count + column_j))

plt.show()