In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sktime.split import temporal_train_test_split
from tslearn.clustering import TimeSeriesKMeans
from sklearn.metrics import silhouette_score 
from tslearn.metrics import cdist_dtw
from sklearn.metrics import silhouette_samples

In [2]:
#import data
data = np.random.randn(200, 100)
print(data.shape)

#pad data if variable lengths

(200, 100)


In [3]:
#train test split 
y_train, y_test = temporal_train_test_split(data, test_size=.2)

print(y_train.shape)
print(y_test.shape)

(160, 100)
(40, 100)


In [4]:
#deseasonalize/de trend if performing analysis on striclty seasonality/trends
#standardize/scale

scaler = StandardScaler()
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)

print(y_train)
print(y_test)

[[-1.25994451 -1.40251005  0.10357349 ... -0.51706141  1.20998185
   0.57125475]
 [ 0.05933954  0.47283636 -0.89429096 ...  0.17435531 -1.2214276
  -0.45524795]
 [ 1.33769614  0.73472658 -0.92284566 ... -0.68956693  0.53863934
   0.61448909]
 ...
 [-0.52231002 -0.10090618  0.03163594 ...  1.19379401 -1.64012631
  -1.33617009]
 [-0.83132363 -1.80757687  0.19205389 ...  0.09257801  0.94704277
   1.20718713]
 [-0.78978866 -0.91675011 -0.73623366 ...  0.74689718 -0.79875544
  -1.09601942]]
[[-1.80324056  0.45335032 -0.83717987 ...  0.44045726  0.53080232
   1.58429734]
 [-0.54465626 -0.53299943  0.92182471 ... -0.18464008 -0.81999529
   0.53776944]
 [ 1.84465048 -0.70830053 -0.93905554 ...  0.85005005  0.00534931
   0.34555675]
 ...
 [ 0.48509708 -0.89887194 -1.97145985 ... -0.55233619  0.21804499
   0.2300928 ]
 [ 0.15634534  0.07705751  2.06279301 ...  1.66513573  0.09107428
  -0.00384739]
 [ 0.8946309  -0.08958008  1.72502947 ...  0.14098531  0.44346784
   0.02655891]]


In [5]:
#fit model 

model = TimeSeriesKMeans(n_clusters=3, metric="dtw")
labels = model.fit_predict(y_train)

print(labels)

[2 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 1 2 0 1 2 1 1 1 1 2 2 2 2 1 2 1 1 1 2 1 1
 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 1 1 2 1 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2
 2 1 2 1 2 1 2 1 2 2 2 2 1 2 1 2 2 2 2 1 2 2 2 1 2 1 2 2 1 2 2 2 1 1 2 1 2
 2 2 2 1 2 2 2 2 1 1 2 2 1 2 1 2 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 1 1 2 2 1 1
 2 2 1 1 2 1 1 2 1 1 2 2]


In [6]:
#fit testing data
test_labels = model.predict(y_test)

#create dataframe to store labels 
df = pd.DataFrame(data=y_test)

df["label"] = test_labels

print(df)

           0         1         2         3         4         5         6  \
0  -1.803241  0.453350 -0.837180 -1.716092  0.796584  0.199812 -0.862924   
1  -0.544656 -0.532999  0.921825 -0.207523 -0.249960  0.593966  1.633711   
2   1.844650 -0.708301 -0.939056 -0.299982 -1.895328 -0.602346 -0.370402   
3   0.437819  1.758465 -0.670903 -0.279062  1.368162  0.635542 -1.189355   
4  -0.254985  0.090524 -1.334867 -0.490547 -1.248104 -1.166029 -0.066906   
5  -0.626798  0.150209  0.408644 -1.507230 -0.184725 -0.273701  0.362110   
6  -0.889002 -1.533300  0.454392  1.565091  0.664372  0.863262  0.530532   
7   2.194324  0.719668 -2.777674 -0.413259 -0.591730  0.511507  0.639331   
8  -0.805496 -1.197711  1.408063  2.384072 -0.127435 -0.426045 -0.322201   
9  -0.944884  1.610666 -1.411987 -0.925070  0.257153 -0.604472 -0.732589   
10  1.791987  0.176671  1.028360 -0.328371 -0.154952 -0.328917 -0.358293   
11  0.794541  0.189544 -1.257913 -0.390810  0.935651  0.437581  0.659843   
12  0.350478



In [7]:
# Sillohouette score -  If the Silhouette index value is high, the object is well-matched to its own cluster and poorly matched to neighbouring clusters.
#can also be used as cross validation to find the optimal number of clusters

#compute dtw matrix for distance between each time series
dist = cdist_dtw(y_train)

print(dist.shape)

#overall sillouette score
silhouette_avg = silhouette_score(dist,labels, metric="precomputed") 

print(f"Overall Sillhouette Score: {silhouette_avg}")


(160, 160)
Overall Sillhouette Score: 0.012496902308208073


In [8]:
#display individual sillhoutte scores from test data 

samples = silhouette_samples(y_test, test_labels) 

print(list(zip(samples, test_labels)))

[(0.0043917093957964205, 2), (-0.014833033357285032, 2), (0.0013547293978134556, 1), (0.013242612766333588, 2), (0.02332581154765725, 2), (0.009899908912991773, 2), (0.021231688835279604, 2), (0.0016008208207129331, 1), (0.035597213941747904, 2), (-0.01913374133594953, 1), (0.0012259856753241856, 1), (-0.0004755470136727876, 2), (-0.03001824447876127, 2), (0.02525033106826314, 1), (0.005007239149330751, 2), (-0.008067386549401125, 2), (-0.020586091226475547, 2), (-0.020292661218519657, 2), (0.0073149962727771765, 2), (0.034579348178545105, 2), (0.028473769920955655, 2), (0.002333117172172591, 2), (0.0035200083143357807, 2), (0.02641903257813137, 1), (-0.015953882576699847, 1), (0.008965329334109618, 2), (0.02682608861749361, 2), (0.049960181547998174, 2), (0.03107273715273952, 2), (0.012007638624629804, 1), (0.0058049863144635375, 1), (-0.0028531890416770927, 2), (0.014511976351055904, 2), (0.0018358817817478782, 1), (0.012782473893594078, 1), (0.007147105646890686, 1), (0.033065998284