# Retail Dataset
Source: https://www.kaggle.com/datasets/census/retail-and-retailers-sales-time-series-collection

Implementation based off : https://www.kaggle.com/code/izzettunc/introduction-to-time-series-clustering


Idea is to apply clustering of time series data

In [1]:
# Native libraries
import os
import math
# Essential Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Preprocessing
from sklearn.preprocessing import MinMaxScaler
# Algorithms
from minisom import MiniSom
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA



In [2]:

directory = os.getcwd()+'/archive/'
mySeries = []
namesofMySeries = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        df = pd.read_csv(directory+filename)
        df = df.loc[:,["date","value"]]
        # While we are at it I just filtered the columns that we will be working on
        df.set_index("date",inplace=True)
        # ,set the date columns as index
        df.sort_index(inplace=True)
        # and lastly, ordered the data according to our date index
        mySeries.append(df)
        namesofMySeries.append(filename[:-4])

## Preprocessing

Normalizing and filling empty values

In [3]:
series_lengths = {len(series) for series in mySeries}
print(series_lengths)
ind = 0
for series in mySeries:
    print("["+str(ind)+"] "+series.index[0]+" "+series.index[len(series)-1])
    ind+=1

max_len = max(series_lengths)
longest_series = None
for series in mySeries:
    if len(series) == max_len:
        longest_series = series

problems_index = []

for i in range(len(mySeries)):
    if len(mySeries[i])!= max_len:
        problems_index.append(i)
        mySeries[i] = mySeries[i].reindex(longest_series.index)

def nan_counter(list_of_series):
    nan_polluted_series_counter = 0
    for series in list_of_series:
        if series.isnull().sum().sum() > 0:
            nan_polluted_series_counter+=1
    return nan_polluted_series_counter

{332, 333}
[0] 1992-02-01 2019-09-01
[1] 1992-01-01 2019-09-01
[2] 1992-01-01 2019-09-01
[3] 1992-01-01 2019-09-01
[4] 1992-01-01 2019-09-01
[5] 1992-01-01 2019-09-01
[6] 1992-01-01 2019-09-01
[7] 1992-01-01 2019-09-01
[8] 1992-01-01 2019-09-01
[9] 1992-01-01 2019-09-01
[10] 1992-01-01 2019-09-01
[11] 1992-01-01 2019-09-01
[12] 1992-01-01 2019-09-01
[13] 1992-01-01 2019-09-01
[14] 1992-01-01 2019-09-01
[15] 1992-01-01 2019-09-01
[16] 1992-01-01 2019-09-01
[17] 1992-01-01 2019-09-01
[18] 1992-01-01 2019-09-01
[19] 1992-02-01 2019-09-01
[20] 1992-02-01 2019-09-01
[21] 1992-01-01 2019-09-01
[22] 1992-01-01 2019-09-01


In [4]:
print("------Fix missing values------")
print(f"NaN values: {nan_counter(mySeries)}")
print("------Perform interpolation------")
for i in problems_index:
    mySeries[i].interpolate(limit_direction="both",inplace=True)
print(f"NaN values: {nan_counter(mySeries)}")

print("------Normalizing values------")
for i in range(len(mySeries)):
    scaler = MinMaxScaler()
    mySeries[i] = MinMaxScaler().fit_transform(mySeries[i])
    mySeries[i]= mySeries[i].reshape(len(mySeries[i]))
print("max: "+str(max(mySeries[0]))+"\tmin: "+str(min(mySeries[0])))
print(mySeries[0][:5])

------Fix missing values------
NaN values: 3
------Perform interpolation------
NaN values: 0
------Normalizing values------
max: 1.0	min: 0.0
[0.53953488 0.53953488 0.59627907 0.54697674 0.54139535]


# Clustering 

### SOM
Self-organizing maps are a type of neural network that is trained using unsupervised learning to produce a low-dimensional representation of the input space of the training samples, called a map.

In [5]:
som_x = som_y = math.ceil(math.sqrt(math.sqrt(len(mySeries))))
# I didn't see its significance but to make the map square,
# I calculated square root of map size which is 
# the square root of the number of series
# for the row and column counts of som

som = MiniSom(som_x, som_y,len(mySeries[0]), sigma=0.3, learning_rate = 0.1)

som.random_weights_init(mySeries)
som.train(mySeries, 50000)

In [6]:
cluster_map = []
for idx in range(len(mySeries)):
    winner_node = som.winner(mySeries[idx])
    cluster_map.append((namesofMySeries[idx],f"Cluster {winner_node[0]*som_y+winner_node[1]+1}"))

pd.DataFrame(cluster_map,columns=["Series","Cluster"]).sort_values(by="Cluster").set_index("Series")

Unnamed: 0_level_0,Cluster
Series,Unnamed: 1_level_1
MRTSSM444USS,Cluster 1
MRTSSM442USN,Cluster 1
MRTSSM442USS,Cluster 1
RETAILMPCSMSA,Cluster 2
RETAILIRSA,Cluster 2
MRTSSM448USS,Cluster 3
RETAILMPCSMNSA,Cluster 4
MRTSMPCSM4400CUSN,Cluster 4
MRTSSM44X72USS,Cluster 5
RETAILIMSA,Cluster 5


### Kmeans 

Using Dynamic Time Warping Matching


In [7]:
from tslearn.clustering import TimeSeriesKMeans

cluster_count = math.ceil(math.sqrt(len(mySeries))) 
# A good rule of thumb is choosing k as the square root of the number of points in the training data set in kNN

km = TimeSeriesKMeans(n_clusters=cluster_count, metric="dtw")

labels = km.fit_predict(mySeries)

In [8]:
labels

array([2, 0, 4, 4, 0, 4, 4, 0, 4, 0, 4, 0, 0, 4, 1, 0, 0, 0, 3, 2, 3, 4,
       0], dtype=int64)

In [9]:
fancy_names_for_labels = [f"Cluster {label}" for label in labels]
pd.DataFrame(zip(namesofMySeries,fancy_names_for_labels),columns=["Series","Cluster"]).sort_values(by="Cluster").set_index("Series")

Unnamed: 0_level_0,Cluster
Series,Unnamed: 1_level_1
MRTSSM448USS,Cluster 0
RETAILIMSA,Cluster 0
MRTSSM7221USN,Cluster 0
MRTSSM4541USS,Cluster 0
MRTSSM44X72USS,Cluster 0
MRTSSM44611USN,Cluster 0
MRTSSM444USS,Cluster 0
RETAILSMSA,Cluster 0
MRTSSM4413USS,Cluster 0
MRTSSM44000USS,Cluster 0


## AntClust 
Using Similarity based on Dynamic Time Warping Distance


In [21]:
import numpy as np
from fastdtw import fastdtw
# ----------------------
#       imports
# ----------------------

# make AntClus dir known
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

def get_pair_wise_dist():# Calculate the DTW distance and alignment path
    distance_array = []
    for serie in mySeries:
        for serie2 in mySeries:
            distance, path = fastdtw(serie, serie2)
            distance_array.append(distance) 

    my_array = np.array(distance_array)
    min_value = np.min(my_array)
    max_value = np.max(my_array)
    return min_value, max_value

min, max = get_pair_wise_dist()

f_sim = [distance_classes.similarity_time_series(min, max)]
ant_clust = AntClust(f_sim, labroche_rules())
ant = [[serie] for serie in mySeries]
ant_clust.fit(ant)
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 1725 / 1725
Meeting 1380 / 1725
Meeting 1035 / 1725
Meeting 690 / 1725
Meeting 345 / 1725
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


#  Metrics

**Silhouette Score**
The silhouette score measures how well-separated the clusters are. It ranges from -1 to 1, where a higher value indicates better-defined clusters.
It considers both the distance between points within the same cluster and the distance between points in different clusters

**Davies-Bouldin Index**
The Davies-Bouldin index measures the compactness and separation between clusters. Lower values indicate better clustering.
It considers the average similarity ratio of each cluster with the cluster that is most similar to it.



In [22]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score


som_label = []
for idx in range(len(mySeries)):
    winner_node = som.winner(mySeries[idx])
    som_label.append(winner_node[0]*som_y+winner_node[1])

silhouette_avg_k = silhouette_score(mySeries, labels)
silhouette_avg_som = silhouette_score(mySeries, som_label)
silhouette_avg_ant = silhouette_score(mySeries, clusters_found)

db_index_k = davies_bouldin_score(mySeries, labels)
db_index_som = davies_bouldin_score(mySeries, som_label)
db_index_ant = davies_bouldin_score(mySeries, clusters_found)

print(f"Silhouette Score AntClust: {silhouette_avg_ant}")
print(f"Silhouette Score kmeans (k={cluster_count}): {silhouette_avg_k}")
print(f"Silhouette Score SOM: {silhouette_avg_som}")

print(f"Davies-Bouldin Index AntClust: {db_index_ant}")
print(f"Davies-Bouldin Index kmeans (k={cluster_count}): {db_index_k}")
print(f"Davies-Bouldin Index SOM: {db_index_som}")



Silhouette Score AntClust: 0.42147042898916864
Silhouette Score kmeans (k=5): 0.21701614139757278
Silhouette Score SOM: 0.18650587683541964
Davies-Bouldin Index AntClust: 0.7928378621103818
Davies-Bouldin Index kmeans (k=5): 1.1387750470950317
Davies-Bouldin Index SOM: 0.787120987840383


In [23]:
from sklearn import metrics
import pandas as pd
df = pd.DataFrame()


silhouette_avg_k = silhouette_score(mySeries, labels)
silhouette_avg_som = silhouette_score(mySeries, som_label)
silhouette_avg_ant = silhouette_score(mySeries, clusters_found)

db_index_k = davies_bouldin_score(mySeries, labels)
db_index_som = davies_bouldin_score(mySeries, som_label)
db_index_ant = davies_bouldin_score(mySeries, clusters_found)

new_row = pd.DataFrame({
    'Silhouette score': silhouette_avg_ant,
    'Davies-Bouldin Index': db_index_ant,
}, index=["AntClust (DTW distance)"])
df = pd.concat([df, new_row])

new_row = pd.DataFrame({
    'Silhouette score': silhouette_avg_k,
    'Davies-Bouldin Index': db_index_k,
}, index=[f"K means (k={cluster_count})"])
df = pd.concat([df, new_row])

new_row = pd.DataFrame({
    'Silhouette score': silhouette_avg_som,
    'Davies-Bouldin Index': db_index_som,
}, index=["Self Organizing Maps"])
df = pd.concat([df, new_row])

df


Unnamed: 0,Silhouette score,Davies-Bouldin Index
AntClust (DTW distance),0.42147,0.792838
K means (k=5),0.217016,1.138775
Self Organizing Maps,0.186506,0.787121


In [24]:
print(abs(db_index_som-db_index_ant))

0.005716874269998784
