In [1]:
import os
import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
from distutils.util import strtobool
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

%matplotlib inline
warnings.filterwarnings('ignore')

# Converting `.tsf` files to a DataFrame
The loader below is adapted from [TSForecasting/utils/data_loader.py](https://github.com/rakshitha123/TSForecasting/blob/master/utils/data_loader.py).

In [2]:
# Function taken from GitHub (TSForecasting/utils/data_loader.py)
def convert_tsf_to_dataframe(
    full_file_path_and_name,
    replace_missing_vals_with="NaN",
    value_column_name="series_value",
):
    """Parse a ``.tsf`` time-series file into a pandas DataFrame.

    File layout, as handled by this parser:

    * blank lines and lines starting with ``#`` are skipped;
    * ``@attribute <name> <type>`` declares a per-series attribute, where
      ``<type>`` is ``numeric`` (parsed with ``int``), ``string`` or ``date``
      (parsed with the format ``%Y-%m-%d %H-%M-%S``);
    * ``@frequency``, ``@horizon``, ``@missing`` and ``@equallength`` each
      carry exactly one space-separated value;
    * ``@data`` marks the start of the series section;
    * every following line is ``attr_1:...:attr_n:v1,v2,...`` where ``?``
      denotes a missing observation.

    Parameters
    ----------
    full_file_path_and_name : str or path-like
        Location of the ``.tsf`` file. The file is read as ``cp1252``.
    replace_missing_vals_with : object, default "NaN"
        Value stored in place of every ``?`` observation.
    value_column_name : str, default "series_value"
        Name of the DataFrame column that holds the per-series value arrays.

    Returns
    -------
    tuple
        ``(loaded_data, frequency, forecast_horizon, contain_missing_values,
        contain_equal_length)``. ``loaded_data`` has one row per series, one
        column per declared attribute plus ``value_column_name``. The four
        metadata items are ``None`` when the matching tag is absent.

    Raises
    ------
    Exception
        If the file is empty, the attribute section or ``@data`` tag is
        missing, a metadata line is malformed, an attribute type is
        unsupported, a data line has the wrong number of fields, or every
        value of a series is missing.
    ValueError
        If a numeric value, date, or boolean flag cannot be parsed.
    """
    col_names = []
    col_types = []
    all_data = {}
    all_series = []
    line_count = 0
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False
    found_data_section = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            # Only non-blank lines count towards the "Empty file." check.
            line_count += 1

            # ---- meta-data ------------------------------------------------
            if line.startswith("@"):
                if line.startswith("@data"):
                    if len(col_names) == 0:
                        raise Exception(
                            "Missing attribute section. Attribute section must come before data."
                        )
                    found_data_tag = True
                    continue

                line_content = line.split(" ")

                if line.startswith("@attribute"):
                    # Attributes have both name and type
                    if len(line_content) != 3:
                        raise Exception("Invalid meta-data specification.")
                    col_names.append(line_content[1])
                    col_types.append(line_content[2])
                    continue

                # Other meta-data have only values
                if len(line_content) != 2:
                    raise Exception("Invalid meta-data specification.")

                if line.startswith("@frequency"):
                    frequency = line_content[1]
                elif line.startswith("@horizon"):
                    forecast_horizon = int(line_content[1])
                elif line.startswith("@missing"):
                    contain_missing_values = _strtobool(line_content[1])
                elif line.startswith("@equallength"):
                    contain_equal_length = _strtobool(line_content[1])
                continue

            # ---- comments -------------------------------------------------
            if line.startswith("#"):
                continue

            # ---- series data ----------------------------------------------
            if len(col_names) == 0:
                raise Exception(
                    "Missing attribute section. Attribute section must come before data."
                )
            if not found_data_tag:
                raise Exception("Missing @data tag.")

            if not found_data_section:
                # First data line: create one value list per declared attribute.
                found_data_section = True
                for col in col_names:
                    all_data[col] = []

            full_info = line.split(":")

            if len(full_info) != (len(col_names) + 1):
                raise Exception("Missing attributes/values in series.")

            series = full_info[-1].split(",")

            # NOTE: str.split never returns an empty list, so this guard cannot
            # fire; an empty series surfaces as a ValueError from float("")
            # below. Kept as-is so the raised exception types do not change.
            if len(series) == 0:
                raise Exception(
                    "A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
                )

            numeric_series = [
                replace_missing_vals_with if val == "?" else float(val)
                for val in series
            ]

            if numeric_series.count(replace_missing_vals_with) == len(numeric_series):
                raise Exception(
                    "All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
                )

            all_series.append(pd.Series(numeric_series).array)

            # Convert every attribute field according to its declared type.
            # Currently only numeric, string and date are supported.
            for col_name, col_type, raw_value in zip(col_names, col_types, full_info):
                if col_type == "numeric":
                    att_val = int(raw_value)
                elif col_type == "string":
                    att_val = str(raw_value)
                elif col_type == "date":
                    att_val = datetime.strptime(raw_value, "%Y-%m-%d %H-%M-%S")
                else:
                    raise Exception("Invalid attribute type.")

                all_data[col_name].append(att_val)

    if line_count == 0:
        raise Exception("Empty file.")
    if len(col_names) == 0:
        raise Exception("Missing attribute section.")
    if not found_data_section:
        raise Exception("Missing series information under data section.")

    all_data[value_column_name] = all_series
    loaded_data = pd.DataFrame(all_data)

    return (
        loaded_data,
        frequency,
        forecast_horizon,
        contain_missing_values,
        contain_equal_length,
    )


def _strtobool(value):
    """Convert a textual truth value to ``bool``.

    Local replacement for ``distutils.util.strtobool``: ``distutils`` was
    removed from the standard library in Python 3.12, so the parser above must
    not depend on it. Accepts the same tokens, case-insensitively.

    Raises
    ------
    ValueError
        If ``value`` is not a recognised truth token.
    """
    normalized = value.lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError("invalid truth value %r" % (normalized,))

# Time-series clustering function

In [3]:
def cluster_ts(data: pd.DataFrame, n_clusters: int) -> pd.DataFrame:
    """Assign every time series in ``data`` to a k-means cluster.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``'series_value'`` column whose cells are sequences of
        observations (one sequence per row).
    n_clusters : int
        Number of clusters passed to ``KMeans``.

    Returns
    -------
    pd.DataFrame
        The same ``data`` object, with a ``'cluster'`` column holding the
        label predicted for each row. NOTE: ``data`` is modified in place.
    """
    # Expand the column of sequences into a matrix: one row per series,
    # one column per time step.
    # NOTE(review): series of unequal length are padded with NaN by the
    # DataFrame constructor; KMeans presumably rejects NaN, so equal-length
    # series appear to be expected here - confirm against the dataset.
    series_matrix = pd.DataFrame(data['series_value'].tolist())

    # Scale the data. The scaler is deliberately not named `ss`, which is the
    # alias this notebook gives to scipy.stats.
    scaler = StandardScaler()
    series_matrix_scaled = scaler.fit_transform(series_matrix)

    # Cluster the series; the fixed random_state keeps the labels reproducible.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(series_matrix_scaled)

    # Attach the labels to the caller's frame as a new column.
    data['cluster'] = clusters

    return data

# Function for plotting the clustered time series

In [5]:
def plot_clustered_series(
    data: pd.DataFrame,
    n_clusters: int,
    frequency: str,
    figsize=(14, 250),
):
    '''Plot every series of every cluster, one subplot per cluster.

    All series are drawn on a shared date axis that starts at the earliest
    ``'start_timestamp'`` found in ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with ``'start_timestamp'``, ``'series_value'`` and ``'cluster'``
        columns.
    n_clusters : int
        Number of subplots; cluster labels ``0 .. n_clusters - 1`` are drawn.
    frequency : str
        Must be a pandas offset alias, see
        https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases
        For example ``'W'`` = weekly.
    figsize : tuple, default (14, 250)
        Size of the whole figure in inches, forwarded to ``plt.subplots``.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    '''
    if data.empty:
        raise ValueError("data contains no series to plot")

    start_date = data['start_timestamp'].min()

    # Build a date axis long enough for the longest series. Each series is
    # plotted against a prefix of it, so series of different lengths no longer
    # fail with a shape mismatch. Taking the lengths from the values themselves
    # also avoids a label lookup (`[0]`) that breaks on a filtered index.
    longest = max(len(series) for series in data['series_value'])
    date_range = pd.date_range(start=start_date, periods=longest, freq=frequency)

    # squeeze=False always yields a 2-D array of Axes, so indexing also works
    # when n_clusters == 1.
    fig, axs = plt.subplots(n_clusters, 1, figsize=figsize, squeeze=False)

    # Walk through all clusters
    for i in range(n_clusters):
        ax = axs[i, 0]

        # Select the rows that belong to the current cluster
        cluster_data = data[data['cluster'] == i]

        for series in cluster_data['series_value']:
            ax.plot(date_range[:len(series)], series)

        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax.set_title(f'Кластер {i}')
        ax.set_xlabel('Дата')
        ax.set_ylabel('Количество')
        ax.grid(True)

    plt.tight_layout()

    plt.show()