In [101]:
# Imports
import pandas as pd
from cdsgd import DSClustering
from os.path import join
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [102]:
LENGTH = 24
KEYS = 100

In [103]:
# Lectura de datos
PATH = "/home/erick/git/tesis-magister-dcc/data/processed/"

# series de tiempo
output_df = pd.read_csv(join(PATH, "output_df.csv"), index_col=0)

# etiquetas kmeans por cliente
labels = pd.read_csv(join(PATH, "labels_df.csv"), index_col=0)

# transformar datos serie de tiempo a matriz (y cortar)
data = (output_df[output_df["datetime"] <= "2023-07-02"]
 .merge(labels, on="key")
 .pivot_table(columns="datetime", values="value", index=["key", "label"], aggfunc="mean", fill_value=-1)
 .reset_index()
 )

# las llaves de la muestra
keys = data[["key"]]

# la etiqueta de kmeans es la variable a predecir
y_custom = data["label"]

# la matriz de features es la matriz de serie de tiempo
X_custom = data.drop(columns=["label", "key"])
X_custom.columns = [f"Hora_{i}" for i in range(0, LENGTH)]

In [104]:
# agregar informacion de Comuna y Alimentador por cliente como OneHotEncoder

# lectura de datos de comuna y alimentador
key_comuna = pd.read_csv(join(PATH, "key_comuna_feeder.csv"),
                         index_col=0, dtype={"nro_medidor": str}
                         ).rename(columns={"nro_medidor": "key"})

# agregar metodo a la clase (compatibilidad)
def set_output(self, *, transform=None):
    """Set output container.

    See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
    for an example on how to use the API.

    Parameters
    ----------
    transform : {"default", "pandas", "polars"}, default=None
        Configure output of `transform` and `fit_transform`.

        - `"default"`: Default output format of a transformer
        - `"pandas"`: DataFrame output
        - `"polars"`: Polars output
        - `None`: Transform configuration is unchanged

        .. versionadded:: 1.4
            `"polars"` option was added.

    Returns
    -------
    self : estimator instance
        Estimator instance.
    """
    if transform is None:
        return self

    if not hasattr(self, "_sklearn_output_config"):
        self._sklearn_output_config = {}

    self._sklearn_output_config["transform"] = transform
    return self

OneHotEncoder.set_output = set_output

# crear OneHotEncoder
encoder = OneHotEncoder(sparse=False).set_output(transform="pandas")

In [105]:
# modificar tipo de columna
keys["key"] = keys["key"].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keys["key"] = keys["key"].astype(str)


In [106]:
# OneHotEncoding de comuna y feeder para cada key
data_encoded = encoder.fit_transform(keys.merge(key_comuna, on="key", how="inner")[["comuna", "feeder"]])

In [107]:
## agregar a la matriz de caracteristicas el OneHotEncoding (simplemente unir lado a lado)
#X_custom = pd.concat([X_custom, pd.DataFrame(data_encoded)], axis=1, ignore_index=True)

In [108]:
print("DSClustering begin")
print(f"Dimensions: {X_custom.shape}")

# Instantiate DSClustering
# Form 1 - Default instantiation with just the feature matrix
# ds1 = DSClustering(X_custom, max_iter=1000)
# Form 2 - Instantiation with a parameter to consider the most voted features
# ds2 = DSClustering(X_custom, most_voted=True, max_iter=1000)
# Form 3 - Instantiation with a numeric parameter
ds3 = DSClustering(X_custom, 3, max_iter=1000)

# Apply the method to generate categorical rules
# ds1.generate_categorical_rules()  # Generate rules for the first instance
# ds2.generate_categorical_rules()  # Generate rules for the second instance
ds3.generate_categorical_rules()  # Generate rules for the third instance

# Apply the predict method (internally finalizes the classification model)
# labels1 = ds1.predict()  # Predict labels using the first set of rules
# labels2 = ds2.predict()  # Predict labels using the second set of rules
labels3 = ds3.predict()  # Predict labels using the third set of rules

# Apply the method to print the most important rules
# ds1.print_most_important_rules()  # Print rules from the first model
# ds2.print_most_important_rules()  # Print rules from the second model
ds3.print_most_important_rules()  # Print rules from the third model

# Apply the method to print metrics
# is needed to encode y_custom to be use in this method
ds1.metrics()  # Print metrics for the first model
ds2.metrics()  # Print metrics for the second model
ds3.metrics()  # Print metrics for the third model


DSClustering begin
Dimensions: (268, 24)
Optimization started
Processing epoch	32	0.2184	
Training time: 2.99s, epochs: 52

Least training loss reached: 0.006


Most important rules for Cluster 0

	[0.299] R602: Negative Hora_12 - 181.089, Hora_13 - 106.907
			0: 0.295	1: 0.000	2: 0.008	Unc: 0.697

	[0.299] R645: Positive Hora_13 - 106.907, Hora_22 - 183.864
			0: 0.293	1: 0.000	2: 0.011	Unc: 0.696

	[0.297] R355: Positive Hora_5 - 99.542, Hora_9 - 155.336
			0: 0.290	1: 0.012	2: 0.002	Unc: 0.696

	[0.296] R313: Positive Hora_4 - 92.741, Hora_9 - 155.336
			0: 0.289	1: 0.004	2: 0.011	Unc: 0.696

	[0.293] R747: Positive Hora_18 - 262.750, Hora_23 - 174.374
			0: 0.283	1: 0.005	2: 0.017	Unc: 0.695

	[0.293] R680: Negative Hora_15 - 131.467, Hora_16 - 118.201
			0: 0.284	1: 0.000	2: 0.018	Unc: 0.698

	[0.292] R611: Positive Hora_12 - 181.089, Hora_18 - 262.750
			0: 0.280	1: 0.002	2: 0.022	Unc: 0.695

	[0.292] R271: Positive Hora_3 - 84.430, Hora_10 - 124.201
			0: 0.280	1: 0.019	2: 0.005