In [0]:
import sys
# Emit the interpreter version string so the runtime behind this run is visible in the cell output.
print(sys.version, file=sys.stdout)

In [0]:

import sys
# Root folder appended to sys.path so the project imports that follow can be resolved.
# NOTE(review): this is a user-specific absolute workspace path — consider making it configurable.
MLOPS_ROOT = "/Workspace/Users/djaci.araujo@siemens.com/databricks-aisdk/mlops"
# Guarded so that re-running this cell does not keep appending duplicate entries.
if MLOPS_ROOT not in sys.path:
    sys.path.append(MLOPS_ROOT)

from si.preprocessing import (
    positive_sum_of_changes,
    negative_sum_of_changes,
    SumColumnsTransformer,
)
import tsfresh.feature_extraction.feature_calculators as fc

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import pandas_udf
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import KMeans
from state_identifier.src.si.pipeline import (
    WindowTransformer,
    FeatureTransformer,
    FillMissingValues,
    back_propagate_labels,
)
from matplotlib import pyplot
import seaborn as sns



@pandas_udf(DoubleType())
def sumcols_udf(*cols: pd.Series) -> pd.Series:
    """Combine several columns into one double column via ``SumColumnsTransformer``.

    Args:
        *cols: One pandas Series per input column, all belonging to the same
            batch of rows.

    Returns:
        A pandas Series holding the flattened transformer output for the batch.
    """
    # Lay the incoming Series side by side: one DataFrame column per argument.
    data = pd.concat(cols, axis=1)
    # The previous body read a global `transformer` that is never defined in this
    # notebook (NameError on first call). Build the transformer locally instead,
    # and hand it a plain array exactly like the pandas-side `ph_sum` computation.
    summed = SumColumnsTransformer().transform(data.values)
    # The transformer output is expected to be 2D (per the original comment —
    # TODO confirm); ravel it to the 1D shape a scalar pandas UDF must return.
    return pd.Series(summed.ravel())


# Location of the source table.
schema = "aisdk"
table_name = "state_identifier"

# Switch the session to the schema, then pull the whole table into a pandas DataFrame.
spark.sql(f"USE {schema}")
df = spark.table(f"{schema}.{table_name}").toPandas()

# Columns handed to the transformer here and to the clustering pipeline further down.
input_columns = ["ph1", "ph2", "ph3"]

# Add the combined signal as an extra column (it is the y-axis of the scatter plot).
# Spark-side alternative, to be applied before toPandas():
#   df.withColumn("ph_sum", sumcols_udf(*[F.col(c) for c in input_columns]))
df["ph_sum"] = (
    SumColumnsTransformer()
    .transform(df[input_columns].values)
    .flatten()
)

# Calculator groups passed to FeatureTransformer as `function_list`.
# Each entry is an (integer, [calculators]) pair; the integer is presumably the
# weight given to that group — TODO confirm against FeatureTransformer.
weighted_feature_list = [
    # level statistics
    (2, [fc.maximum, fc.minimum, fc.mean]),
    # spread
    (1, [fc.variance, fc.standard_deviation]),
    # total
    (1, [fc.sum_values]),
    # overall amount of change
    (1, [fc.absolute_sum_of_changes]),
    # project-defined change sums, split by direction
    (1, [positive_sum_of_changes, negative_sum_of_changes]),
    # position relative to the mean
    (1, [fc.count_above_mean, fc.longest_strike_above_mean, fc.longest_strike_below_mean]),
]

# Two-stage model: a nested "preprocessing" pipeline followed by KMeans clustering.
# The preprocessing stage is kept as its own Pipeline so it can be addressed by
# name later (it is passed to back_propagate_labels after fitting).
pipe = Pipeline(
    steps=[
        (
            "preprocessing",
            Pipeline(
                steps=[
                    # forward-fill strategy for missing values
                    ("fillmissing", FillMissingValues("ffill")),
                    # summarizes the variables into one variable
                    ("summarization", SumColumnsTransformer()),
                    # window_step equals window_size (300 each)
                    ("windowing", WindowTransformer(window_size=300, window_step=300)),
                    ("featurization", FeatureTransformer(function_list=weighted_feature_list)),
                    ("scaling", MinMaxScaler(feature_range=(0, 1))),
                ]
            ),
        ),
        # fixed random_state keeps the clustering reproducible across runs
        ("clustering", KMeans(n_clusters=3, random_state=0)),
    ]
)

# Train on the raw input columns and predict labels for that same data.
x = df[input_columns].values
x_classes = pipe.fit(x).predict(x)

# Hand the predicted labels and the fitted preprocessing stage to the project helper;
# its result replaces df (the plot reads a "class" column from it, so the helper
# presumably adds that column — TODO confirm).
df = back_propagate_labels(df, pipe.named_steps["preprocessing"], x_classes)

# Fixed colour per class label so colours stay stable between runs.
# -1 is presumably the marker for rows without a label — TODO confirm in
# back_propagate_labels. KMeans is configured with 3 clusters, so labels 3-5
# are not expected with the current settings.
colormap = {
    -1: "white",
    0: "red",
    1: "green",
    2: "blue",
    3: "orange",
    4: "purple",
    5: "yellow",
}

# Bind the figure to a real name: assigning to `_` shadows IPython's
# "last output" variable and stops it from being updated for the session.
fig, ax = pyplot.subplots(figsize=(24, 12))
sns.scatterplot(x=df.index, y="ph_sum", data=df, hue="class", palette=colormap, ax=ax)
# Title and axis labels let the figure stand alone; the trailing semicolon keeps
# the notebook from echoing the return value of the last expression.
ax.set(title="ph_sum coloured by assigned class", xlabel="row index", ylabel="ph_sum");







In [0]:
import mlflow
from mlflow.models import infer_signature

# Registration settings.
registered_name = "aisdk_model_state_identifier"
req_file = "/Workspace/Users/djaci.araujo@siemens.com/databricks-aisdk/mlops/state_identifier/src/requirements.txt"

# Target the Databricks model registry.
mlflow.set_registry_uri("databricks")

# NOTE(review): no signature / input_example is logged. The earlier draft was:
#   X_example = df[input_columns].head(5)
#   sig = infer_signature(X_example, pipe.predict(X_example))
#   ... input_example=X_example.iloc[:1], signature=sig
# NOTE(review): the pipeline references custom classes (e.g., SumColumnsTransformer);
# make sure their modules are importable at serve time, for example via
#   code_paths=["/Workspace/Repos/you/project/src"]  # or infer_code_paths=True
with mlflow.start_run():
    # Log the fitted pipeline and register it under `registered_name` in one call.
    mlflow.sklearn.log_model(
        sk_model=pipe,
        artifact_path="model",
        registered_model_name=registered_name,
        pip_requirements=req_file,
    )