In [0]:
from si.preprocessing import (
    positive_sum_of_changes,
    negative_sum_of_changes,
    SumColumnsTransformer,
)
from si.pipeline import (
    WindowTransformer,
    FeatureTransformer,
    FillMissingValues,
    back_propagate_labels,
)
import tsfresh.feature_extraction.feature_calculators as fc

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import pandas_udf
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import KMeans
from matplotlib import pyplot
import seaborn as sns



# Location of the source data in the metastore.
schema = "aisdk"
table_name = "state_identifier"

# Make `schema` the session's current database, then read the table and
# convert it to a pandas DataFrame in a single step. toPandas() collects
# every row onto the driver.
spark.sql(f"USE {schema}")
df = spark.table(f"{schema}.{table_name}").toPandas()

# Columns used as model input. "ph_sum" holds the flattened output of
# SumColumnsTransformer applied to those columns.
# NOTE(review): presumably the row-wise sum of ph1..ph3 -- confirm against
# si.preprocessing.
input_columns = ["ph1", "ph2", "ph3"]
df["ph_sum"] = (
    SumColumnsTransformer()
    .transform(df[input_columns].values)
    .flatten()
)


# Feature groups passed to FeatureTransformer as `function_list`. Each entry
# is a tuple of (integer, list of calculator functions).
# NOTE(review): going by the variable name, the integer is presumably the
# weight applied to that group's features -- confirm against FeatureTransformer.
weighted_feature_list = [
    # Level statistics (the only group with a leading 2).
    (2, [fc.maximum, fc.minimum, fc.mean]),
    # Spread.
    (1, [fc.variance, fc.standard_deviation]),
    # Total over the window.
    (1, [fc.sum_values]),
    # Overall amount of change.
    (1, [fc.absolute_sum_of_changes]),
    # Change split by direction (project helpers from si.preprocessing).
    (1, [positive_sum_of_changes, negative_sum_of_changes]),
    # Behaviour relative to the mean.
    (1, [fc.count_above_mean, fc.longest_strike_above_mean, fc.longest_strike_below_mean]),
]

# Two-stage model: a nested "preprocessing" pipeline followed by k-means
# clustering on its output.
pipe = Pipeline(
    steps=[
        (
            "preprocessing",
            Pipeline(
                steps=[
                    ("fillmissing", FillMissingValues("ffill")),
                    ("summarization", SumColumnsTransformer()),
                    # window_step equals window_size.
                    # NOTE(review): presumably yields non-overlapping windows -- confirm.
                    (
                        "windowing",
                        WindowTransformer(window_size=300, window_step=300),
                    ),
                    (
                        "featurization",
                        FeatureTransformer(function_list=weighted_feature_list),
                    ),
                    ("scaling", MinMaxScaler(feature_range=(0, 1))),
                ]
            ),
        ),
        # Fixed random_state keeps the clustering repeatable between runs.
        ("clustering", KMeans(n_clusters=3, random_state=0)),
    ]
)

# Raw input columns as a NumPy array; all preprocessing happens inside `pipe`.
x = df[input_columns].values

# Pipeline.fit() returns the fitted pipeline, so predicting on the same data
# can be chained directly onto it.
x_classes = pipe.fit(x).predict(x)

# Hand the frame, the fitted preprocessing stage and the predicted labels to
# back_propagate_labels and keep the frame it returns.
# NOTE(review): presumably this maps the per-window labels back onto the
# original rows (the plot below reads a "class" column) -- confirm.
df = back_propagate_labels(df, pipe.named_steps["preprocessing"], x_classes)

# Fixed colour for each label value so plots stay comparable between runs.
# NOTE(review): -1 presumably marks rows that received no cluster label -- confirm.
colormap = {
    -1: "white",
    0: "red",
    1: "green",
    2: "blue",
    3: "orange",
    4: "purple",
    5: "yellow",
}

# Scatter "ph_sum" against the frame's index, coloured by the "class" column.
_, ax = pyplot.subplots(figsize=(24, 12))
sns.scatterplot(
    data=df,
    x=df.index,
    y="ph_sum",
    hue="class",
    palette=colormap,
    ax=ax,
)



In [0]:
import mlflow
from mlflow.models import infer_signature

# Point the model registry at Databricks and name the registered model.
mlflow.set_registry_uri("databricks")
registered_name = "aisdk_model_state_identifier"
req_file = "requirements.txt"

# The first 300 input rows are logged as the input example.
# NOTE(review): 300 equals the window_size set on the pipeline's windowing
# step, so this is presumably exactly one window -- keep the two in sync.
input_example = df[input_columns].head(300)

# Run the example through the fitted pipeline so the signature is inferred
# from real model output.
pred_array = pipe.predict(input_example.values)
pred_df = pd.DataFrame(pred_array, columns=["prediction"])

# Derive the input/output schema from the example and its predictions.
signature = infer_signature(input_example, pred_df)

# Log the fitted pipeline inside a new run and register it under
# `registered_name`; pip requirements are read from `req_file`.
with mlflow.start_run():
    mlflow.sklearn.log_model(
        sk_model=pipe,
        artifact_path="model",
        registered_model_name=registered_name,
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )