In [14]:
from pyspark.sql import SparkSession
import mlflow

# Create (or reuse) a Spark session in local mode; the mlflow-spark package is
# requested through the `spark.jars.packages` setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.jars.packages", "org.mlflow:mlflow-spark:1.11.0")
    .getOrCreate()
)

# Point MLflow at the tracking URI below, then enable autologging for
# pyspark.ml so that estimator fits in later cells are recorded as runs.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.pyspark.ml.autolog()

# Leave the session as the last expression so the notebook displays it.
spark

In [15]:
# Six-row toy DataFrame: a numeric `id` plus one string `category` column.
df = spark.createDataFrame(
    [
        (0, 'a'),
        (1, 'b'),
        (2, 'c'),
        (3, 'a'),
        (4, 'a'),
        (5, 'c'),
    ],
    schema=['id', 'category'],
)
df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [16]:
from pyspark.ml.feature import StringIndexer

# Label-encode `category`: the estimator is fitted on `df`, and the resulting
# model is configured to write its indices to `category_index`.
indexer = StringIndexer(
    inputCol='category',
    outputCol='category_index',
)
indexer_model = indexer.fit(df)

# Show the fitted model's repr as the cell output.
indexer_model

2023/06/06 00:28:11 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'dbaf062e19d849f398302591de8ddc3e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


StringIndexerModel: uid=StringIndexer_6fd0a5b26d7a, handleInvalid=error

In [17]:
# Apply the fitted indexer; `category_index` is appended as a new column.
indexed_df = indexer_model.transform(df)

# show() prints the rows and returns None, so it is called as its own
# statement rather than packed into a tuple (which displayed a stray `None`).
indexed_df.show()

# Last expression: the DataFrame repr, i.e. its column names and types.
indexed_df

+---+--------+--------------+
| id|category|category_index|
+---+--------+--------------+
|  0|       a|           0.0|
|  1|       b|           2.0|
|  2|       c|           1.0|
|  3|       a|           0.0|
|  4|       a|           0.0|
|  5|       c|           1.0|
+---+--------+--------------+



(DataFrame[id: bigint, category: string, category_index: double], None)

In [18]:
from pyspark.ml.feature import IndexToString

converter = IndexToString(inputCol='category_index', outputCol='original_category')
converted = converter.transform(indexed_df)
converted.show()

+---+--------+--------------+-----------------+
| id|category|category_index|original_category|
+---+--------+--------------+-----------------+
|  0|       a|           0.0|                a|
|  1|       b|           2.0|                b|
|  2|       c|           1.0|                c|
|  3|       a|           0.0|                a|
|  4|       a|           0.0|                a|
|  5|       c|           1.0|                c|
+---+--------+--------------+-----------------+



In [19]:
# Rebind `df` to a frame with two string category columns for the
# multi-column examples that follow.
df = spark.createDataFrame(
    [
        (0, 'a', 'A'),
        (1, 'b', 'A'),
        (2, 'c', 'K'),
        (3, 'a', 'D'),
        (4, 'a', 'C'),
        (5, 'c', 'B'),
    ],
    schema=['id', 'category1', 'category2'],
)
df.show()

+---+---------+---------+
| id|category1|category2|
+---+---------+---------+
|  0|        a|        A|
|  1|        b|        A|
|  2|        c|        K|
|  3|        a|        D|
|  4|        a|        C|
|  5|        c|        B|
+---+---------+---------+



In [20]:
# One StringIndexer handling both category columns at once; each input column
# is paired with the output column at the same list position.
indexer = StringIndexer(
    inputCols=['category1', 'category2'],
    outputCols=['label_encoded1', 'label_encoded2'],
)
indexed_model = indexer.fit(df)
indexed_df = indexed_model.transform(df)
indexed_df.show()

2023/06/06 00:28:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7416f2b3d2ea4920943a8e041fedc3d5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
                                                                                

+---+---------+---------+--------------+--------------+
| id|category1|category2|label_encoded1|label_encoded2|
+---+---------+---------+--------------+--------------+
|  0|        a|        A|           0.0|           0.0|
|  1|        b|        A|           2.0|           0.0|
|  2|        c|        K|           1.0|           4.0|
|  3|        a|        D|           0.0|           3.0|
|  4|        a|        C|           0.0|           2.0|
|  5|        c|        B|           1.0|           1.0|
+---+---------+---------+--------------+--------------+



In [21]:
from pyspark.ml.feature import OneHotEncoder

# Two columns that already hold numeric category indices (0.0, 1.0, 2.0).
df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
    ],
    ['categoryIndex1', 'categoryIndex2']
)

# One-hot encode both index columns. With dropLast=True the highest index gets
# no slot of its own: in the printed output, index 2.0 becomes the empty
# sparse vector (2,[],[]).
encoder = OneHotEncoder(
    dropLast=True,
    inputCols=['categoryIndex1', 'categoryIndex2'],
    outputCols=['onehot_encoded1', 'onehot_encoded2'],
)
encoder_model = encoder.fit(df)
encoded_df = encoder_model.transform(df)

# show() prints the rows and returns None, so it is called as its own
# statement rather than packed into a tuple (which displayed a stray `None`).
encoded_df.show()

# Last expression: the DataFrame repr, i.e. its column names and types.
encoded_df

2023/06/06 00:28:16 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5dfa5db91d034a4dbc1d2ec224ab4006', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+--------------+--------------+---------------+---------------+
|categoryIndex1|categoryIndex2|onehot_encoded1|onehot_encoded2|
+--------------+--------------+---------------+---------------+
|           0.0|           1.0|  (2,[0],[1.0])|  (2,[1],[1.0])|
|           1.0|           0.0|  (2,[1],[1.0])|  (2,[0],[1.0])|
|           2.0|           1.0|      (2,[],[])|  (2,[1],[1.0])|
|           0.0|           2.0|  (2,[0],[1.0])|      (2,[],[])|
|           0.0|           1.0|  (2,[0],[1.0])|  (2,[1],[1.0])|
|           2.0|           0.0|      (2,[],[])|  (2,[0],[1.0])|
+--------------+--------------+---------------+---------------+



(DataFrame[categoryIndex1: double, categoryIndex2: double, onehot_encoded1: vector, onehot_encoded2: vector],
 None)

In [22]:
from pyspark.sql.utils import IllegalArgumentException

# Expected failure: OneHotEncoder requires numeric input columns, so fitting it
# directly on the string columns `category1` / `category2` is rejected.
df = spark.createDataFrame(
    [(0, 'a', 'A'), (1, 'b', 'A'), (2, 'c', 'K'), (3, 'a', 'D'), (4, 'a', 'C'), (5, 'c', 'B')],
    ['id', 'category1', 'category2']
)
df.show()
encoder = OneHotEncoder(inputCols=['category1', 'category2'], outputCols=['onehot_encoded1', 'onehot_encoded2'])
try:
    encoded_model = encoder.fit(df)
except IllegalArgumentException as exc:
    # The error is the point of this cell: display it instead of letting it
    # abort a "Restart & Run All" of the notebook.
    print(f'{type(exc).__name__}: {exc}')

+---+---------+---------+
| id|category1|category2|
+---+---------+---------+
|  0|        a|        A|
|  1|        b|        A|
|  2|        c|        K|
|  3|        a|        D|
|  4|        a|        C|
|  5|        c|        B|
+---+---------+---------+



2023/06/06 00:28:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b74a552072e446b3a80810640f088e1c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


IllegalArgumentException: requirement failed: Column category1 must be of type numeric but was actually of type string.

In [None]:
# Same two-category frame as before.
df = spark.createDataFrame(
    [
        (0, 'a', 'A'),
        (1, 'b', 'A'),
        (2, 'c', 'K'),
        (3, 'a', 'D'),
        (4, 'a', 'C'),
        (5, 'c', 'B'),
    ],
    schema=['id', 'category1', 'category2'],
)

# Step 1: string categories -> numeric label indices.
label_encoder = StringIndexer(
    inputCols=['category1', 'category2'],
    outputCols=['label_encoded1', 'label_encoded2'],
)
label_encoded_df = (
    label_encoder
    .fit(df)
    .transform(df)
)

# Step 2: numeric label indices -> one-hot vectors.
onehot_encoder = OneHotEncoder(
    inputCols=['label_encoded1', 'label_encoded2'],
    outputCols=['onehot_encoded1', 'onehot_encoded2'],
)
onehot_encoded_df = (
    onehot_encoder
    .fit(label_encoded_df)
    .transform(label_encoded_df)
)
onehot_encoded_df.show()

2023/06/05 23:57:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '182c5f821f5345dfaea270ae52ba4380', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
2023/06/05 23:57:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5261b9bee2804921b5a11c71499d5f2b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+---+---------+---------+--------------+--------------+---------------+---------------+
| id|category1|category2|label_encoded1|label_encoded2|onehot_encoded1|onehot_encoded2|
+---+---------+---------+--------------+--------------+---------------+---------------+
|  0|        a|        A|           0.0|           0.0|  (2,[0],[1.0])|  (4,[0],[1.0])|
|  1|        b|        A|           2.0|           0.0|      (2,[],[])|  (4,[0],[1.0])|
|  2|        c|        K|           1.0|           4.0|  (2,[1],[1.0])|      (4,[],[])|
|  3|        a|        D|           0.0|           3.0|  (2,[0],[1.0])|  (4,[3],[1.0])|
|  4|        a|        C|           0.0|           2.0|  (2,[0],[1.0])|  (4,[2],[1.0])|
|  5|        c|        B|           1.0|           1.0|  (2,[1],[1.0])|  (4,[1],[1.0])|
+---+---------+---------+--------------+--------------+---------------+---------------+



In [25]:
from pyspark.ml import Pipeline

# Stage 1 label-encodes the string columns; stage 2 one-hot encodes the
# resulting index columns.
stage_1 = StringIndexer(inputCols=['category1', 'category2'], outputCols=['label_encoded1', 'label_encoded2'])
stage_2 = OneHotEncoder(inputCols=['label_encoded1', 'label_encoded2'], outputCols=['onehot_encoded1', 'onehot_encoded2'])

pipeline = Pipeline(stages=[stage_1, stage_2])

# Keep the fitted PipelineModel in its own variable instead of discarding it
# inside a fit().transform() chain, so its stages can be inspected afterwards.
# The pipeline is still fitted exactly once.
pipeline_model = pipeline.fit(df)
onehot_encoded_df = pipeline_model.transform(df)

onehot_encoded_df.show()

2023/06/06 00:46:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1306d74a610c4186aa119d4cc5178775', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
23/06/06 00:46:40 WARN StringIndexerModel: Input column category1 does not exist during transformation. Skip StringIndexerModel for this column.
23/06/06 00:46:40 WARN StringIndexerModel: Input column category2 does not exist during transformation. Skip StringIndexerModel for this column.
23/06/06 00:46:42 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apac

+---+---------+---------+--------------+--------------+---------------+---------------+
| id|category1|category2|label_encoded1|label_encoded2|onehot_encoded1|onehot_encoded2|
+---+---------+---------+--------------+--------------+---------------+---------------+
|  0|        a|        A|           0.0|           0.0|  (2,[0],[1.0])|  (4,[0],[1.0])|
|  1|        b|        A|           2.0|           0.0|      (2,[],[])|  (4,[0],[1.0])|
|  2|        c|        K|           1.0|           4.0|  (2,[1],[1.0])|      (4,[],[])|
|  3|        a|        D|           0.0|           3.0|  (2,[0],[1.0])|  (4,[3],[1.0])|
|  4|        a|        C|           0.0|           2.0|  (2,[0],[1.0])|  (4,[2],[1.0])|
|  5|        c|        B|           1.0|           1.0|  (2,[1],[1.0])|  (4,[1],[1.0])|
+---+---------+---------+--------------+--------------+---------------+---------------+



                                                                                

In [26]:
# Inspect the fitted stages (requires a fitted PipelineModel bound to `pipeline_model`):
# pipeline_model.stages

[StringIndexerModel: uid=StringIndexer_9b7adfe11335, handleInvalid=error, numInputCols=2, numOutputCols=2,
 OneHotEncoderModel: uid=OneHotEncoder_4dcf732c2798, dropLast=true, handleInvalid=error, numInputCols=2, numOutputCols=2]

In [36]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and split it into the feature matrix and the targets.
iris = load_iris()
iris_data, iris_label = iris.data, iris.target

# Normalise the feature names into identifier-style column names by turning
# 'al ' into 'al_' and dropping the ' (cm)' suffix (giving e.g. sepal_length).
iris_columns = [
    feature_name.replace('al ', 'al_').replace(' (cm)', '')
    for feature_name in iris.feature_names
]

# Build the pandas frame (features plus a `label` column), then hand it to
# Spark.
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns).assign(label=iris_label)
iris_sdf = spark.createDataFrame(iris_pdf)
iris_sdf.show()

+------------+-----------+------------+-----------+-----+
|sepal_length|sepal_width|petal_length|petal_width|label|
+------------+-----------+------------+-----------+-----+
|         5.1|        3.5|         1.4|        0.2|    0|
|         4.9|        3.0|         1.4|        0.2|    0|
|         4.7|        3.2|         1.3|        0.2|    0|
|         4.6|        3.1|         1.5|        0.2|    0|
|         5.0|        3.6|         1.4|        0.2|    0|
|         5.4|        3.9|         1.7|        0.4|    0|
|         4.6|        3.4|         1.4|        0.3|    0|
|         5.0|        3.4|         1.5|        0.2|    0|
|         4.4|        2.9|         1.4|        0.2|    0|
|         4.9|        3.1|         1.5|        0.1|    0|
|         5.4|        3.7|         1.5|        0.2|    0|
|         4.8|        3.4|         1.6|        0.2|    0|
|         4.8|        3.0|         1.4|        0.1|    0|
|         4.3|        3.0|         1.1|        0.1|    0|
|         5.8|

In [37]:
from pyspark.ml.feature import StandardScaler
from pyspark.sql.utils import IllegalArgumentException

# Expected failure: StandardScaler requires a vector-typed input column, and
# `sepal_length` is a plain double column.
standard_scaler = StandardScaler(inputCol='sepal_length', outputCol='scaled_sepal_length')
try:
    standard_scaler_model = standard_scaler.fit(iris_sdf)
except IllegalArgumentException as exc:
    # The error is the point of this cell: display it instead of letting it
    # abort a "Restart & Run All" of the notebook.
    print(f'{type(exc).__name__}: {exc}')
else:
    # Reached only if the fit unexpectedly succeeds.
    standard_scaled_df = standard_scaler_model.transform(iris_sdf)
    standard_scaled_df.show()

2023/06/06 00:54:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ccf37601492a4605a5233327ff0bea59', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


IllegalArgumentException: requirement failed: Column sepal_length must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.DoubleType$:double.

In [38]:
from pyspark.ml.feature import VectorAssembler

# Wrap the scalar `sepal_length` column into a one-element vector column so
# that vector-based transformers can consume it.
vec_assembler = VectorAssembler(
    inputCols=['sepal_length'],
    outputCol='sepal_length_vector',
)
iris_sdf_vectorized = vec_assembler.transform(iris_sdf)
iris_sdf_vectorized.show()

+------------+-----------+------------+-----------+-----+-------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|sepal_length_vector|
+------------+-----------+------------+-----------+-----+-------------------+
|         5.1|        3.5|         1.4|        0.2|    0|              [5.1]|
|         4.9|        3.0|         1.4|        0.2|    0|              [4.9]|
|         4.7|        3.2|         1.3|        0.2|    0|              [4.7]|
|         4.6|        3.1|         1.5|        0.2|    0|              [4.6]|
|         5.0|        3.6|         1.4|        0.2|    0|              [5.0]|
|         5.4|        3.9|         1.7|        0.4|    0|              [5.4]|
|         4.6|        3.4|         1.4|        0.3|    0|              [4.6]|
|         5.0|        3.4|         1.5|        0.2|    0|              [5.0]|
|         4.4|        2.9|         1.4|        0.2|    0|              [4.4]|
|         4.9|        3.1|         1.5|        0.1|    0|       

In [41]:
# Scale the vectorized column. No withMean / withStd arguments are passed, so
# the scaler's default settings apply.
standard_scaler = StandardScaler(
    inputCol='sepal_length_vector',
    outputCol='scaled_sepal_length_vector',
)
standard_scaler_model = standard_scaler.fit(iris_sdf_vectorized)
standard_scaled_df = standard_scaler_model.transform(iris_sdf_vectorized)
standard_scaled_df.show()

2023/06/06 00:59:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '31cc1d1835ff4fdeaf629134bce7008d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+------------+-----------+------------+-----------+-----+-------------------+--------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|sepal_length_vector|scaled_sepal_length_vector|
+------------+-----------+------------+-----------+-----+-------------------+--------------------------+
|         5.1|        3.5|         1.4|        0.2|    0|              [5.1]|       [6.158928408838794]|
|         4.9|        3.0|         1.4|        0.2|    0|              [4.9]|      [5.9174018045706065]|
|         4.7|        3.2|         1.3|        0.2|    0|              [4.7]|       [5.675875200302419]|
|         4.6|        3.1|         1.5|        0.2|    0|              [4.6]|       [5.555111898168324]|
|         5.0|        3.6|         1.4|        0.2|    0|              [5.0]|         [6.0381651067047]|
|         5.4|        3.9|         1.7|        0.4|    0|              [5.4]|       [6.521218315241077]|
|         4.6|        3.4|         1.4|        0.3|    

In [42]:
# Same scaling as the previous cell, but with both centering (withMean) and
# unit-variance scaling (withStd) switched on explicitly.
standard_scaler = StandardScaler(
    inputCol='sepal_length_vector',
    outputCol='scaled_sepal_length_vector',
    withMean=True,
    withStd=True,
)
standard_scaler_model = standard_scaler.fit(iris_sdf_vectorized)
standard_scaled_df = standard_scaler_model.transform(iris_sdf_vectorized)
standard_scaled_df.show()

2023/06/06 01:00:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '877e45e99d114e95a86695d7ecd653b5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+------------+-----------+------------+-----------+-----+-------------------+--------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|sepal_length_vector|scaled_sepal_length_vector|
+------------+-----------+------------+-----------+-----+-------------------+--------------------------+
|         5.1|        3.5|         1.4|        0.2|    0|              [5.1]|      [-0.8976738791967...|
|         4.9|        3.0|         1.4|        0.2|    0|              [4.9]|      [-1.1392004834649...|
|         4.7|        3.2|         1.3|        0.2|    0|              [4.7]|      [-1.3807270877331...|
|         4.6|        3.1|         1.5|        0.2|    0|              [4.6]|      [-1.5014903898672...|
|         5.0|        3.6|         1.4|        0.2|    0|              [5.0]|       [-1.01843718133086]|
|         5.4|        3.9|         1.7|        0.4|    0|              [5.4]|      [-0.5353839727944...|
|         4.6|        3.4|         1.4|        0.3|    

In [43]:
# Assemble every iris feature column into a single `features` vector ...
vec_assembler = VectorAssembler(
    inputCols=iris_columns,
    outputCol='features',
)
# ... and standardize that vector with centering and unit-variance scaling.
standard_scaler = StandardScaler(
    inputCol='features',
    outputCol='standard_scaled_features',
    withMean=True,
    withStd=True,
)

iris_sdf_vectorized = vec_assembler.transform(iris_sdf)
standard_scaled_df = (
    standard_scaler
    .fit(iris_sdf_vectorized)
    .transform(iris_sdf_vectorized)
)
# truncate=False prints the full vectors instead of shortening them.
standard_scaled_df.show(truncate=False)

2023/06/06 01:06:09 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e5d4e209a7a44c42b8614fa61fe1cee4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|features         |standard_scaled_features                                                         |
+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|5.1         |3.5        |1.4         |0.2        |0    |[5.1,3.5,1.4,0.2]|[-0.8976738791967663,1.015601990713633,-1.33575163424152,-1.3110521482051305]    |
|4.9         |3.0        |1.4         |0.2        |0    |[4.9,3.0,1.4,0.2]|[-1.1392004834649536,-0.1315388120502606,-1.33575163424152,-1.3110521482051305]  |
|4.7         |3.2        |1.3         |0.2        |0    |[4.7,3.2,1.3,0.2]|[-1.3807270877331417,0.3273175090552973,-1.3923992862449774,-1.3110521482051305] |
|4.6         |3.1        |1.5         |0.2        |0

In [44]:
# Chain the assembler and the standard scaler defined earlier into a single
# Pipeline, then fit and apply it to the raw iris frame.
pipeline = Pipeline(
    stages=[
        vec_assembler,
        standard_scaler,
    ]
)
standard_scaled_df = (
    pipeline
    .fit(iris_sdf)
    .transform(iris_sdf)
)
standard_scaled_df.show(truncate=False)

2023/06/06 01:07:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b3fbb1874a544aeb860c45a27be26a0b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
23/06/06 01:07:20 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util

+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|features         |standard_scaled_features                                                         |
+------------+-----------+------------+-----------+-----+-----------------+---------------------------------------------------------------------------------+
|5.1         |3.5        |1.4         |0.2        |0    |[5.1,3.5,1.4,0.2]|[-0.8976738791967663,1.015601990713633,-1.33575163424152,-1.3110521482051305]    |
|4.9         |3.0        |1.4         |0.2        |0    |[4.9,3.0,1.4,0.2]|[-1.1392004834649536,-0.1315388120502606,-1.33575163424152,-1.3110521482051305]  |
|4.7         |3.2        |1.3         |0.2        |0    |[4.7,3.2,1.3,0.2]|[-1.3807270877331417,0.3273175090552973,-1.3923992862449774,-1.3110521482051305] |
|4.6         |3.1        |1.5         |0.2        |0

In [46]:
from pyspark.ml.feature import MinMaxScaler

# Assemble every iris feature column into a single `features` vector.
vec_assembler = VectorAssembler(
    inputCols=iris_columns,
    outputCol='features',
)
iris_sdf_vectorized = vec_assembler.transform(iris_sdf)

# Min-max scale the assembled vector into `minmax_scaled_features`.
minmax_scaler = MinMaxScaler(
    inputCol='features',
    outputCol='minmax_scaled_features',
)
minmax_scaled_df = (
    minmax_scaler
    .fit(iris_sdf_vectorized)
    .transform(iris_sdf_vectorized)
)
minmax_scaled_df.show(truncate=False)

2023/06/06 01:10:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6516a275ba7f47ceb2b947d3e5278c37', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+------------+-----------+------------+-----------+-----+-----------------+----------------------------------------------------------------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|features         |minmax_scaled_features                                                            |
+------------+-----------+------------+-----------+-----+-----------------+----------------------------------------------------------------------------------+
|5.1         |3.5        |1.4         |0.2        |0    |[5.1,3.5,1.4,0.2]|[0.22222222222222213,0.625,0.06779661016949151,0.04166666666666667]               |
|4.9         |3.0        |1.4         |0.2        |0    |[4.9,3.0,1.4,0.2]|[0.1666666666666668,0.41666666666666663,0.06779661016949151,0.04166666666666667]  |
|4.7         |3.2        |1.3         |0.2        |0    |[4.7,3.2,1.3,0.2]|[0.11111111111111119,0.5,0.05084745762711865,0.04166666666666667]                 |
|4.6         |3.1        |1.5         |0.2    

In [47]:
# Chain the assembler and the min-max scaler defined earlier into a single
# Pipeline, then fit and apply it to the raw iris frame.
pipeline = Pipeline(
    stages=[
        vec_assembler,
        minmax_scaler,
    ]
)
minmax_scaled_df = (
    pipeline
    .fit(iris_sdf)
    .transform(iris_sdf)
)
minmax_scaled_df.show(truncate=False)

2023/06/06 01:11:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a93a20263c494ca9970524f2a34c0a32', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow
23/06/06 01:11:37 ERROR Instrumentation: org.apache.hadoop.fs.UnsupportedFileSystemException: No FileSystem for scheme "mlflow-artifacts"
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:3443)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3466)
	at org.apache.hadoop.fs.FileSystem.access$300(FileSystem.java:174)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3574)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3521)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:540)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:673)
	at org.apache.spark.ml.util

+------------+-----------+------------+-----------+-----+-----------------+----------------------------------------------------------------------------------+
|sepal_length|sepal_width|petal_length|petal_width|label|features         |minmax_scaled_features                                                            |
+------------+-----------+------------+-----------+-----+-----------------+----------------------------------------------------------------------------------+
|5.1         |3.5        |1.4         |0.2        |0    |[5.1,3.5,1.4,0.2]|[0.22222222222222213,0.625,0.06779661016949151,0.04166666666666667]               |
|4.9         |3.0        |1.4         |0.2        |0    |[4.9,3.0,1.4,0.2]|[0.1666666666666668,0.41666666666666663,0.06779661016949151,0.04166666666666667]  |
|4.7         |3.2        |1.3         |0.2        |0    |[4.7,3.2,1.3,0.2]|[0.11111111111111119,0.5,0.05084745762711865,0.04166666666666667]                 |
|4.6         |3.1        |1.5         |0.2    