## PySpark2PMML
### Python package for converting Apache Spark ML pipelines to PMML.
### Source: <https://github.com/jpmml/pyspark2pmml>

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import RFormula
from pyspark2pmml import PMMLBuilder


In [2]:
# Jars in this directory are added to both the driver and executor classpaths
spark_jars = "../jars/*"

# Maven coordinates handed to Spark via `spark.jars.packages`
spark_packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2",
    "org.apache.kafka:kafka-clients:3.2.3",
    "org.jpmml:pmml-sparkml:2.2.0",
    "org.jpmml:pmml-sparkml-lightgbm:2.2.0",
    "org.jpmml:pmml-sparkml-xgboost:2.2.0",
]

# Local-mode session using every available core, with 2g each for driver and executor
spark = (
    SparkSession.builder
    .appName("PMML for Spark-ML")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.extraClassPath", spark_jars)
    .config("spark.executor.extraClassPath", spark_jars)
    .config("spark.jars.packages", ",".join(spark_packages))
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [3]:
# Read the iris CSV: the first row supplies column names and column types are inferred
df = (
    spark.read.format("csv")
    .options(inferSchema="true", header="true")
    .load("../data/iris.csv")
)

# Print a preview of the loaded rows
df.show()

+------------+-----------+------------+-----------+-------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [4]:
# Stage 1: R-style formula with Species as the response and every other column as a term
formula = RFormula(formula="Species ~ .")
# Stage 2: decision tree classifier with its default parameters
classifier = DecisionTreeClassifier()

# Assemble the two stages and fit them on the loaded DataFrame
pipeline = Pipeline().setStages([formula, classifier])
pipelineModel = pipeline.fit(df)

In [5]:
# The fitted classifier is the second stage of the pipeline model (index 1)
classifierModel = pipelineModel.stages[1]

# Build the PMML converter and attach conversion options to the classifier stage
pmmlBuilder = (
    PMMLBuilder(spark.sparkContext, df, pipelineModel)
    .putOption(classifierModel, "compact", False)
    .putOption(classifierModel, "estimate_featureImportances", True)
)

# Write the PMML document; the returned value is shown as the cell output
pmmlBuilder.buildFile("../data/DecisionTreeIris.pmml")

'/Users/Shared/dmmil/development/jupyter-app/jupyter-app/python-libararies/../data/DecisionTreeIris.pmml'

In [6]:
# Stop the Spark session created earlier in this notebook
spark.stop()