In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [2]:
# Load the bundled iris dataset and pull out the feature matrix and the label vector.
iris = load_iris()
iris_data, iris_label = iris.data, iris.target

# Quick sanity check: container types, shapes and the original feature names.
(
    type(iris_data),
    iris_data.shape,
    type(iris_label),
    iris_label.shape,
    iris.feature_names,
)

(numpy.ndarray,
 (150, 4),
 numpy.ndarray,
 (150,),
 ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'])

In [3]:
# Turn names such as 'sepal length (cm)' into identifiers such as 'sepal_length':
# drop the unit suffix, then join the two remaining words with an underscore.
iris_columns = [
    name.replace(' (cm)', '').replace('al ', 'al_')
    for name in iris.feature_names
]
iris_columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [4]:
# Build the labelled frame in one expression: the four feature columns plus the class id.
iris_pdf = pd.DataFrame(iris_data, columns=iris_columns).assign(target=iris_label)

# Display the per-class row counts together with the frame itself.
iris_pdf['target'].value_counts(), iris_pdf

(target
 0    50
 1    50
 2    50
 Name: count, dtype: int64,
      sepal_length  sepal_width  petal_length  petal_width  target
 0             5.1          3.5           1.4          0.2       0
 1             4.9          3.0           1.4          0.2       0
 2             4.7          3.2           1.3          0.2       0
 3             4.6          3.1           1.5          0.2       0
 4             5.0          3.6           1.4          0.2       0
 ..            ...          ...           ...          ...     ...
 145           6.7          3.0           5.2          2.3       2
 146           6.3          2.5           5.0          1.9       2
 147           6.5          3.0           5.2          2.0       2
 148           6.2          3.4           5.4          2.3       2
 149           5.9          3.0           5.1          1.8       2
 
 [150 rows x 5 columns])

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=11,
)

# scikit-learn's fit() returns the estimator itself, so construction and training can be chained.
dt_clf = DecisionTreeClassifier(random_state=11, max_depth=5).fit(X_train, y_train)

# Predict class ids for the held-out rows and print them.
pred = dt_clf.predict(X_test)
print('pred:', pred)

pred: [2 2 1 1 2 0 1 0 0 1 1 1 1 2 2 0 2 1 2 2 1 0 0 1 0 0 2 1 0 1]


In [6]:
from pyspark.sql import SparkSession

# Obtain a SparkSession via the builder (getOrCreate hands back an existing session when there is one).
spark = SparkSession.builder.getOrCreate()
# Bare expression so the notebook displays the session object.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/05 05:42:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/06/05 05:42:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [7]:
# Convert the pandas frame (features + target) into a Spark DataFrame.
iris_sdf = spark.createDataFrame(iris_pdf)
# NOTE(review): show() prints the first 10 rows as a side effect and evaluates to None,
# which is why the displayed tuple ends in None.
type(iris_sdf), iris_sdf.limit(10).show()

                                                                                

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         5.1|        3.5|         1.4|        0.2|     0|
|         4.9|        3.0|         1.4|        0.2|     0|
|         4.7|        3.2|         1.3|        0.2|     0|
|         4.6|        3.1|         1.5|        0.2|     0|
|         5.0|        3.6|         1.4|        0.2|     0|
|         5.4|        3.9|         1.7|        0.4|     0|
|         4.6|        3.4|         1.4|        0.3|     0|
|         5.0|        3.4|         1.5|        0.2|     0|
|         4.4|        2.9|         1.4|        0.2|     0|
|         4.9|        3.1|         1.5|        0.1|     0|
+------------+-----------+------------+-----------+------+



(pyspark.sql.dataframe.DataFrame, None)

In [8]:
# Randomly split the rows with 0.8 / 0.2 weights; the explicit seed keeps the split repeatable.
train_sdf, test_sdf = iris_sdf.randomSplit([0.8, 0.2], seed=42)
# Only the training split is marked for caching; test_sdf is not cached here.
train_sdf.cache()
# Row counts of the full set and of each split (the weights are approximate, so sizes need not be exactly 120/30).
iris_sdf.count(), train_sdf.count(), test_sdf.count()

                                                                                

(150, 125, 25)

In [9]:
# Print the leading rows of the training split for a visual check.
train_sdf.show()

+------------+-----------+------------+-----------+------+
|sepal_length|sepal_width|petal_length|petal_width|target|
+------------+-----------+------------+-----------+------+
|         4.6|        3.1|         1.5|        0.2|     0|
|         4.7|        3.2|         1.3|        0.2|     0|
|         5.0|        3.6|         1.4|        0.2|     0|
|         5.1|        3.5|         1.4|        0.2|     0|
|         5.4|        3.9|         1.7|        0.4|     0|
|         4.6|        3.4|         1.4|        0.3|     0|
|         4.8|        3.4|         1.6|        0.2|     0|
|         4.9|        3.1|         1.5|        0.1|     0|
|         5.0|        3.4|         1.5|        0.2|     0|
|         4.3|        3.0|         1.1|        0.1|     0|
|         5.1|        3.5|         1.4|        0.3|     0|
|         5.4|        3.9|         1.3|        0.4|     0|
|         5.7|        4.4|         1.5|        0.4|     0|
|         5.8|        4.0|         1.2|        0.2|     

In [10]:
from pyspark.ml.feature import VectorAssembler

# Pack the four measurement columns into a single vector column named 'features',
# which is the column the Spark ML estimators in the following cells read.
vec_assembler = VectorAssembler(inputCols=iris_columns, outputCol='features')

# Correctly spelled name, consistent with `test_feature_vector_df` used for the test split.
train_feature_vector_df = vec_assembler.transform(train_sdf)
# Backward-compatible alias: later cells still refer to the original misspelled name.
train_feautre_vector_df = train_feature_vector_df

# show() and printSchema() print as a side effect and evaluate to None,
# so the displayed tuple is (DataFrame type, None, None).
type(train_feature_vector_df), train_feature_vector_df.show(), train_feature_vector_df.printSchema()

+------------+-----------+------------+-----------+------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features|
+------------+-----------+------------+-----------+------+-----------------+
|         4.6|        3.1|         1.5|        0.2|     0|[4.6,3.1,1.5,0.2]|
|         4.7|        3.2|         1.3|        0.2|     0|[4.7,3.2,1.3,0.2]|
|         5.0|        3.6|         1.4|        0.2|     0|[5.0,3.6,1.4,0.2]|
|         5.1|        3.5|         1.4|        0.2|     0|[5.1,3.5,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|     0|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|     0|[4.6,3.4,1.4,0.3]|
|         4.8|        3.4|         1.6|        0.2|     0|[4.8,3.4,1.6,0.2]|
|         4.9|        3.1|         1.5|        0.1|     0|[4.9,3.1,1.5,0.1]|
|         5.0|        3.4|         1.5|        0.2|     0|[5.0,3.4,1.5,0.2]|
|         4.3|        3.0|         1.1|        0.1|     0|[4.3,3.0,1.1,0.1]|

(pyspark.sql.dataframe.DataFrame, None, None)

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier

# NOTE: DecisionTreeClassifier here is the pyspark.ml class; the import in this cell rebinds
# the name that earlier referred to scikit-learn's classifier.
# The estimator reads the assembled 'features' vector and the 'target' label, with depth capped at 5.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='target', maxDepth=5)
# fit() produces a separate fitted model object; `dt` itself stays the unfitted estimator.
dt_model = dt.fit(train_feautre_vector_df)
# Display the estimator and the fitted model side by side, with their types.
type(dt), dt, type(dt_model), dt_model

(pyspark.ml.classification.DecisionTreeClassifier,
 DecisionTreeClassifier_c63967842092,
 pyspark.ml.classification.DecisionTreeClassificationModel,
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_c63967842092, depth=5, numNodes=13, numClasses=3, numFeatures=4)

In [12]:
# Apply the same assembler to the held-out rows so they carry a 'features' column.
test_feature_vector_df = vec_assembler.transform(test_sdf)
# NOTE(review): `pred` earlier held the scikit-learn prediction array; it is rebound here to a Spark DataFrame
# that adds rawPrediction / probability / prediction columns to the test rows.
pred = dt_model.transform(test_feature_vector_df)
# show() prints the rows and evaluates to None, hence the None in the displayed tuple.
type(pred), pred.show()

+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.9|        3.0|         1.4|        0.2|     0|[4.9,3.0,1.4,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.4|        3.7|         1.5|        0.2|     0|[5.4,3.7,1.5,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.0|         1.4|        0.1|     0|[4.8,3.0,1.4,0.1]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.0|        3.2|         1.2|        0.2|     0|[5.0,3.2,1.2,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.5|        4.2|         1.4|        0.2|     0|[5.5,4

                                                                                

(pyspark.sql.dataframe.DataFrame, None)

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator that compares the 'prediction' column against the 'target' label column.
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='target', predictionCol='prediction', metricName='accuracy')
# Score the decision-tree predictions on the test split.
accuracy = evaluator_accuracy.evaluate(pred)
accuracy

                                                                                

0.96

In [14]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the same feature/label columns, limited to 10 iterations.
lr = LogisticRegression(featuresCol='features', labelCol='target', maxIter=10)

# Train on the assembled training frame, then score the assembled test frame.
lr_model = lr.fit(train_feautre_vector_df)
preds = lr_model.transform(test_feature_vector_df)
# truncate=False prints the full vector values instead of shortened cells.
preds.show(truncate=False)

23/06/05 05:42:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

+------------+-----------+------------+-----------+------+-----------------+-----------------------------------------------------------+-----------------------------------------------------------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|features         |rawPrediction                                              |probability                                                      |prediction|
+------------+-----------+------------+-----------+------+-----------------+-----------------------------------------------------------+-----------------------------------------------------------------+----------+
|4.9         |3.0        |1.4         |0.2        |0     |[4.9,3.0,1.4,0.2]|[15.48225282549579,8.023363630935137,-23.50561645643093]   |[0.9994240359569829,5.759640430169748E-4,1.1682433017405761E-17] |0.0       |
|4.4         |2.9        |1.4         |0.2        |0     |[4.4,2.9,1.4,0.2]|[16.842849444821244,6.515549924162652,-23.3583993689839]   |[0.99996

In [15]:
# NOTE(review): identical evaluator configuration to the one built for the decision tree; it is rebuilt here.
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='target', predictionCol='prediction', metricName='accuracy')
# Score the logistic-regression predictions; this overwrites the decision-tree value stored in `accuracy`.
accuracy = evaluator_accuracy.evaluate(preds)
accuracy

0.96

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

In [19]:
# Re-display the feature column names; they are the VectorAssembler inputs in the pipeline built next.
iris_columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [22]:
# Stage 1 assembles the feature columns into a 'features' vector; stage 2 is the decision tree that reads it.
stage_1 = VectorAssembler(inputCols=iris_columns, outputCol='features')
stage_2 = DecisionTreeClassifier(featuresCol='features', labelCol='target', maxDepth=5)

# The pipeline is fitted directly on the raw training split, so no pre-assembled frame is passed in.
pipeline = Pipeline(stages=[stage_1, stage_2])
pipeline_model = pipeline.fit(train_sdf)
# Display the types of the unfitted pipeline and of the fitted result.
type(pipeline), type(pipeline_model)

(pyspark.ml.pipeline.Pipeline, pyspark.ml.pipeline.PipelineModel)

In [23]:
# Run the raw test split through the fitted pipeline (assembly + prediction in one call).
predictions = pipeline_model.transform(test_sdf)
predictions.show()

+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.9|        3.0|         1.4|        0.2|     0|[4.9,3.0,1.4,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.4|        3.7|         1.5|        0.2|     0|[5.4,3.7,1.5,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.0|         1.4|        0.1|     0|[4.8,3.0,1.4,0.1]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.0|        3.2|         1.2|        0.2|     0|[5.0,3.2,1.2,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.5|        4.2|         1.4|        0.2|     0|[5.5,4

In [24]:
# Same accuracy evaluator configuration as in the earlier evaluation cells, rebuilt here.
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol='target', predictionCol='prediction', metricName='accuracy')
# Score the pipeline's predictions on the test split.
accuracy = evaluator_accuracy.evaluate(predictions)
accuracy

0.96

In [25]:
# List the stages held by the fitted pipeline model (the assembler and the fitted tree model).
pipeline_model.stages

[VectorAssembler_ba1fbc117635,
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3ae85c253f5b, depth=5, numNodes=13, numClasses=3, numFeatures=4]

In [28]:
# Pull the stages back out of the fitted pipeline: the assembler is first, the tree model is last.
# Both names are rebound here, replacing the objects created in the step-by-step workflow earlier.
vec_assembler, dt_model = pipeline_model.stages[0], pipeline_model.stages[-1]

vec_assembler, dt_model

(VectorAssembler_ba1fbc117635,
 DecisionTreeClassificationModel: uid=DecisionTreeClassifier_3ae85c253f5b, depth=5, numNodes=13, numClasses=3, numFeatures=4)

In [29]:
# Repeat the test-set scoring step by step with vec_assembler and dt_model as currently bound
# (the preceding cell rebinds them to the fitted pipeline's stages).
test_feature_vector_df = vec_assembler.transform(test_sdf)
predictions = dt_model.transform(test_feature_vector_df)
# Relies on the evaluator_accuracy object created in an earlier cell.
accuracy = evaluator_accuracy.evaluate(predictions)
# show() prints the rows and evaluates to None, so the displayed tuple starts with None.
predictions.show(), accuracy



+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|target|         features| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+------+-----------------+--------------+-------------+----------+
|         4.9|        3.0|         1.4|        0.2|     0|[4.9,3.0,1.4,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.4|        2.9|         1.4|        0.2|     0|[4.4,2.9,1.4,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.4|        3.7|         1.5|        0.2|     0|[5.4,3.7,1.5,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         4.8|        3.0|         1.4|        0.1|     0|[4.8,3.0,1.4,0.1]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.0|        3.2|         1.2|        0.2|     0|[5.0,3.2,1.2,0.2]|[42.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|         5.5|        4.2|         1.4|        0.2|     0|[5.5,4

                                                                                

(None, 0.96)