<a href="https://colab.research.google.com/github/dsirt/Data-Engineering-Bootcamp/blob/main/DecisionTreeClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
#Decision Tree Classification
!pip install delta-spark==2.4.0


Collecting delta-spark==2.4.0
  Downloading delta_spark-2.4.0-py3-none-any.whl.metadata (1.9 kB)
Collecting pyspark<3.5.0,>=3.4.0 (from delta-spark==2.4.0)
  Downloading pyspark-3.4.4.tar.gz (311.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.4/311.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading delta_spark-2.4.0-py3-none-any.whl (20 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.4-py2.py3-none-any.whl size=311905460 sha256=49df0176ba3e140cb0fbacc113713ea216769259423fc3b261a3c15a8a5eac5d
  Stored in directory: /root/.cache/pip/wheels/6b/0a/a1/2b8f5f192c7df9fdceb8e5a62873d64e46b101f980519bcf55
Successfully built pyspark
Installing collected packages: pyspark, delta-spark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.4
    Uninstalling pyspark-3.5

In [19]:
from delta import *

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session with the Delta Lake package, SQL extension
# and catalog implementation configured.
spark = (
    SparkSession.builder
    .appName('DT Classification with Pyspark')
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Load the wine CSV: the first row is the header and column types are inferred.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferschema', 'true')
    .load("/content/WineData.csv")
)

# Show the inferred schema and a small sample of rows.
df.printSchema()
df.show(5)

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|          

In [2]:
def condition(r):
    """Bucket a numeric wine-quality score into a coarse class label.

    Args:
        r: Quality score (int or float), or ``None`` for a missing value.

    Returns:
        ``'low'`` for ``0 <= r <= 4``, ``'medium'`` for ``4 < r <= 6`` and
        ``'high'`` for any other number (scores above 6 and, as before,
        negative scores). ``None`` is returned for a ``None`` input so a
        null score yields a null label.
    """
    # A null score would make the chained comparisons below raise TypeError.
    if r is None:
        return None

    if 0 <= r <= 4:
        return 'low'

    if 4 < r <= 6:
        return 'medium'

    return 'high'

def string_to_float(x):
    """Convert ``x`` to a Python float via the built-in ``float`` constructor."""
    as_float = float(x)
    return as_float

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, DoubleType
# Wrap the plain-Python helpers as Spark UDFs with explicit return types.
string_to_float_udf = udf(string_to_float, returnType=DoubleType())
quality_udf = udf(lambda score: condition(score), returnType=StringType())

# Replace the numeric `quality` column with its string class label.
# NOTE: this overwrites `df` in place, so re-running the cell would feed the
# already-converted string labels back into `condition`.
df = df.withColumn("quality", quality_udf(df["quality"]))
df.show(5)
df.printSchema()

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4| medium|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8| medium|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8| medium|
|         11.2|            0.28|       0.56|           1.9|    0.075|               17.0|           

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
def transData(data):
    """Reshape ``data`` into a two-column DataFrame named (features, label).

    Every column except the last is packed into one dense vector; the last
    column is carried over unchanged as the label.
    """
    def _to_features_and_label(row):
        # All leading fields become the feature vector, the final field the label.
        return [Vectors.dense(row[:-1]), row[-1]]

    return data.rdd.map(_to_features_and_label).toDF(['features', 'label'])

transformed = transData(df)
transformed.show(5)

+--------------------+------+
|            features| label|
+--------------------+------+
|[7.4,0.7,0.0,1.9,...|medium|
|[7.8,0.88,0.0,2.6...|medium|
|[7.8,0.76,0.04,2....|medium|
|[11.2,0.28,0.56,1...|medium|
|[7.4,0.7,0.0,1.9,...|medium|
+--------------------+------+
only showing top 5 rows



In [6]:
from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString
# Fit an indexer that maps each string label to a numeric index, then preview it.
labelIndexer = StringIndexer(
    inputCol='label',
    outputCol='indexedLabel',
).fit(transformed)
labelIndexer.transform(transformed).show(5, True)

# Fit a vector indexer over the feature vectors (maxCategories = 4), then preview it.
featureIndexer = VectorIndexer(
    inputCol='features',
    outputCol='indexedFeatures',
    maxCategories=4,
).fit(transformed)
featureIndexer.transform(transformed).show(5)

+--------------------+------+------------+
|            features| label|indexedLabel|
+--------------------+------+------------+
|[7.4,0.7,0.0,1.9,...|medium|         0.0|
|[7.8,0.88,0.0,2.6...|medium|         0.0|
|[7.8,0.76,0.04,2....|medium|         0.0|
|[11.2,0.28,0.56,1...|medium|         0.0|
|[7.4,0.7,0.0,1.9,...|medium|         0.0|
+--------------------+------+------------+
only showing top 5 rows

+--------------------+------+--------------------+
|            features| label|     indexedFeatures|
+--------------------+------+--------------------+
|[7.4,0.7,0.0,1.9,...|medium|[7.4,0.7,0.0,1.9,...|
|[7.8,0.88,0.0,2.6...|medium|[7.8,0.88,0.0,2.6...|
|[7.8,0.76,0.04,2....|medium|[7.8,0.76,0.04,2....|
|[11.2,0.28,0.56,1...|medium|[11.2,0.28,0.56,1...|
|[7.4,0.7,0.0,1.9,...|medium|[7.4,0.7,0.0,1.9,...|
+--------------------+------+--------------------+
only showing top 5 rows



In [7]:
# 60/40 train/test split. Without a seed Spark picks a random one on every
# call, so the split (and every metric computed from it) would change on each
# run; a fixed seed makes the split repeatable for the same input data.
(trainingData, testData) = transformed.randomSplit([0.6, 0.4], seed=42)
trainingData.show(5)
testData.show(5)

+--------------------+------+
|            features| label|
+--------------------+------+
|[4.6,0.52,0.15,2....|   low|
|[4.7,0.6,0.17,2.3...|medium|
|[4.9,0.42,0.0,2.1...|  high|
|[5.0,0.38,0.01,1....|medium|
|[5.0,0.4,0.5,4.3,...|medium|
+--------------------+------+
only showing top 5 rows

+--------------------+------+
|            features| label|
+--------------------+------+
|[5.0,0.42,0.24,2....|  high|
|[5.0,1.04,0.24,1....|medium|
|[5.1,0.42,0.0,1.8...|  high|
|[5.1,0.47,0.02,1....|medium|
|[5.1,0.51,0.18,2....|  high|
+--------------------+------+
only showing top 5 rows



In [8]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree estimator reading the indexed feature and label columns.
dTree = DecisionTreeClassifier(
    featuresCol='indexedFeatures',
    labelCol='indexedLabel',
)

In [9]:
#Pipeline Architecture

# Map numeric predictions back to the original string labels.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol='predictedLabel',
    labels=labelIndexer.labels,
)

# Chain the indexers, the classifier and the label converter into one pipeline.
pipeline = Pipeline(
    stages=[labelIndexer, featureIndexer, dTree, labelConverter],
)

# Fit on the training split, then score the test split.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

predictions.select('features', 'label', "predictedLabel").show(5)

+--------------------+------+--------------+
|            features| label|predictedLabel|
+--------------------+------+--------------+
|[5.0,0.42,0.24,2....|  high|        medium|
|[5.0,1.04,0.24,1....|medium|        medium|
|[5.1,0.42,0.0,1.8...|  high|          high|
|[5.1,0.47,0.02,1....|medium|        medium|
|[5.1,0.51,0.18,2....|  high|          high|
+--------------------+------+--------------+
only showing top 5 rows



In [10]:
#Evaluation

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the predicted class index against the indexed true label.
evaluator = MulticlassClassificationEvaluator(labelCol = 'indexedLabel', predictionCol = 'prediction', metricName = 'accuracy')
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# The fitted classifier is the second-to-last pipeline stage (the last stage is
# the label converter). It is a decision tree model, so expose it as `dtModel`;
# `rfModel` is kept as an alias for any code that still uses the old name.
dtModel = model.stages[-2]
rfModel = dtModel
print(dtModel)

Test Error = 0.143345
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5939946d6e55, depth=5, numNodes=49, numClasses=3, numFeatures=11


In [11]:
from sklearn.metrics import confusion_matrix

# Collect both label columns in a single Spark action. Two separate
# `toPandas()` calls would evaluate the prediction pipeline twice and rely on
# both collects returning rows in the same order.
labels_pdf = predictions.select("label", "predictedLabel").toPandas()

# Single-column frames, matching the shape the two separate collects produced.
y_true = labels_pdf[["label"]]
y_pred = labels_pdf[["predictedLabel"]]

# Rows are true classes, columns are predicted classes. No `labels=` argument
# is passed, so scikit-learn orders both axes by the sorted class names.
cnf_matrix = confusion_matrix(y_true, y_pred)
cnf_matrix

array([[ 34,   0,  35],
       [  1,   0,  22],
       [ 26,   0, 468]])