In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=8efa6dc2717a98447d4cc87449a0f26342567b587b8d169c90612b60da605a96
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


# ***Decision Tree Classifier***

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import accuracy_score
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from sklearn.metrics import accuracy_score
from pyspark.sql import SparkSession
import pandas as pd

In [None]:
# Obtain the SparkSession for this notebook (getOrCreate reuses one if it already exists).
spark = (
    SparkSession.builder
    .appName('ml-bank')
    .getOrCreate()
)

# Read the CSV using its first row as column names and let Spark infer the column types.
df = spark.read.csv('genders.csv', inferSchema=True, header=True)

# Remember the original column names; the featurization cell appends them to its select list.
cols = df.columns
df.printSchema()

root
 |-- long_hair: integer (nullable = true)
 |-- forehead_width_cm: double (nullable = true)
 |-- forehead_height_cm: double (nullable = true)
 |-- nose_wide: integer (nullable = true)
 |-- nose_long: integer (nullable = true)
 |-- lips_thin: integer (nullable = true)
 |-- distance_nose_to_lip_long: integer (nullable = true)
 |-- gender: string (nullable = true)



In [None]:
# Encode the string label column `gender` as the numeric column `gender_index`.
gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")

# Make the cell re-runnable: StringIndexer refuses to write to an output column that
# already exists, so discard a `gender_index` left over from a previous execution.
# DataFrame.drop is a no-op when the column is absent, so a fresh run is unaffected.
df = df.drop("gender_index")
df = gender_indexer.fit(df).transform(df)
df.show()

+------+--------------------+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+------------+
|gender|      feature_vector|long_hair|forehead_width_cm|forehead_height_cm|nose_wide|nose_long|lips_thin|distance_nose_to_lip_long|gender|gender_index|
+------+--------------------+---------+-----------------+------------------+---------+---------+---------+-------------------------+------+------------+
|  Male|[1.0,11.8,6.1,1.0...|        1|             11.8|               6.1|        1|        0|        1|                        1|  Male|         1.0|
|Female|(7,[1,2,5],[14.0,...|        0|             14.0|               5.4|        0|        0|        1|                        0|Female|         0.0|
|  Male|[0.0,11.8,6.3,1.0...|        0|             11.8|               6.3|        1|        1|        1|                        1|  Male|         1.0|
|  Male|[0.0,14.4,6.1,0.0...|        0|             14.4|               6.1|      


*   **Input variables: long_hair, forehead_width_cm, forehead_height_cm, nose_wide, nose_long, lips_thin, distance_nose_to_lip_long**
*   **Output variable: gender**


In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Predictor columns that get packed into a single vector column.
numericCols = [
    'long_hair',
    'forehead_width_cm',
    'forehead_height_cm',
    'nose_wide',
    'nose_long',
    'lips_thin',
    'distance_nose_to_lip_long',
]

# Single-stage pipeline: assemble the predictors into the "feature" vector column.
featurizationPipeline = Pipeline(
    stages=[
        VectorAssembler(outputCol="feature", inputCols=numericCols),
    ]
)


In [None]:
from pyspark.ml import Pipeline

# Make the cell re-runnable: VectorAssembler raises if its output column already
# exists, so discard a `feature` column left over from a previous execution.
# DataFrame.drop is a no-op when the column is absent, so a fresh run is unaffected.
df = df.drop("feature")

# Fit the featurization pipeline and add the assembled `feature` vector column.
pipelineModel = featurizationPipeline.fit(df)
df = pipelineModel.transform(df)

# Put the label index and feature vector first, followed by the original CSV columns.
selectedCols = ['gender_index', 'feature'] + cols
df = df.select(selectedCols)
df.printSchema()

root
 |-- gender_index: double (nullable = false)
 |-- feature: vector (nullable = true)
 |-- long_hair: integer (nullable = true)
 |-- forehead_width_cm: double (nullable = true)
 |-- forehead_height_cm: double (nullable = true)
 |-- nose_wide: integer (nullable = true)
 |-- nose_long: integer (nullable = true)
 |-- lips_thin: integer (nullable = true)
 |-- distance_nose_to_lip_long: integer (nullable = true)
 |-- gender: string (nullable = true)



In [None]:
# Seeded random split with weights 0.7 / 0.3 into training and test sets.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=2018)

# Report how many rows landed in each split.
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")

Training Dataset Count: 3539
Test Dataset Count: 1462


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree limited to depth 3, trained on the assembled `feature` vector
# against the indexed gender label.
dt = DecisionTreeClassifier(
    labelCol='gender_index',
    featuresCol='feature',
    maxDepth=3,
)
dtModel = dt.fit(train)

# Score the test split and preview ten rows of inputs alongside the model outputs.
predictions = dtModel.transform(test)
predictions.select(
    'long_hair', 'forehead_width_cm', 'gender',
    'rawPrediction', 'prediction', 'probability',
).show(10)

+---------+-----------------+------+-------------+----------+--------------------+
|long_hair|forehead_width_cm|gender|rawPrediction|prediction|         probability|
+---------+-----------------+------+-------------+----------+--------------------+
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|        1|             11.4|Female|[1373.0,20.0]|       0.0|[0.98564249820531...|
|   

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Single-column views of the labels and predictions; kept under their existing names
# in case other cells reference them.
true_labels = predictions.select('gender_index')
dt_predictions = predictions.select('prediction')

# Compute accuracy inside Spark, comparing label and prediction taken from the same row.
# This avoids collecting the two columns to the driver with separate toPandas() calls
# and pairing them purely by position, which relies on both jobs returning rows in the
# same order.
evaluator = MulticlassClassificationEvaluator(
    labelCol='gender_index',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(predictions)
print("Decision Tree Accuracy =",accuracy*100,"%")

Decision Tree Accuracy = 96.85362517099864 %
