<a href="https://colab.research.google.com/github/dingjun6953/Accelerate-Python-codes-using-proper-libraries/blob/main/PySpark_ML_MultiClassifiers_Dingjun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=a39f8052b9bb2bf9ba6999affb5804745e07650b76cf3504fd3615f05c9e8d4d
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

This project is to investigate the performances of multiclass classifiers from PySpark ML library via iris dataset and make performance comparisons among them. 
Date: 2023/03/18
Author: Dingjun Chen

In [34]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_iris
import pandas as pd
     

In [35]:
iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris['label'] = pd.Series(iris.target)
 
print(df_iris.head())
print(df_iris.shape)

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   label  
0      0  
1      0  
2      0  
3      0  
4      0  
(150, 5)


In [36]:
df_iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [37]:
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   label              150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [38]:
spark=SparkSession.builder.appName('Pyspark_ML_MultiClassifiers_Dingjun').getOrCreate()
sqlContext = SQLContext(spark)

data = sqlContext.createDataFrame(df_iris)
print(data.printSchema())
 

root
 |-- sepal length (cm): double (nullable = true)
 |-- sepal width (cm): double (nullable = true)
 |-- petal length (cm): double (nullable = true)
 |-- petal width (cm): double (nullable = true)
 |-- label: long (nullable = true)

None




In [39]:
features = iris.feature_names
va = VectorAssembler(inputCols=features, outputCol='features')
va_df = va.transform(data)
va_df = va_df.select(['features', 'label'])
va_df.show(5)
     

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|    0|
|[4.9,3.0,1.4,0.2]|    0|
|[4.7,3.2,1.3,0.2]|    0|
|[4.6,3.1,1.5,0.2]|    0|
|[5.0,3.6,1.4,0.2]|    0|
+-----------------+-----+
only showing top 5 rows



In [40]:
(train, test) = va_df.randomSplit([0.8, 0.2])


**1. Random Forest classifier**





In [41]:

rfc = RandomForestClassifier(featuresCol="features", labelCol="label")
rfc = rfc.fit(train)
pred = rfc.transform(test)
pred.show(5) 


+-----------------+-----+--------------+-------------+----------+
|         features|label| rawPrediction|  probability|prediction|
+-----------------+-----+--------------+-------------+----------+
|[4.4,3.0,1.3,0.2]|    0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.5,2.3,1.3,0.3]|    0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.4,1.4,0.3]|    0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.6,1.0,0.2]|    0|[20.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.9,0.2]|    0|[18.0,2.0,0.0]|[0.9,0.1,0.0]|       0.0|
+-----------------+-----+--------------+-------------+----------+
only showing top 5 rows



In [43]:
evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)
y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Prediction Accuracy:  0.9598997493734336
Confusion Matrix:
[[ 5  0  0]
 [ 0  9  1]
 [ 0  0 10]]


**2. Naive Bayes Classifier**

In [44]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
nb = nb.fit(train)
pred = nb.transform(test)
pred.show(5) 

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,3.0,1.3,0.2]|    0|[-10.647581428227...|[0.70356094834989...|       0.0|
|[4.5,2.3,1.3,0.3]|    0|[-10.322811114830...|[0.58143942740786...|       0.0|
|[4.6,3.4,1.4,0.3]|    0|[-11.782137602771...|[0.71732290275700...|       0.0|
|[4.6,3.6,1.0,0.2]|    0|[-10.861436942224...|[0.81573696669031...|       0.0|
|[4.8,3.4,1.9,0.2]|    0|[-12.525849578605...|[0.67474221025610...|       0.0|
+-----------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [45]:
evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)
y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Prediction Accuracy:  0.9598997493734336
Confusion Matrix:
[[ 5  0  0]
 [ 0  9  1]
 [ 0  0 10]]


**3. Decision Tree Classifier**

In [46]:
dtc = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dtc = dtc.fit(train)
pred = dtc.transform(test)
pred.show(5)

+-----------------+-----+--------------+-------------+----------+
|         features|label| rawPrediction|  probability|prediction|
+-----------------+-----+--------------+-------------+----------+
|[4.4,3.0,1.3,0.2]|    0|[45.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.5,2.3,1.3,0.3]|    0|[45.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.4,1.4,0.3]|    0|[45.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.6,3.6,1.0,0.2]|    0|[45.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.9,0.2]|    0|[45.0,0.0,0.0]|[1.0,0.0,0.0]|       0.0|
+-----------------+-----+--------------+-------------+----------+
only showing top 5 rows



In [47]:
evaluator=MulticlassClassificationEvaluator(predictionCol="prediction")
acc = evaluator.evaluate(pred)
print("Prediction Accuracy: ", acc)
y_pred=pred.select("prediction").collect()
y_orig=pred.select("label").collect()
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:")
print(cm)

Prediction Accuracy:  0.9199999999999999
Confusion Matrix:
[[5 0 0]
 [0 9 1]
 [0 1 9]]
