In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
sc

In [7]:
from pyspark.mllib.classification import LogisticRegressionModel,LogisticRegressionWithLBFGS, SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.clustering import *

In [8]:
from pyspark.sql import SparkSession
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import LDA
import numpy as np

In [9]:
## Read and convert the data into a spark dataframe
path = "/content/drive/MyDrive/lab3_data.txt"
df = spark.read.format("libsvm").load(path)
df.collect()

[Row(label=0.0, features=SparseVector(4, {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.9, 1: 3.0, 2: 1.4, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.7, 1: 3.2, 2: 1.3, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.6, 1: 3.1, 2: 1.5, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 5.0, 1: 3.6, 2: 1.4, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 5.4, 1: 3.9, 2: 1.7, 3: 0.4})),
 Row(label=0.0, features=SparseVector(4, {0: 4.6, 1: 3.4, 2: 1.4, 3: 0.3})),
 Row(label=0.0, features=SparseVector(4, {0: 5.0, 1: 3.4, 2: 1.5, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.4, 1: 2.9, 2: 1.4, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.9, 1: 3.1, 2: 1.5, 3: 0.1})),
 Row(label=0.0, features=SparseVector(4, {0: 5.4, 1: 3.7, 2: 1.5, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.8, 1: 3.4, 2: 1.6, 3: 0.2})),
 Row(label=0.0, features=SparseVector(4, {0: 4.8, 1: 3.0, 2: 1.4, 3: 0.1})),

In [10]:
##Show the schema
df.printSchema()


root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
##Show the label statistics
features = df.select("features")
features.collect()

[Row(features=SparseVector(4, {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.9, 1: 3.0, 2: 1.4, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.7, 1: 3.2, 2: 1.3, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.6, 1: 3.1, 2: 1.5, 3: 0.2})),
 Row(features=SparseVector(4, {0: 5.0, 1: 3.6, 2: 1.4, 3: 0.2})),
 Row(features=SparseVector(4, {0: 5.4, 1: 3.9, 2: 1.7, 3: 0.4})),
 Row(features=SparseVector(4, {0: 4.6, 1: 3.4, 2: 1.4, 3: 0.3})),
 Row(features=SparseVector(4, {0: 5.0, 1: 3.4, 2: 1.5, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.4, 1: 2.9, 2: 1.4, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.9, 1: 3.1, 2: 1.5, 3: 0.1})),
 Row(features=SparseVector(4, {0: 5.4, 1: 3.7, 2: 1.5, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.8, 1: 3.4, 2: 1.6, 3: 0.2})),
 Row(features=SparseVector(4, {0: 4.8, 1: 3.0, 2: 1.4, 3: 0.1})),
 Row(features=SparseVector(4, {0: 4.3, 1: 3.0, 2: 1.1, 3: 0.1})),
 Row(features=SparseVector(4, {0: 5.8, 1: 4.0, 2: 1.2, 3: 0.2})),
 Row(featu

In [12]:
df.count()

150

In [13]:
df.describe("label").show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               150|
|   mean|               1.0|
| stddev|0.8192319205190406|
|    min|               0.0|
|    max|               2.0|
+-------+------------------+



In [14]:
labels = df.select("label")
labels.collect()

[Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=0.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0),
 Row(label=1.0

In [16]:
from pyspark.ml.classification import LogisticRegression
logr = LogisticRegression(featuresCol='features', labelCol='label')

In [18]:
##Train a classifier on this data
model=logr.fit(df)

In [19]:
df.select('features').first()

Row(features=SparseVector(4, {0: 5.1, 1: 3.5, 2: 1.4, 3: 0.2}))

In [20]:
model.transform(df.select('features')).show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(4,[0,1,2,3],[5.1...|[137.492999312864...|[1.0,6.8703342077...|       0.0|
|(4,[0,1,2,3],[4.9...|[108.386676092253...|[1.0,9.2388692799...|       0.0|
|(4,[0,1,2,3],[4.7...|[126.731322172062...|[1.0,2.5292525429...|       0.0|
|(4,[0,1,2,3],[4.6...|[117.480609999370...|[1.0,6.6247907786...|       0.0|
|(4,[0,1,2,3],[5.0...|[145.566635001565...|[1.0,4.6667736631...|       0.0|
|(4,[0,1,2,3],[5.4...|[142.647448094925...|[1.0,6.4806036248...|       0.0|
|(4,[0,1,2,3],[4.6...|[134.429325065329...|[1.0,1.0543175194...|       0.0|
|(4,[0,1,2,3],[5.0...|[130.439661842581...|[1.0,1.0677103675...|       0.0|
|(4,[0,1,2,3],[4.4...|[109.966059166030...|[1.0,3.3420291324...|       0.0|
|(4,[0,1,2,3],[4.9...|[117.297157939444...|[1.0,3.1503214847...|       0.0|
|(4,[0,1,2,3

In [21]:
df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(4,[0,1,2,3],[5.1...|
|  0.0|(4,[0,1,2,3],[4.9...|
|  0.0|(4,[0,1,2,3],[4.7...|
|  0.0|(4,[0,1,2,3],[4.6...|
|  0.0|(4,[0,1,2,3],[5.0...|
|  0.0|(4,[0,1,2,3],[5.4...|
|  0.0|(4,[0,1,2,3],[4.6...|
|  0.0|(4,[0,1,2,3],[5.0...|
|  0.0|(4,[0,1,2,3],[4.4...|
|  0.0|(4,[0,1,2,3],[4.9...|
|  0.0|(4,[0,1,2,3],[5.4...|
|  0.0|(4,[0,1,2,3],[4.8...|
|  0.0|(4,[0,1,2,3],[4.8...|
|  0.0|(4,[0,1,2,3],[4.3...|
|  0.0|(4,[0,1,2,3],[5.8...|
|  0.0|(4,[0,1,2,3],[5.7...|
|  0.0|(4,[0,1,2,3],[5.4...|
|  0.0|(4,[0,1,2,3],[5.1...|
|  0.0|(4,[0,1,2,3],[5.7...|
|  0.0|(4,[0,1,2,3],[5.1...|
+-----+--------------------+
only showing top 20 rows



In [24]:
predictions = model.transform(df)

In [35]:
##Report the performance of the classifier training data
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [26]:
predictions.select("label", "prediction", "probability").show()

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[1.0,6.8703342077...|
|  0.0|       0.0|[1.0,9.2388692799...|
|  0.0|       0.0|[1.0,2.5292525429...|
|  0.0|       0.0|[1.0,6.6247907786...|
|  0.0|       0.0|[1.0,4.6667736631...|
|  0.0|       0.0|[1.0,6.4806036248...|
|  0.0|       0.0|[1.0,1.0543175194...|
|  0.0|       0.0|[1.0,1.0677103675...|
|  0.0|       0.0|[1.0,3.3420291324...|
|  0.0|       0.0|[1.0,3.1503214847...|
|  0.0|       0.0|[1.0,1.7208172560...|
|  0.0|       0.0|[1.0,1.1271142229...|
|  0.0|       0.0|[1.0,1.7235413977...|
|  0.0|       0.0|[1.0,1.1177220733...|
|  0.0|       0.0|[1.0,5.7929918489...|
|  0.0|       0.0|[1.0,2.1436836810...|
|  0.0|       0.0|[1.0,8.0315238956...|
|  0.0|       0.0|[1.0,2.9146070713...|
|  0.0|       0.0|[1.0,3.5905777894...|
|  0.0|       0.0|[1.0,3.1058885787...|
+-----+----------+--------------------+
only showing top 20 rows



In [33]:
multiclass_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = multiclass_evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9866666666666667


In [36]:
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)



In [49]:
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
precision = precision_evaluator.evaluate(predictions)
print('Precision: ', precision)

Precision:  0.9866666666666666


In [50]:
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
Recall = recall_evaluator.evaluate(predictions)
print('Recall: ', Recall)

Recall:  0.9866666666666666


In [51]:
f1_evaluator = MulticlassClassificationEvaluator(metricName="f1")
f1_score = f1_evaluator.evaluate(predictions)
print('f1 score:', f1_score)

f1 score: 0.9866666666666666
