# BDA - Project SPark MLlib Vs Mahout performance comparision

### Submitted by:
#### Name      : Muhammad Amin Ghias
#### ERP ID    : 25366

Date : 2nd June 2022

(on Filtered reduced dataset for comparrision with Mahout)

In [2]:
!pip install pyspark



In [3]:
import os
import time
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

from matplotlib import rcParams
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [6]:
rnd_seed=23
np.random.seed=rnd_seed
np.random.set_state=rnd_seed

In [7]:
spark = SparkSession.builder.master("local[2]").appName("Linear-Regression-Gas-Sensor").getOrCreate()

In [8]:
spark

In [9]:
sqlContext = SQLContext(spark.sparkContext)
sqlContext



<pyspark.sql.context.SQLContext at 0x7fd4a63cd910>

In [10]:
df = spark.read.csv('gsd_train_7.csv',header=True)

In [11]:
type(df)

pyspark.sql.dataframe.DataFrame

In [12]:
df.count()

192158

In [13]:
df.show(2)

+---------+---------+----------+----------+----------+----------+----------+--------+
|R8 (MOhm)|R9 (MOhm)|R10 (MOhm)|R11 (MOhm)|R12 (MOhm)|R13 (MOhm)|R14 (MOhm)|CO (ppm)|
+---------+---------+----------+----------+----------+----------+----------+--------+
|  28.6584|  23.6376|   25.6229|   26.5903|   25.8244|   21.4715|   28.2509|   15.56|
|  41.2509|  47.3693|   50.0936|   52.0519|   48.1165|   45.7841|   53.4763|    4.44|
+---------+---------+----------+----------+----------+----------+----------+--------+
only showing top 2 rows



In [14]:
df.printSchema()

root
 |-- R8 (MOhm): string (nullable = true)
 |-- R9 (MOhm): string (nullable = true)
 |-- R10 (MOhm): string (nullable = true)
 |-- R11 (MOhm): string (nullable = true)
 |-- R12 (MOhm): string (nullable = true)
 |-- R13 (MOhm): string (nullable = true)
 |-- R14 (MOhm): string (nullable = true)
 |-- CO (ppm): string (nullable = true)



In [15]:
df.columns

['R8 (MOhm)',
 'R9 (MOhm)',
 'R10 (MOhm)',
 'R11 (MOhm)',
 'R12 (MOhm)',
 'R13 (MOhm)',
 'R14 (MOhm)',
 'CO (ppm)']

In [16]:
df = df.withColumn("R8 (MOhm)",col("R8 (MOhm)").cast( DoubleType()))
df = df.withColumn("R9 (MOhm)",col("R9 (MOhm)").cast( DoubleType()))
df = df.withColumn("R10 (MOhm)",col("R10 (MOhm)").cast( DoubleType()))
df = df.withColumn("R11 (MOhm)",col("R11 (MOhm)").cast( DoubleType()))
df = df.withColumn("R12 (MOhm)",col("R12 (MOhm)").cast( DoubleType()))
df = df.withColumn("R13 (MOhm)",col("R13 (MOhm)").cast( DoubleType()))
df = df.withColumn("R14 (MOhm)",col("R14 (MOhm)").cast( DoubleType()))
df = df.withColumn("CO (ppm)",col("CO (ppm)").cast( DoubleType()))


In [17]:
df.printSchema()

root
 |-- R8 (MOhm): double (nullable = true)
 |-- R9 (MOhm): double (nullable = true)
 |-- R10 (MOhm): double (nullable = true)
 |-- R11 (MOhm): double (nullable = true)
 |-- R12 (MOhm): double (nullable = true)
 |-- R13 (MOhm): double (nullable = true)
 |-- R14 (MOhm): double (nullable = true)
 |-- CO (ppm): double (nullable = true)



In [18]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler_train = VectorAssembler(inputCols = [ 'R8 (MOhm)', 'R9 (MOhm)','R10 (MOhm)','R11 (MOhm)',
                                                'R12 (MOhm)', 'R13 (MOhm)', 'R14 (MOhm)'], outputCol = 'features')


data = vectorAssembler_train.transform(df)

data = data.withColumn('label', col('CO (ppm)'))

data = data.select(['features', 'label'])
data.show(3)


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[28.6584,23.6376,...|15.56|
|[41.2509,47.3693,...| 4.44|
|[28.7735,23.1497,...|13.33|
+--------------------+-----+
only showing top 3 rows



In [19]:
train_df = data

In [20]:
from pyspark.ml.regression import LinearRegression
lr_f = LinearRegression(featuresCol = 'features', labelCol='label', predictionCol='pred_CO (ppm)', 
                        maxIter=10, regParam=0.0, solver="normal", standardization=False)
start = time.perf_counter()
lr_modelf = lr_f.fit(train_df)
end = time.perf_counter()
duration_fit = format((end-start),'.4f')

print("Model Fitting Time Duration - {}".format(duration_fit))
print("Coefficients: " + str(lr_modelf.coefficients))
print("Intercept: " + str(lr_modelf.intercept))

start = time.perf_counter()
yfpredictions = lr_modelf.transform(train_df)
end = time.perf_counter()
duration_pred = format((end-start),'.4f')

trainingSummary = lr_modelf.summary

print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)
print("Model Fitting Time Duration - {}".format(duration_fit))
print("Model predicting Time Duration - {}".format(duration_pred))

Model Fitting Time Duration - 2.4237
Coefficients: [0.30081264162524546,0.3993613747375286,-0.882862902992174,0.08704524840469419,0.7670340312456152,-0.6808103928106293,-0.12107128035192033]
Intercept: 11.878724666034875
numIterations: 0
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
|  4.1576698489674495|
|   1.667667169619219|
|  2.8199489909088644|
|  1.4753256737263634|
|  -4.987259711766912|
|   2.538538703874625|
|  -4.127314611269143|
|-0.24152559280603292|
|  -5.186313973167703|
| -11.855827895995214|
|  -2.944930260898623|
|    8.14396134836217|
| -2.9117204617391064|
|  1.4774750647391883|
|   5.238772616964059|
|  1.7282566444707843|
|  -7.414802331076708|
|  -5.183093367308658|
|  5.9160729909229435|
|     5.9282411825947|
+--------------------+
only showing top 20 rows

RMSE: 4.689602
r2: 0.467559
Model Fitting Time Duration - 2.4237
Model predicting Time Duration - 0.0358




In [21]:
yfpredandlabels = yfpredictions.select('pred_CO (ppm)', 'label')

y_pred_f = yfpredandlabels.toPandas()

y_pred_f.to_csv("ylRf_Mllib_c.csv",index=False)

In [22]:
y_pred_f.head()

Unnamed: 0,pred_CO (ppm),label
0,11.40233,15.56
1,2.772333,4.44
2,10.510051,13.33
3,9.634674,11.11
4,11.65726,6.67


### Using the RegressionEvaluator from pyspark.ml package:

In [23]:
evaluator = RegressionEvaluator(predictionCol='pred_CO (ppm)', labelCol='label', metricName='rmse')
print("RMSE: {0}".format(evaluator.evaluate(yfpredandlabels)))

RMSE: 4.68960181346083


In [24]:
evaluator = RegressionEvaluator(predictionCol='pred_CO (ppm)', labelCol='label', metricName='mae')
print("MAE: {0}".format(evaluator.evaluate(yfpredandlabels)))

MAE: 3.5839476762586693


In [25]:
evaluator = RegressionEvaluator(predictionCol='pred_CO (ppm)', labelCol='label', metricName='r2')
print("R2: {0}".format(evaluator.evaluate(yfpredandlabels)))

R2: 0.4675586106094035


### Using the RegressionMetrics from pyspark.mllib package:

In [26]:
# mllib is old so the methods are available in rdd
metrics = RegressionMetrics(yfpredandlabels.rdd)



In [27]:
print("RMSE: {0}".format(metrics.rootMeanSquaredError))

RMSE: 4.68960181346083


In [28]:
print("MAE: {0}".format(metrics.meanAbsoluteError))

MAE: 3.5839476762586693


In [29]:
print("R2: {0}".format(metrics.r2))

R2: 0.4675586106094035
