In [0]:
#!apt-get update -q
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://us.mirrors.quenda.co/apache/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Tell downstream tools where Java 8 and the unpacked Spark 2.4.4
# distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
})

In [0]:
import findspark
# Make the `pyspark` package importable from this kernel. Presumably the Spark
# install is located through the SPARK_HOME environment variable -- confirm
# against the findspark documentation.
findspark.init()

In [0]:
import pyspark
from pyspark.sql import SparkSession

# Start a Spark session (or reuse the one already running) with the
# "local[*]" master, i.e. inside this machine using all available cores.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [0]:
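# Disabled alternative to the manual upload in the next cell.
# NOTE: a github.com/.../blob/... URL returns an HTML page, not the CSV itself;
# switch to the raw.githubusercontent.com form of the URL before re-enabling.
#!wget https://github.com/asifahmed90/pyspark-ML-in-Colab/blob/master/BostonHousing.csv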

In [0]:
from google.colab import files

# Open the browser upload dialog; pick BostonHousing.csv when prompted.
# The result is bound to a name because a bare `files.upload()` as the last
# expression would echo the returned dict -- including the raw bytes of every
# uploaded file -- into the cell output.
uploaded = files.upload()
# Show only the names of what was uploaded.
print("Uploaded:", ", ".join(uploaded))

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Read the CSV: the first row supplies the column names and the column
# types are inferred from the data.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
dataset = csv_reader.csv('BostonHousing.csv')

In [0]:
# Print the column names and inferred types of the loaded DataFrame.
dataset.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



# **Combine all the feature columns into a single vector column; the new column is named ‘Attributes’ through the `outputCol` argument.**

In [0]:
# Columns fed to the model as inputs ('medv' is excluded: it is the target).
feature_columns = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]
# Pack the individual feature columns into one vector column.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Attributes')
output = assembler.transform(dataset)
# Keep only the model input (feature vector) and the model output (target).
finalized_data = output.select("Attributes", "medv")
finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



# **‘Attributes’ holds the input features assembled from all the feature columns, and ‘medv’ is the target column.**

# **Split the dataset into training and testing sets (80% and 20% in this case).**

In [0]:
# Hold out roughly 20% of the rows for testing. The seed is fixed so that the
# split -- and therefore every coefficient and metric computed from it -- is
# repeatable across runs instead of changing on each execution.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# Unfitted estimator, kept under its own name so it is not confused with the
# fitted model below.
linear_regression = LinearRegression(featuresCol='Attributes', labelCol='medv')
# Fit on the training set; `regressor` is the fitted model.
regressor = linear_regression.fit(train_data)
# Evaluate the fitted model on the held-out test set; the returned summary
# exposes the per-row predictions as `pred.predictions`.
pred = regressor.evaluate(test_data)
# Show the test rows next to their predicted values.
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.01301,35.0,1.5...|32.7|30.109213599459387|
|[0.01709,90.0,2.0...|30.1|24.947354543080586|
|[0.02009,95.0,2.6...|50.0| 42.89507153637031|
|[0.02763,75.0,2.9...|30.8| 31.39574797988461|
|[0.03306,0.0,5.19...|20.6|22.272808845165976|
|[0.03768,80.0,1.5...|34.6| 34.38793486612563|
|[0.04294,28.0,15....|20.6|26.945114542397743|
|[0.04417,70.0,2.2...|24.8|30.457052300924275|
|[0.0456,0.0,13.89...|23.3| 26.88999111040776|
|[0.04932,33.0,2.1...|28.2|  32.4354142632953|
|[0.04981,21.0,5.6...|23.4|24.109716942832236|
|[0.05083,0.0,5.19...|22.2|  22.2055569327958|
|[0.0536,21.0,5.64...|25.0|27.712614871824613|
|[0.05602,0.0,2.46...|50.0| 35.76739058710187|
|[0.0566,0.0,3.41,...|23.6| 30.50966232446786|
|[0.05735,0.0,4.49...|26.6|27.777536454184123|
|[0.06047,0.0,2.46...|29.6|24.613131394848747|
|[0.07022,0.0,4.05...|23.2| 25.66424119472355|
|[0.07244,60.

# **Print the coefficients and the intercept of the regression model with the following commands:**

In [0]:
# Coefficients of the fitted model
coeff = regressor.coefficients
# Intercept of the fitted model
intr = regressor.intercept
# Operands are wrapped in one-element tuples so each value is always treated
# as a single formatting argument.
print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.088, 0.0446, 0.0305, 2.9166, -17.5738, 3.6781, -0.0052, -1.5153, 0.2857, -0.0118, -0.9381, 0.0082, -0.5444])
The Intercept of the model is : 37.908562


# **Evaluate the model statistically with the `RegressionEvaluator` class from PySpark.**

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so the builtin eval() is not shadowed
# for the remainder of the kernel session.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")
# Root Mean Square Error (the evaluator's configured metric)
rmse = evaluator.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)
# Mean Square Error -- the param map overrides the metric for this call only
mse = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)
# Mean Absolute Error
mae = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)
# r2 - coefficient of determination
r2 = evaluator.evaluate(pred.predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 4.710
MSE: 22.187
MAE: 3.096
r2: 0.749
