# Init Spark 

In [1]:
import findspark

# Initialise findspark before pyspark is imported (ordering kept from the original cell).
findspark.init()

from pyspark.sql import SparkSession

# Build the session for this notebook, or pick up one that already exists,
# registered under the application name "Regression".
spark = (
    SparkSession.builder
    .appName("Regression")
    .getOrCreate()
)

# Get Data 

Source: [UCI Machine Learning Repository – Real estate valuation data set](https://archive.ics.uci.edu/ml/datasets/Real+estate+valuation+data+set)

In [2]:
!wget https://raw.githubusercontent.com/subashgandyer/datasets/main/Real%20estate.csv -O real_estate.csv

--2022-04-20 15:58:04--  https://raw.githubusercontent.com/subashgandyer/datasets/main/Real%20estate.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21968 (21K) [text/plain]
Saving to: ‘real_estate.csv’


2022-04-20 15:58:04 (27.8 MB/s) - ‘real_estate.csv’ saved [21968/21968]



# Path Setup 

In [3]:
import os

# Working directory of the kernel; used later to build the file:// URI of the CSV.
path = os.getcwd()

print(path)

/home/hadoop/BDLC_FS22/V9


# Read Data
Inspired by [anujsyal.com](https://anujsyal.com/introduction-to-pyspark-ml-lib-build-your-first-linear-regression-model)

In [4]:
# Explicit file:// URI pointing at the CSV in the current working directory.
csv_uri = f"file://{path}/real_estate.csv"

# The first row holds the column names; column types are inferred from the data.
real_estate = spark.read.csv(csv_uri, header=True, inferSchema=True)

                                                                                

# Check Data 

In [5]:
# Peek at the first three rows to sanity-check the load.
real_estate.show(n=3)

[Stage 2:>                                                          (0 + 1) / 1]

+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
| No|X1 transaction date|X2 house age|X3 distance to the nearest MRT station|X4 number of convenience stores|X5 latitude|X6 longitude|Y house price of unit area|
+---+-------------------+------------+--------------------------------------+-------------------------------+-----------+------------+--------------------------+
|  1|           2012.917|        32.0|                              84.87882|                             10|   24.98298|   121.54024|                      37.9|
|  2|           2012.917|        19.5|                              306.5947|                              9|   24.98034|   121.53951|                      42.2|
|  3|           2013.583|        13.3|                              561.9845|                              5|   24.98746|   121.54391|                      47.3|
+---+-------------------+---

                                                                                

Attribute Information:

The inputs are as follows:
- X1 = the transaction date (for example, 2013.250 = March 2013, 2013.500 = June 2013, etc.)
- X2 = the house age (unit: year)
- X3 = the distance to the nearest MRT station (unit: meter)
- X4 = the number of convenience stores in the living circle on foot (integer)
- X5 = the geographic coordinate, latitude (unit: degree)
- X6 = the geographic coordinate, longitude (unit: degree)

The output is as follows:
- Y = house price of unit area (10000 New Taiwan Dollar/Ping, where Ping is a local unit, 1 Ping = 3.3 square meters)

In [6]:
# Print the column names and the types Spark inferred while reading the CSV.
real_estate.printSchema()

root
 |-- No: integer (nullable = true)
 |-- X1 transaction date: double (nullable = true)
 |-- X2 house age: double (nullable = true)
 |-- X3 distance to the nearest MRT station: double (nullable = true)
 |-- X4 number of convenience stores: integer (nullable = true)
 |-- X5 latitude: double (nullable = true)
 |-- X6 longitude: double (nullable = true)
 |-- Y house price of unit area: double (nullable = true)



# Features

In [7]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns that get packed into the single 'features' vector column.
# Note: 'X5 latitude' and 'X6 longitude' are not part of this list.
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Original columns are kept; the assembled vector is added as 'features'.
data_set = assembler.transform(real_estate)

In [8]:
# Show the assembled feature vector next to the target, without truncating cell contents.
data_set.select('features', 'Y house price of unit area').show(n=2, truncate=False)

[Stage 3:>                                                          (0 + 1) / 1]

+-----------------------------+--------------------------+
|features                     |Y house price of unit area|
+-----------------------------+--------------------------+
|[2012.917,32.0,84.87882,10.0]|37.9                      |
|[2012.917,19.5,306.5947,9.0] |42.2                      |
+-----------------------------+--------------------------+
only showing top 2 rows



                                                                                

In [9]:
# 70/30 train/test split. The seed is fixed so that the split, and with it the
# row counts, fitted coefficients and test metrics, is repeatable across
# Restart & Run All instead of changing on every execution.
(train_data, test_data) = data_set.randomSplit([0.7, 0.3], seed=42)

In [10]:
# Number of rows that ended up in the training split.
train_data.count()

                                                                                

295

In [11]:
# Number of rows that ended up in the test split.
test_data.count()

                                                                                

119

# Linear Regression

In [12]:
from pyspark.ml.regression import LinearRegression

# Elastic-net regularised linear regression on the target column.
# featuresCol is left at its default, which matches the assembler's 'features' output.
lr = LinearRegression(
    labelCol='Y house price of unit area',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.5,
)

In [13]:
# Human-readable listing of every estimator parameter with its default and current value.
params_description = lr.explainParams()
print(params_description)

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.5)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: Y house price of unit area)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max

In [14]:
# Fit the estimator on the training split only; the test split stays held out.
lr_model = lr.fit(dataset=train_data)

                                                                                

In [15]:
# Fitted weights, one per assembled input column (X1..X4, in the assembler's inputCols order).
lr_model.coefficients

DenseVector([6.0436, -0.2397, -0.0057, 0.9647])

In [18]:
# Score the fitted model on the held-out test split.
test_stats = lr_model.evaluate(test_data)

rmse = test_stats.rootMeanSquaredError
mse = test_stats.meanSquaredError

print(f"RMSE: {rmse}")
print(f"MSE: {mse}")

RMSE: 9.089567758621191
MSE: 82.62024203856588


In [None]:
# The original cell held only the undefined name `test`, which raises NameError
# on Run All. Show the R² on the held-out split from the evaluation summary instead.
test_stats.r2

In [55]:
# Add the model's 'prediction' column to the test rows.
lr_predictions = lr_model.transform(test_data)

# Compare predicted and actual prices for the first five test rows.
lr_predictions.select(["prediction", "Y house price of unit area"]).show(n=5)

+------------------+--------------------------+
|        prediction|Y house price of unit area|
+------------------+--------------------------+
| 37.40686513290348|                      40.3|
| 45.54258544502591|                      46.7|
|26.816375760630763|                      23.8|
| 40.46145879786127|                      34.3|
| 37.53515381587931|                      50.5|
+------------------+--------------------------+
only showing top 5 rows



# Stop Spark 

In [56]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()