# Init Spark 

In [None]:
# findspark.init() is called before the pyspark import so that the local
# Spark installation is importable from this kernel.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise start a new
# session registered under the application name "Regression".
spark = (
    SparkSession.builder
    .appName("Regression")
    .getOrCreate()
)

# Get Data 

Dataset source: [UCI Machine Learning Repository – Real estate valuation data set](https://archive.ics.uci.edu/ml/datasets/Real+estate+valuation+data+set)

In [None]:
# Download the CSV and save it as real_estate.csv (-O) in the current working directory.
!wget https://raw.githubusercontent.com/subashgandyer/datasets/main/Real%20estate.csv -O real_estate.csv

# Path Setup 

In [None]:
import os

# Directory the kernel is running in; later cells build the CSV location from it.
path = os.getcwd()

# Echo the directory so the reader can confirm where the file is expected.
print(path)

# Read Data
Inspired by [anujsyal.com](https://anujsyal.com/introduction-to-pyspark-ml-lib-build-your-first-linear-regression-model)

In [None]:
# Address the downloaded file with an explicit file:// URI built from the
# working directory, then load it using the first row as column names and
# letting Spark infer the column types.
csv_uri = f"file://{path}/real_estate.csv"
real_estate = spark.read.csv(csv_uri, header=True, inferSchema=True)

# Check Data 

In [None]:
# Peek at the first three rows only, rather than dumping the whole table.
real_estate.show(n=3)

Attribute Information:

The inputs are as follows
- X1 = the transaction date (for example, 2013.250 = 2013 March, 2013.500 = 2013 June, etc.)
- X2 = the house age (unit: year)
- X3 = the distance to the nearest MRT station (unit: meter)
- X4 = the number of convenience stores in the living circle on foot (integer)
- X5 = the geographic coordinate, latitude (unit: degree)
- X6 = the geographic coordinate, longitude (unit: degree)

The output is as follows
- Y = house price per unit area (10000 New Taiwan Dollar/Ping, where Ping is a local unit, 1 Ping = 3.3 square meters)

In [None]:
# Print the column names and the types Spark inferred for them.
real_estate.printSchema()

# Features

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the model's input vector. Only X1-X4 are used here;
# the X5 latitude / X6 longitude attributes described above are not assembled.
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
]

# Combine the selected columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Original columns are kept; the 'features' column is appended.
data_set = assembler.transform(real_estate)

In [None]:
# Show the assembled vector next to the label for two rows, without
# truncating the cell contents.
data_set.select('features', 'Y house price of unit area').show(2, truncate=False)

In [None]:
# 70/30 train/test split. The seed is fixed so the split, and every metric
# computed from it below, is the same on each Restart & Run All.
(train_data, test_data) = data_set.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Number of rows that ended up in the training split.
train_data.count()

In [None]:
# Number of rows that ended up in the test split.
test_data.count()

# Linear Regression

In [None]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression on the house-price label. The input vector
# column is left at the estimator's default name.
lr = LinearRegression(
    labelCol='Y house price of unit area',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.5,
)

In [None]:
# List every estimator parameter with its description and current value.
print(lr.explainParams())

In [None]:
# Fit the regression on the training split only; the test split stays held out.
lr_model = lr.fit(train_data)

In [None]:
# Display the fitted coefficient vector (presumably one weight per assembled
# input column, in inputCols order; verify against the assembler).
lr_model.coefficients

In [None]:
# Score the fitted model on the held-out test split and report its error metrics.
test_stats = lr_model.evaluate(test_data)
print(f"RMSE: {test_stats.rootMeanSquaredError}")
print(f"MSE: {test_stats.meanSquaredError}")

In [None]:
# The bare name `test` was never defined, so this cell raised NameError on a
# fresh kernel. Report R² from the existing test-set evaluation instead.
print(f"R2: {test_stats.r2}")

In [None]:
# Append the model's 'prediction' column to the held-out rows.
lr_predictions = lr_model.transform(test_data)

# Compare predicted and actual prices for the first five test rows.
(
    lr_predictions
    .select("prediction", "Y house price of unit area")
    .show(n=5)
)

# Stop Spark 

In [None]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()