## 1. Running Pyspark in Jupyter Notebook

In [1]:
# spark-3.2.2-bin-hadoop3.2
# install findspark using pip
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1




In [2]:
import findspark

findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

## 2. Analysis and Regression on Boston Housing Dataset

### 2.1 Importing the dataset

In [4]:
# import the boston housing dataset
dataset = spark.read.csv('data/BostonHousing.csv', header=True, inferSchema=True)

### 2.2 Data Exploration : Transformations

In [5]:
# Transformations
# Convert all the features from different columns into a single column
# Let's call this new vector column as 'Attributes' in the outputCol parameter
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Input all the features in one vector column
assembler = VectorAssembler(inputCols=['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'], outputCol='Attributes')

# Use the assembler object to transform the dataset
output = assembler.transform(dataset)

# Input vs Output
finalized_data = output.select('Attributes', 'medv')

finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



### 2.3 Splitting the dataset

In [6]:
# split training and testing data
train_data, test_data = finalized_data.randomSplit([0.8, 0.2])

### 2.4 Learn and predict the Linear Regression

In [7]:
regressor = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit the model from the training data
regressor = regressor.fit(train_data)

# Predict the output for the test data
pred_results = regressor.evaluate(test_data)

# Predict the model
pred_results.predictions.show()



+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.01381,80.0,0.4...|50.0| 40.99234580781855|
|[0.01439,60.0,2.9...|29.1|31.718477818040853|
|[0.01709,90.0,2.0...|30.1|25.166621887817712|
|[0.01951,17.5,1.3...|33.0| 23.13869167935512|
|[0.02543,55.0,3.7...|23.9| 27.93171690439138|
|[0.02731,0.0,7.07...|21.6| 25.28478869656535|
|[0.02985,0.0,2.18...|28.7| 25.19280558015525|
|[0.03041,0.0,5.19...|18.5| 19.01263292794857|
|[0.03237,0.0,2.18...|33.4| 28.41169475012503|
|[0.03466,35.0,6.0...|19.4|23.342511801554636|
|[0.03502,80.0,4.9...|28.5| 33.77173172342891|
|[0.03578,20.0,3.3...|45.4| 38.97604974116794|
|[0.03871,52.5,5.3...|23.2|27.178636942880445|
|[0.03932,0.0,3.41...|22.0|27.660238991915033|
|[0.04203,28.0,15....|22.9|29.431010977827448|
|[0.04294,28.0,15....|20.6| 27.72804839897894|
|[0.04666,80.0,1.5...|30.3|32.903768278630196|
|[0.05023,35.0,6.0...|17.1|  19.9621106227574|
|[0.05302,0.0

### 2.5 Print the coefficients and intercept for linear regression

In [8]:
# Coefficient of the regression model
coeff = regressor.coefficients

# X and Y intercept
intercept = regressor.intercept

print('Coefficients: %s' % str(coeff))
print('Intercept: %s' % str(intercept))


Coefficients: [-0.10811352639957023,0.05106316232531196,0.04617665449513621,3.2782831464761752,-21.530120251995523,3.7037096720122347,0.010525252750265073,-1.5548193917372728,0.2924655132960359,-0.011137946309134606,-0.9644053882628786,0.009353938462252858,-0.549101222121598]
Intercept: 38.75295476840933


### 2.6 Evaluating the model

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

eval = RegressionEvaluator(labelCol='medv', predictionCol='prediction', metricName='rmse')

# Root Mean Square Error
rmse = eval.evaluate(pred_results.predictions, {eval.metricName: "rmse"})
print("Root Mean Square Error (RMSE) on test data = %g" % rmse)

# Mean Square Error
mse = eval.evaluate(pred_results.predictions, {eval.metricName: "mse"})
print('Root Mean Square Error: ', rmse)

# Mean Absolute Error
mae = eval.evaluate(pred_results.predictions, {eval.metricName: "mae"})
print('Mean Absolute Error: ', mae)

# R2
r2 = eval.evaluate(pred_results.predictions, {eval.metricName: "r2"})
print('R2: ', r2)

Root Mean Square Error (RMSE) on test data = 4.43927
Root Mean Square Error:  4.43927179267644
Mean Absolute Error:  3.469671474744002
R2:  0.7367837381989424


### 2.7 Clustering the dataset

In [13]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Trains a k-means model.
kmeans = KMeans(featuresCol='Attributes').setK(2).setSeed(1)
model = kmeans.fit(finalized_data)

# Make predictions
predictions = model.transform(finalized_data)
predictions.show()

+--------------------+----+----------+
|          Attributes|medv|prediction|
+--------------------+----+----------+
|[0.00632,18.0,2.3...|24.0|         0|
|[0.02731,0.0,7.07...|21.6|         0|
|[0.02729,0.0,7.07...|34.7|         0|
|[0.03237,0.0,2.18...|33.4|         0|
|[0.06905,0.0,2.18...|36.2|         0|
|[0.02985,0.0,2.18...|28.7|         0|
|[0.08829,12.5,7.8...|22.9|         0|
|[0.14455,12.5,7.8...|27.1|         0|
|[0.21124,12.5,7.8...|16.5|         0|
|[0.17004,12.5,7.8...|18.9|         0|
|[0.22489,12.5,7.8...|15.0|         0|
|[0.11747,12.5,7.8...|18.9|         0|
|[0.09378,12.5,7.8...|21.7|         0|
|[0.62976,0.0,8.14...|20.4|         0|
|[0.63796,0.0,8.14...|18.2|         0|
|[0.62739,0.0,8.14...|19.9|         0|
|[1.05393,0.0,8.14...|23.1|         0|
|[0.7842,0.0,8.14,...|17.5|         0|
|[0.80271,0.0,8.14...|20.2|         0|
|[0.7258,0.0,8.14,...|18.2|         0|
+--------------------+----+----------+
only showing top 20 rows



### 3. Churn analysis in Spark

