### **Chapter 6: Linear Regression**
Demo
In this demo we examine a dataset of e-commerce customer data collected from a company's
website and mobile app. We then build a regression model that predicts
each customer's yearly spend on the company's products.
The first step is to start a Spark session.

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

spark-2.4.0-bin-hadoop2.7/
spark-2.4.0-bin-hadoop2.7/python/
spark-2.4.0-bin-hadoop2.7/python/setup.cfg
spark-2.4.0-bin-hadoop2.7/python/pyspark/
spark-2.4.0-bin-hadoop2.7/python/pyspark/resultiterable.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/python/
spark-2.4.0-bin-hadoop2.7/python/pyspark/python/pyspark/
spark-2.4.0-bin-hadoop2.7/python/pyspark/python/pyspark/shell.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/heapq3.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/join.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/version.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/rdd.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/java_gateway.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/find_spark_home.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/_globals.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/worker.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/accumulators.py
spark-2.4.0-bin-hadoop2.7/python/pyspark/mllib/
spark-2.4.0-bin-hadoop2.7/python/pyspark/mllib/feature.py
spark-2.4.0-bin-hadoop2.7/python/pyspark

In [2]:
# Mount Google Drive under /content/gdrive so the course files stored there are
# reachable from this Colab runtime (force_remount=True asks for a fresh mount).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
# Work from the course folder on the mounted Drive; the relative paths used later
# ("data/...", "Chapter_6/...") are resolved against this directory.
%cd '/content/gdrive/My Drive/LDS9_K273_ONLINE_Đinh Viết Trung'

/content/gdrive/My Drive/LDS9_K273_ONLINE_Đinh Viết Trung


In [4]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types 
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

In [5]:
from pyspark import SparkContext

# Reuse the running SparkContext when this cell is executed again. Calling
# SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [6]:
# Obtain the session through the builder API: getOrCreate() returns the session
# that is already active when the cell is re-run, and otherwise creates one on
# top of the existing SparkContext, instead of constructing a brand-new
# SparkSession object on every execution.
spark = SparkSession.builder.getOrCreate()

In [11]:
# Load the Ecommerce Customers CSV into a DataFrame: the first row supplies the
# column names and the column types are inferred from the values.
csv_path = "data/Ecommerce_Customers.csv"
data = spark.read.csv(csv_path, header=True, inferSchema=True)

In [12]:
# Print the schema of the DataFrame (column names, inferred types, nullability)
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [13]:
# Preview the first 5 rows as a text table.
data.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [14]:
# Fetch the first record as a Row object to inspect the untruncated values.
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [15]:
# Print every field of the first row on its own line.
print(*data.head(), sep="\n")

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [18]:
# Setting Up DataFrame for Machine Learning
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [19]:
# List the column names to pick the numeric inputs for the feature vector.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [20]:
# Numeric columns used as model inputs; the assembler packs them, in this
# order, into a single vector column named "features".
feature_cols = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [21]:
# Append the assembled "features" vector column to the data, then display the
# first two vectors in full (no truncation).
data_pre = assembler.transform(data)
features_only = data_pre.select("features")
features_only.show(n=2, truncate=False)

+--------------------------------------------------------------------------+
|features                                                                  |
+--------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]|
+--------------------------------------------------------------------------+
only showing top 2 rows



In [22]:
# The transformed frame keeps every original column and gains "features".
data_pre.show(2)

+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|   Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|   Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+----

In [23]:
# Keep only what the model needs: the feature vector and the label column
# ('Yearly Amount Spent').
final_data = data_pre.select("features",'Yearly Amount Spent')

In [24]:
# Split roughly 70% / 30% into training and test sets. The fixed seed makes the
# split, and therefore the fitted coefficients and the metrics reported
# further down, reproducible across kernel restarts.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [25]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                344|
|   mean|  497.2687714574026|
| stddev|  81.29523970671858|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [26]:
# Same summary for the test split, to check that both splits have a similar label distribution.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                156|
|   mean| 503.82411376890315|
| stddev|  74.81832232310599|
|    min| 308.52774655803336|
|    max|  684.1634310159512|
+-------+-------------------+



In [28]:
from pyspark.ml.regression import LinearRegression

In [29]:
# Configure the (not yet fitted) linear regression estimator: which column
# holds the inputs, which holds the target, and the name of the column the
# predictions will be written to.
lr_params = {
    "featuresCol": "features",
    "labelCol": 'Yearly Amount Spent',
    "predictionCol": 'Predict_Yearly Amount Spent',
}
lr = LinearRegression(**lr_params)

In [30]:
# Train on the training split only; the fitted model is kept as lrModel.
lrModel = lr.fit(train_data)

In [31]:
# Report the fitted parameters: one coefficient per input column (in the order
# they were assembled) plus the intercept.
coefficients, intercept = lrModel.coefficients, lrModel.intercept
print("Coefficients: {} Intercept: {}".format(coefficients, intercept))


Coefficients: [25.59307662714446,38.23602595020023,0.25970174785062894,61.72884063747948] Intercept: -1035.4288034924946


In [32]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and the RMSE / MSE / r2 metrics used below.
test_results = lrModel.evaluate(test_data)

In [33]:
# Residuals on the test set (actual label minus predicted value), first five rows.
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|  4.388396741209476|
|-21.770598440151275|
|-0.1251558515433544|
| 2.7659905825977376|
|  4.202712107864386|
+-------------------+
only showing top 5 rows



In [34]:
# Score the test split: despite its name, test_model is a DataFrame, namely
# test_data with the 'Predict_Yearly Amount Spent' column added.
test_model = lrModel.transform(test_data)

In [35]:
# Compare predicted and actual yearly spend side by side for the first five test rows.
test_model.select("Predict_Yearly Amount Spent", "Yearly Amount Spent").show(5)


+---------------------------+-------------------+
|Predict_Yearly Amount Spent|Yearly Amount Spent|
+---------------------------+-------------------+
|          388.1090024478119|  392.4973991890214|
|         508.71765227991705|  486.9470538397658|
|          421.4517871084947|  421.3266312569514|
|          429.9547272573359|  432.7207178399336|
|           490.973238341611|  495.1759504494754|
+---------------------------+-------------------+
only showing top 5 rows



In [36]:
# Report the test-set error metrics, one per line.
metric_lines = (
    ("RMSE: {}", test_results.rootMeanSquaredError),
    ("MSE: {}", test_results.meanSquaredError),
    ("r2: {}", test_results.r2),
)
for template, value in metric_lines:
    print(template.format(value))

RMSE: 10.155310313968855
MSE: 103.1303275730022
r2: 0.9814577101287018


In [37]:
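# Excellent results: r2 is about 0.98 and the RMSE is about 10 on the test set, where the mean yearly spend is about 504.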

In [38]:
# Save the fitted model to disk. A plain save() fails when the target path
# already exists, so overwrite() is required for this cell to succeed when the
# notebook is run more than once.
lrModel.write().overwrite().save('Chapter_6/lrModel_Ecommerce_Customers')

In [39]:
from pyspark.ml.regression import LinearRegressionModel
# Load the saved model back from disk into a separate variable (lrModel2)
lrModel2 = LinearRegressionModel.load('Chapter_6/lrModel_Ecommerce_Customers')

In [40]:
# Simulate new, unlabeled data by keeping only the features column of test_data
unlabeled_data = test_data.select('features')


In [41]:
# Score the unlabeled rows with the reloaded model; the output contains the
# features plus the 'Predict_Yearly Amount Spent' column.
predictions = lrModel2.transform(unlabeled_data)
predictions.show(5)

+--------------------+---------------------------+
|            features|Predict_Yearly Amount Spent|
+--------------------+---------------------------+
|[31.0472221394875...|          388.1090024478119|
|[31.1239743499119...|         508.71765227991705|
|[31.2606468698795...|          421.4517871084947|
|[31.3091926408918...|          429.9547272573359|
|[31.3584771924370...|           490.973238341611|
+--------------------+---------------------------+
only showing top 5 rows

