# Linear Regression Code

- Predict customer expenditure amount (continuous value => Regression)
- Linear Regression Algorithm
- Convert realistic data into Spark MLlib format

In [1]:
# start (or reuse) the Spark session used by every cell of this notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [2]:
# load linear regression lib
from pyspark.ml.regression import LinearRegression

In [3]:
# read in the input csv file: the first row supplies the column names and
# the column types are inferred from the values
csv_path = 'Ecommerce_Customers.csv'
data = spark.read.csv(csv_path, header=True, inferSchema=True)

## Explore the data

In [4]:
# list the column names and inferred types of the loaded data
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [5]:
# optional: summary statistics of the raw data (left disabled; uncomment to run)
#data.describe().show()

In [6]:
# print every field of the first row on its own line
first_row = data.head()
for field_value in first_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


## Convert data to MLlib format
It needs to be in the form of two columns: ("label","features")

In [7]:
# import helpers
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# numeric input columns that get packed into the single 'features' column
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# configure the vector assembler, then apply it to add the 'features' column
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(data)

In [9]:
# the assembler output is the input plus the added 'features' column;
# print the schema to confirm the new column is present
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# 'features' holds one assembled vector per row; preview the first rows
features_preview = output.select('features')
features_preview.show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [16]:
# keep only the feature vector and the label; the label column
# 'Yearly Amount Spent' is renamed to 'expenditure'
label_column = output['Yearly Amount Spent'].alias('expenditure')
final_data = output.select(output['features'], label_column)

In [17]:
# split into training (70%) and testing (30%) sets; the fixed seed keeps the
# random split repeatable for the same input data, so re-running the notebook
# does not change the split (and with it every metric computed further down)
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [18]:
# summary statistics of the training split
train_summary = train_data.describe()
train_summary.show()

+-------+------------------+
|summary|       expenditure|
+-------+------------------+
|  count|               343|
|   mean|497.33460940319003|
| stddev|  76.1699510809515|
|    min|256.67058229005585|
|    max| 765.5184619388373|
+-------+------------------+



In [19]:
# summary statistics of the testing split
test_summary = test_data.describe()
test_summary.show()

+-------+------------------+
|summary|       expenditure|
+-------+------------------+
|  count|               157|
|   mean|503.63852295542176|
| stddev| 85.88191921757424|
|    min|  266.086340948469|
|    max| 744.2218671047146|
+-------+------------------+



## Create and Evaluate Linear Regression Model

In [21]:
# create the linear regression model object, naming the feature, label and
# prediction columns explicitly
lr = LinearRegression(
    featuresCol='features',
    labelCol='expenditure',
    predictionCol='expenditure_pred',
)

In [22]:
# fit the model on the training split only; test_data is kept aside for the
# evaluation step
lr_model = lr.fit(train_data)

In [25]:
# print the fitted coefficients and the intercept of the linear regression
# Coefficients => 'Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'
summary_template = "Coefficients: {}\nIntercept: {}"
print(summary_template.format(lr_model.coefficients, lr_model.intercept))

Coefficients: [26.161405080006446,38.03906632391081,0.31653059095304314,61.86527226556498]
Intercept: -1053.8051091940522


In [27]:
# evaluate the fitted model on the held-out test_data; the returned summary
# provides the residuals and the error metrics printed in the next cells
test_results = lr_model.evaluate(test_data)

In [28]:
# show the residuals of the test predictions
# (Spark computes each residual as label - prediction)
test_residuals = test_results.residuals
test_residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 11.562328805557968|
|-15.905871639119255|
|-3.0663272170067444|
|-2.5025954738886753|
|  4.705524081528324|
| -21.25330626156608|
| -6.443467709521428|
|  3.207483628826026|
|-3.0068239039612763|
| -8.803456780573356|
|-1.6854533711995145|
|  17.89235593232712|
|-1.3766377041021656|
| -4.247998401065217|
| -4.372430185573421|
|  8.560184674617005|
| 10.539645328837764|
|  -5.75169106944702|
|  6.406702313570918|
|-0.3145344023171788|
+-------------------+
only showing top 20 rows



In [36]:
# print the error metrics of the model on the test data, one per line
metrics_template = 'MAE:  {}\nMSE:  {}\nRMSE: {}\nR2:   {}'
metric_values = (
    test_results.meanAbsoluteError,
    test_results.meanSquaredError,
    test_results.rootMeanSquaredError,
    test_results.r2,
)
print(metrics_template.format(*metric_values))

MAE:  8.164564100703513
MSE:  110.6571191317799
RMSE: 10.519368761089227
R2:   0.9849009045229011


In [37]:
# summary statistics of the full labelled data set, used as the yardstick
# for judging the size of the error metrics
full_summary = final_data.describe()
full_summary.show()

+-------+------------------+
|summary|       expenditure|
+-------+------------------+
|  count|               500|
|   mean| 499.3140382585909|
| stddev|  79.3147815497068|
|    min|256.67058229005585|
|    max| 765.5184619388373|
+-------+------------------+



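- An RMSE of about 10.5 dollars is small compared with the mean expenditure of about 500 dollars (and its standard deviation of about 79 dollars), so the typical prediction error is only around 2% of the label.
- The R2 value indicates that the model explains about 98% of the variance of the expenditure in the test data.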

## Apply model to unlabeled data

In [39]:
# create a set of unlabeled data by keeping only the 'features' column of
# test_data (the 'expenditure' label is left out)
unlabeled_data = test_data.select('features')

In [42]:
# apply the model to predict 'expenditure'; the predictions are added as the
# 'expenditure_pred' column
expenditure_predictions = lr_model.transform(unlabeled_data)

In [43]:
# preview each feature vector next to its predicted expenditure
expenditure_predictions.show()

+--------------------+------------------+
|            features|  expenditure_pred|
+--------------------+------------------+
|[30.7377203726281...| 450.2184133906719|
|[30.8162006488763...|281.99221258758826|
|[30.8364326747734...|470.56822764399635|
|[30.8794843441274...|492.70919545874335|
|[31.0472221394875...|387.79187510749307|
|[31.1239743499119...|508.20036010133185|
|[31.1280900496166...| 563.6961544565761|
|[31.3091926408918...| 429.5132342111076|
|[31.4252268808548...| 533.7735425587232|
|[31.4474464941278...| 427.4061988757974|
|[31.5761319713222...| 542.9120373605278|
|[31.6098395733896...|426.65319371878104|
|[31.7216523605090...| 349.1535643359748|
|[31.7656188210424...|500.80208003667235|
|[31.8124825597242...|397.18277516937064|
|[31.8512531286083...| 464.4320619921814|
|[31.9096268275227...| 552.9063903444014|
|[31.9453957483445...|  662.771615007099|
|[31.9480174211613...| 455.5141745793269|
|[32.0047530203648...| 464.0605155229466|
+--------------------+------------