# Linear Regression Project

In [1]:
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook runs on other
# machines; fall back to the original local install path when it is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/fernando/spark-2.4.6-bin-hadoop2.7'))

# pyspark is imported only after findspark.init() has run, as in the original cell.
import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one named 'lrproject'.
spark = SparkSession.builder.appName('lrproject').getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Read the customer CSV: the first line is the header and column types are
# inferred from the values instead of defaulting to strings.
data = spark.read.csv(
    'Linear_Regression/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Check the column names and the types produced by schema inference.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [5]:
# Print every field of the first row, one value per line.
first_row = data.head(1)[0]
for field_value in first_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


We want to predict Yearly Amount Spent:

We'll work just with the numeric data as features.

In [6]:
# Number of rows in the dataset.
data.count()

500

## Set up dataframe for Machine Learning

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names so the feature columns can be picked from them.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

Features:
- 'Avg Session Length',
- 'Time on App',
- 'Time on Website',
- 'Length of Membership',

Target:
- 'Yearly Amount Spent'

In [9]:
# Numeric columns used as model inputs; the target 'Yearly Amount Spent'
# is deliberately left out.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Combines the columns above into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

Transform the data: **before train test split**

In [10]:
# Returns a new DataFrame with the assembled 'features' column added; `data` itself is not modified.
output = assembler.transform(data)

In [11]:
# Confirm that the 'features' vector column was added alongside the original columns.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [12]:
# Peek at the first 5 assembled feature vectors.
output.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
+--------------------+
only showing top 5 rows



In [13]:
# Print every field of the first assembled row; the last value is the feature vector.
first_assembled_row = output.head(1)[0]
for field_value in first_assembled_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005
[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]


In [14]:
# Keep only what the model needs: the feature vector and the label column.
final_data = output.select('features', 'Yearly Amount Spent')

### Ready for Machine Learning in Spark

In [15]:
# Preview the first 3 rows of the modelling DataFrame.
final_data.show(3)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
+--------------------+-------------------+
only showing top 3 rows



## Split data

In [16]:
# 70/30 train/test split. The fixed seed makes the split, and every metric
# computed from it below, reproducible across kernel restarts.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [17]:
# Summary statistics of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                349|
|   mean|  501.3203612219935|
| stddev|     79.46383453995|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [18]:
# Summary statistics of the label in the test split, for comparison with the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                151|
|   mean| 494.67690770079247|
| stddev|  79.03705092666182|
|    min| 302.18954780965197|
|    max|  689.7876041747194|
+-------+-------------------+



In [19]:
# Configure the estimator: which column holds the feature vector and which holds the label.
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent')

In [20]:
# Fit on the training split only; the test split stays unseen until evaluation.
lr_model = lr.fit(train_data)

In [21]:
# Evaluate the fitted model on the held-out test split; the result exposes
# the residuals, rootMeanSquaredError and r2 used in the cells below.
test_results = lr_model.evaluate(test_data)

Compare the predicted values against the actual values in the test set. Each residual is the actual value minus the predicted value.

In [22]:
# First 5 residuals on the test split (actual 'Yearly Amount Spent' minus the prediction).
test_results.residuals.show(5)

+--------------------+
|           residuals|
+--------------------+
|  10.009265267824503|
| -11.609630641179479|
|-0.05856256967018...|
|  -3.678160556556975|
|   7.080238209961863|
+--------------------+
only showing top 5 rows



### RootMeanSquaredError: error in the units of the dataset

In [23]:
# Label statistics over the full dataset, to put the size of the RMSE in context.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In this case, the RMSE is about **\$10.22 USD** against a mean yearly spend of about **\$500**, so the model performs really well! (The exact RMSE depends on the random train/test split.)

In [24]:
# RMSE on the test split, in the same units as 'Yearly Amount Spent'.
test_results.rootMeanSquaredError

10.216140526282887

In [25]:
# R-squared on the test split.
test_results.r2

0.983181085994329

In [26]:
# NOTE(review): this repeats the full-dataset label summary already shown
# before the RMSE cell; consider keeping only one of the two.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



## Deploy model
Predict Label based on new features

In [27]:
# Drop the label column to mimic new, unlabeled input.
unlabeled_data = test_data.select('features')

In [28]:
# Preview the first 5 unlabeled feature vectors.
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.3931845423455...|
|[30.5743636841713...|
|[30.8794843441274...|
|[30.9716756438877...|
+--------------------+
only showing top 5 rows



In [29]:
# Score the unlabeled rows; the result carries a 'prediction' column next to 'features'.
predictions = lr_model.transform(unlabeled_data)

In [31]:
# Display the predictions (show() without arguments prints the first 20 rows).
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|  398.631085804803|
|[30.3931845423455...| 331.5385004443731|
|[30.5743636841713...|442.12297632773584|
|[30.8794843441274...|493.88476054141165|
|[30.9716756438877...|487.55837154693086|
|[31.1239743499119...|507.87783900005957|
|[31.3091926408918...|429.54622896832643|
|[31.3895854806643...| 410.1949095044961|
|[31.4474464941278...|427.03972409875973|
|[31.5257524169682...| 449.6822494386597|
|[31.5261978982398...| 418.8410947263683|
|[31.5741380228732...| 558.4947836556405|
|[31.5761319713222...| 543.1640191700549|
|[31.6098395733896...| 427.1510793367179|
|[31.6253601348306...|380.97056310200173|
|[31.7366356860502...| 494.2186794630095|
|[31.8124825597242...|395.78876783869714|
|[31.8164283341993...| 518.4951304004915|
|[31.8279790554652...|449.53929499362084|
|[31.9262720263601...|  380.703769776964|
+--------------------+------------

Compared to the original values:

In [32]:
# Display the actual labels for the same test rows, to compare against the predictions above.
test_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.5324289670579...|  408.6403510726275|
|[30.3931845423455...|  319.9288698031936|
|[30.5743636841713...| 442.06441375806565|
|[30.8794843441274...|  490.2065999848547|
|[30.9716756438877...|  494.6386097568927|
|[31.1239743499119...|  486.9470538397658|
|[31.3091926408918...|  432.7207178399336|
|[31.3895854806643...|  410.0696110599829|
|[31.4474464941278...|   418.602742095224|
|[31.5257524169682...|  443.9656268098819|
|[31.5261978982398...|  409.0945261923378|
|[31.5741380228732...|  544.4092721605869|
|[31.5761319713222...|  541.2265839893283|
|[31.6098395733896...| 444.54554965110816|
|[31.6253601348306...|  376.3369007569242|
|[31.7366356860502...|  496.9334462555319|
|[31.8124825597242...|  392.8103449837972|
|[31.8164283341993...| 501.12249150365636|
|[31.8279790554652...|  440.0027475469415|
|[31.9262720263601...|  392.2049334443264|
+----------