In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session for this notebook under the app name 'lrexmpl'.
spark = (
    SparkSession.builder
    .appName('lrexmpl')
    .getOrCreate()
)

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Load the e-commerce customers CSV from DBFS. The first row supplies the
# column names and column types are inferred from the values.
data = spark.read.csv(
    'dbfs:/FileStore/shared_uploads/gkantirisrafael@gmail.com/Ecommerce_Customers-2.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first two rows to check the column names and inferred values.
# NOTE(review): the rendered output includes customer Email and Address values.
data.show(2)

+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|   Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|   Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
only showing top 2 rows



In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# Pack the four numeric customer-behaviour columns into a single vector
# column named 'features', which is what the regression below consumes.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [0]:
# Apply the assembler: keeps the original columns and appends the 'features' vector.
output = assembler.transform(data)

In [0]:
# Inspect one assembled row to confirm the 'features' vector was added.
output.head(1)

Out[18]: [Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [0]:
# Keep only the model input vector and the regression target column.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)

In [0]:
# Randomly split the data into roughly 70% training and 30% test rows.
# The fixed seed keeps the split (and every metric computed from it)
# reproducible across kernel restarts for the same input data; without it,
# each run produces different partitions and different results.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Summary statistics of the training split (row count and label distribution).
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                340|
|   mean|  498.4238503764669|
| stddev|   81.1573183889165|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [0]:
# Summary statistics of the test split, for comparison with the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                160|
|   mean| 501.20568750810423|
| stddev|  75.46324394675781|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



In [0]:
# Configure (but do not yet fit) a linear regression that predicts
# 'Yearly Amount Spent' from the assembled 'features' vector.
lr = LinearRegression(
    featuresCol='features',  # same as the default, spelled out for clarity
    labelCol='Yearly Amount Spent',
)

In [0]:
# Fit the linear regression on the training split only.
lr_model = lr.fit(train_data)

In [0]:
# Evaluate the fitted model on the held-out test split; the returned summary
# is read below for residuals, RMSE and R^2.
test_results = lr_model.evaluate(test_data)

In [0]:
# Show the per-row residuals (label minus prediction) on the test split.
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|  11.098374133056836|
|   6.890073653749141|
| -21.437935224349417|
|-0.19381778679723993|
| -3.3669741534949935|
|   19.10411421516443|
|   4.100187971106948|
|  -4.041479044324888|
|   3.018080432381282|
|  -8.590592903136837|
|   -7.43028701691253|
|   -6.08963577701428|
|  -9.025149465179254|
| -2.2229558564470153|
|  -5.450065494106809|
|    5.66788750867579|
|  -4.276942721102273|
|   6.040717348865883|
|  0.2140388659303767|
|  -8.501562276646155|
+--------------------+
only showing top 20 rows



In [0]:
# Root mean squared error on the test split, in the same units as the label.
test_results.rootMeanSquaredError

Out[28]: 10.008254988667312

In [0]:
# Coefficient of determination (R^2) on the test split.
test_results.r2

Out[29]: 0.9823001885053427

In [0]:
# Summary statistics of the full dataset, to put the RMSE in context of the label's spread.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [0]:
# Keep only the feature vectors (no label) to mimic new, unlabeled data
# for which the amount spent will be predicted.
unlabeled_data = test_data.select('features')

In [0]:
# Score the unlabeled rows; the result carries a 'prediction' column next to 'features'.
predictions = lr_model.transform(unlabeled_data)

In [0]:
# Display the predicted yearly amounts (show() prints the first 20 rows by default).
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.7377203726281...|450.68236806317304|
|[30.9716756438877...| 487.7485361031436|
|[31.1239743499119...| 508.3849890641152|
|[31.2606468698795...| 421.5204490437486|
|[31.2681042107507...| 426.8375073273189|
|[31.3123495994443...| 444.4873038127762|
|[31.3662121671876...|426.48869458537797|
|[31.4252268808548...| 534.8081976990868|
|[31.4459724827577...| 481.8588845027473|
|[31.5261978982398...|417.68511909547465|
|[31.7207699002873...| 546.2052204949355|
|[31.7242025238451...| 509.4775230649748|
|[31.8279790554652...| 449.0278970121208|
|[31.8530748017465...|  461.508079318799|
|[31.8745516945853...| 397.7353097403743|
|[31.9480174211613...|  456.252989384222|
|[31.9673209478824...| 450.0267839607545|
|[31.9764800614612...| 324.5537286852343|
|[32.0047530203648...|463.53194225469906|
|[32.0085045178551...|451.69878330540155|
+--------------------+------------