In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression



In [18]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("this")
    .getOrCreate()
)

# Load the e-commerce customers CSV: the first row supplies the column names
# and column types are inferred from the values.
csv_path = "/Users/david/gitProjects/Sparkmllib/Ecommerce-Customers.csv"
data = spark.read.csv(csv_path, header=True, inferSchema=True)

# Preview the loaded rows.
data.show()


+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [56]:

# Pack the four numeric predictor columns into the single vector column
# ('features') that the regression estimator consumes.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)
output = assembler.transform(data)

# Keep only the model input (feature vector) and the regression target.
final_data = output.select('features', 'Yearly Amount Spent')

# Fixed seed so the 70/30 train/test split -- and therefore every metric
# printed below -- is the same on each Restart & Run All. Without a seed the
# split is re-drawn on every execution.
SPLIT_SEED = 42
train, test = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Only the label column is named; the features column is left at the
# estimator's default, which matches the assembler's 'features' output.
lr = LinearRegression(labelCol='Yearly Amount Spent')
lr_model = lr.fit(train)

# Score the held-out split and report per-row residuals, RMSE and R^2.
testresults = lr_model.evaluate(test)
testresults.residuals.show()
print(testresults.rootMeanSquaredError)
print(testresults.r2)

+-------------------+
|          residuals|
+-------------------+
|-10.853952142453522|
| -0.346335803090426|
| 10.628836797329427|
| 10.678738661052876|
|-3.2456992862119023|
| 3.3009667873658373|
| 3.7435771480714948|
| -4.746453504370493|
| 2.3073456773437897|
|-15.126439922006739|
| -2.802902523992202|
| 18.095450226555613|
| 17.702909427285988|
|-0.5422047372411498|
| -26.86376774890266|
|-0.7103734093308844|
|-6.2238181863751265|
| 2.2785865618860157|
| -5.400052671742799|
|-17.399021369212107|
+-------------------+
only showing top 20 rows

10.71952560242756
0.9786898442337945


In [31]:
# Print the inferred schema, then each field of the first record on its own
# line so the raw values can be compared against the column types.
data.printSchema()
first_row = data.head(1)[0]
for value in first_row:
    print(value)

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [32]:
# Display the column names of the loaded DataFrame (shown as the cell output).
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [36]:
# Combine the four numeric predictor columns into one vector column named
# 'features'. The fourth entry, 'Length of Membership', was previously cut
# off to a dangling quote, which made this cell a SyntaxError.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)


In [38]:
# Run the assembler over the loaded data; the result keeps every original
# column and gains the 'features' vector column.
output = assembler.transform(data)

In [43]:
# Check that the assembled frame has the original columns plus 'features'.
output.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'features']

In [57]:
# Narrow the assembled frame to the model input (feature vector) and target.
model_columns = ['features', 'Yearly Amount Spent']
final_data = output.select(*model_columns)

# Summary statistics; as the rendered table shows, only the numeric target
# column is summarised -- the vector column does not appear.
final_summary = final_data.describe()
final_summary.show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [48]:
# Summary statistics of the held-out test split.
test_summary = test.describe()
test_summary.show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                146|
|   mean|  499.5397791168749|
| stddev|  69.86672309710617|
|    min| 347.77692663187264|
|    max|  708.9351848669818|
+-------+-------------------+



In [50]:
# Summary statistics of the training split.
train_summary = train.describe()
train_summary.show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                354|
|   mean| 499.22093609669963|
| stddev|  82.99149908031536|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [58]:
# Drop the label: keep only the feature vectors of the test split so they can
# be scored as if they were unlabeled data.
unlableddata = test.select('features')

In [61]:
# Preview the label-free feature vectors (the output shows the top 20 rows).
unlableddata.show()

+--------------------+
|            features|
+--------------------+
|[30.3931845423455...|
|[30.5743636841713...|
|[30.7377203726281...|
|[31.1695067987115...|
|[31.2681042107507...|
|[31.3091926408918...|
|[31.3662121671876...|
|[31.4252268808548...|
|[31.4459724827577...|
|[31.5741380228732...|
|[31.5761319713222...|
|[31.6005122003032...|
|[31.6098395733896...|
|[31.6610498227460...|
|[31.6739155032749...|
|[31.7216523605090...|
|[31.7242025238451...|
|[31.7366356860502...|
|[31.7656188210424...|
|[31.9048571310136...|
+--------------------+
only showing top 20 rows



In [63]:
# Score the label-free feature vectors with the fitted regression model.
# The fitted model is bound to `lr_model` in the training cell; the bare name
# `model` is not defined anywhere in this notebook and raises NameError on a
# fresh kernel.
predic = lr_model.transform(unlableddata)

In [64]:
# Display each feature vector next to the model's 'prediction' column.
predic.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...| 330.2809057805989|
|[30.5743636841713...| 440.8077328605614|
|[30.7377203726281...|449.52844772676076|
|[31.1695067987115...| 416.1967161956745|
|[31.2681042107507...| 425.6013952251649|
|[31.3091926408918...| 428.4829974221559|
|[31.3662121671876...|426.12749891727685|
|[31.4252268808548...| 533.8150857882915|
|[31.4459724827577...|481.67263353131943|
|[31.5741380228732...| 557.4798366793905|
|[31.5761319713222...| 542.2212286952195|
|[31.6005122003032...| 460.2158711379252|
|[31.6098395733896...|426.31198141092045|
|[31.6610498227460...|  416.472685675963|
|[31.6739155032749...| 501.3524581476415|
|[31.7216523605090...|  349.077783057636|
|[31.7242025238451...|508.21168565843004|
|[31.7366356860502...| 492.9838091126842|
|[31.7656188210424...|501.11510134544847|
|[31.9048571310136...| 490.4245834215103|
+--------------------+------------