In [1]:
import os

import pandas as pd
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by every later cell: reuse an existing one if
# available, otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Location of the customer CSV. Set the ECOMMERCE_CUSTOMERS_CSV environment
# variable to point at your own copy of the file; when it is unset, the
# original author's local path is used as the fallback so existing setups
# keep working unchanged.
data = os.environ.get(
    'ECOMMERCE_CUSTOMERS_CSV',
    'C:/Users/Akshay Yadav/Desktop/pyworkspace/Ecommerce_Customers.csv',
)

# Load the CSV into a pandas DataFrame; a later cell converts it to Spark.
pdata = pd.read_csv(data)

In [4]:
# Convert the pandas DataFrame into a Spark DataFrame for the pyspark.ml steps below.
sdata = spark.createDataFrame(data=pdata)

In [5]:
# Preview the first 20 rows; long cell values are truncated in the printed table.
sdata.show(n=20, truncate=True)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [16]:
# Build a VectorAssembler that packs the predictor columns into one vector column.

from pyspark.ml.feature import VectorAssembler

# Numeric columns used as model inputs; their order here is the order of the
# entries in the assembled vector.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

vector = VectorAssembler(
    inputCols=feature_columns,
    outputCol='Independent_feature',
)

In [17]:
# Append the assembled 'Independent_feature' vector column to the Spark DataFrame.
feature_data = vector.transform(sdata)

# Preview the first 20 rows, with long values truncated.
feature_data.show(n=20, truncate=True)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent| Independent_feature|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [18]:
# Keep only what the regression needs: the feature vector and the target column.
final_data = feature_data.select(['Independent_feature', 'Yearly Amount Spent'])

# Preview the first 20 rows, with long values truncated.
final_data.show(n=20, truncate=True)

+--------------------+-------------------+
| Independent_feature|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

In [19]:
## Applying Linear regression

from pyspark.ml.regression import LinearRegression

# Split into 75% training / 25% test rows. The seed is fixed so that the split
# (and therefore the fitted coefficients and the reported R^2) is the same on
# every Restart & Run All; without it each run draws a different random split.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

In [20]:
# Configure a linear-regression estimator on the assembled feature vector and
# fit it to the training split. `regressor` holds the fitted model, which the
# following cells inspect and evaluate.
regressor = LinearRegression(
    featuresCol='Independent_feature',
    labelCol='Yearly Amount Spent',
).fit(train_data)

In [21]:
# Display the fitted model's coefficients (one per entry of 'Independent_feature').
regressor.coefficients

DenseVector([25.6937, 38.89, -0.1386, 61.4857])

In [22]:
# Display the fitted model's intercept term.
regressor.intercept

-1031.2365426122726

In [23]:
# Evaluate the fitted model on the held-out test split. Despite its name,
# `pred` is the evaluation summary object: it exposes the per-row predictions
# as well as metrics such as r2 (read in a later cell).
pred = regressor.evaluate(test_data)

# Preview the first 20 rows of features, actual target and predicted value.
pred.predictions.show(n=20, truncate=True)

+--------------------+-------------------+------------------+
| Independent_feature|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[29.53242897,10.9...|        408.6403511|397.45743962972824|
|[30.73772037,12.6...|        461.7807422| 451.4081430044075|
|[32.1253869,11.73...|        457.8476959| 438.5052651477988|
|[32.17550124,13.3...|        588.7126055| 578.2006362057916|
|[32.18781205,14.7...|        452.3156755| 456.0170937909754|
|[32.33598964,13.0...|        486.8389348| 484.4571200118014|
|[32.33889932,12.0...|        407.7045475| 410.3884278572111|
|[32.73914294,12.3...|        549.9041461| 557.7862435801919|
|[32.89398062,11.5...|        547.2443434| 542.7106825061101|
|[33.02933195,11.7...|        423.1799917| 437.0999478409542|
|[33.07653561,9.60...|         507.212569|499.61500627112787|
|[33.4610563,10.86...|        447.6879065|459.70944811158574|
|[33.50308726,12.8...|        419.9387748|421.08379389319066|
|[33.541

In [24]:
# Display the R^2 score computed on the test split by the evaluation above.
pred.r2

0.9829553202760349