In [1]:
!pip install --ignore-install -q pyspark
!pip install --ignore-install -q findspark

In [3]:
import findspark
from pyspark.sql import SparkSession

findspark.init()

spark = SparkSession.builder.appName('lr_example').getOrCreate()
from pyspark.ml.regression import LinearRegression

data = spark.read.csv('Ecommerce_Customers.csv',inferSchema=True,header=True)
# Print the Schema of the dataframe
data.printSchema()
data.show()


root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|

In [4]:
data.head()
for item in data.head():
    print(item)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [5]:
# Setting up DataFrame for Machine Learning
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

data.columns

# We need to change the data into the form of two columns ("label","features")
assembler = VectorAssembler(
    inputCols=["Avg Session Length","Time on App", "Time on Website", "Length of Membership"],
    outputCol="features"
)

output = assembler.transform(data)
output.select("features").show()
output.show()


+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+--

In [6]:
# Final dataset with two columns ("features", "label")
final_data = output.select("features","Yearly Amount Spent")
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [7]:
# Split into training and testing datasets
train_data, test_data = final_data.randomSplit([0.7,0.3])
train_data.describe().show()
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                350|
|   mean| 497.92314959820334|
| stddev|  77.51238777933304|
|    min|   266.086340948469|
|    max|  744.2218671047146|
+-------+-------------------+

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                150|
|   mean|  502.5594451328276|
| stddev|  83.54631522152927|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [8]:
# Create a linear regression model object
lr = LinearRegression(labelCol='Yearly Amount Spent')

# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data)

# print the coefficients and intercept for linear regression
print("Coefficients: {} Intercept {}".format(lrModel.coefficients,lrModel.intercept))


Coefficients: [25.546796475699512,38.45860403623326,0.32783102307395096,61.23659318634983] Intercept -1037.1793157705858


In [11]:
test_result = lrModel.evaluate(test_data)
test_result.residuals.show()

# A lower RMSE indicates better model performance.
# In your case, an RMSE of 9.9396 means that, on average, the model's predictions are about 9.94 units away from the actual values.
print("RMSE:{}".format(test_result.rootMeanSquaredError))


# A lower MSE indicates better model performance.
# In your case, an MSE of 98.7957 is related to the RMSE, as MSE is the square of RMSE (approximately).
print("MSE: {}".format(test_result.meanSquaredError))
spark.stop()

+--------------------+
|           residuals|
+--------------------+
| -12.777786385365175|
|  10.228446894609647|
|   6.537123649710111|
|    9.64030781737415|
|  -5.889545246782177|
|  -9.373594781756026|
| -17.372122702984598|
| -14.046461282931887|
| -1.5102373711976043|
|  -11.11162756281783|
|   6.974907463639454|
|  -9.592309938727965|
| 0.28110377663188046|
| -2.6300781970840035|
| -1.5204235915674644|
| -17.273128895636376|
|  11.695728396111576|
|   11.16162594870417|
|-0.08143226841849582|
|  -9.123128973695657|
+--------------------+
only showing top 20 rows

RMSE:9.939602028329633
MSE: 98.79568848157456


To determine if your model is performing well, consider:
Comparing your model's performance to a baseline model (e.g., a simple average predictor).
Evaluating your model on multiple metrics (e.g., MAE, R-squared, etc.).
Considering the specific requirements and constraints of your problem.
Remember, model evaluation is context-dependent, and "good" performance varies depending on the specific use case.