In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.ml.regression import LinearRegression

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('ecom_app')
    .getOrCreate()
)

22/02/15 10:49:14 WARN Utils: Your hostname, ganesh-pi resolves to a loopback address: 127.0.1.1; using 192.168.1.119 instead (on interface eth0)
22/02/15 10:49:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/15 10:49:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the customer CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)
df.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



                                                                                

In [10]:
# Print every field of the first record on its own line to eyeball one sample.
first_row = df.head(1)[0]
for value in first_row:
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [11]:
#One of the first steps in the preprocessing is the vector assembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [12]:
# Pack the four numeric predictors into a single 'features' vector column.
# 'Yearly Amount Spent' is deliberately left out: it is the label the model predicts.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)

output = assembler.transform(df)
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
# Peek at the assembled vectors: each row holds the input column values,
# in the same order as they were listed in inputCols.
output.select('features').show(n=5, truncate=False)

+----------------------------------------------------------------------------+
|features                                                                    |
+----------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]  |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]  |
|[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424]|
|[34.30555662975554,13.717513665142507,36.72128267790313,3.120178782748092]  |
|[33.33067252364639,12.795188551078114,37.53665330059473,4.446308318351434]  |
+----------------------------------------------------------------------------+
only showing top 5 rows



In [15]:
# Keep only what the regressor needs: the feature vector and the label.
final_data = output.select(['features', 'Yearly Amount Spent'])
final_data.show(truncate=False)

+-----------------------------------------------------------------------------+-------------------+
|features                                                                     |Yearly Amount Spent|
+-----------------------------------------------------------------------------+-------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]   |587.9510539684005  |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]   |392.2049334443264  |
|[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424] |487.54750486747207 |
|[34.30555662975554,13.717513665142507,36.72128267790313,3.120178782748092]   |581.8523440352177  |
|[33.33067252364639,12.795188551078114,37.53665330059473,4.446308318351434]   |599.4060920457634  |
|[33.871037879341976,12.026925339755056,34.47687762925054,5.493507201364199]  |637.102447915074   |
|[32.02159550138701,11.366348309710526,36.68377615286961,4.685017246570912]   |521.5721747578274  |


In [16]:
# Train/test split: roughly 70% of the rows for fitting, 30% held out for evaluation.
# A fixed seed keeps the split -- and therefore the fitted model and every metric
# reported below -- repeatable across "Restart Kernel -> Run All"; without it
# each run samples different rows and the recorded outputs cannot be reproduced.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
# The summary below only lists the label: the vector-typed 'features' column is not summarised.
train_data.describe().show()

                                                                                

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                339|
|   mean| 499.31045821198313|
| stddev|  78.91493490021426|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



In [19]:
# Fit a linear regression of the label on the assembled vectors.
# 'features' is spelled out for readability; it is also the estimator's default.
# No regularisation is configured (see the regParam warning in the output).
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)
lr_model = lr.fit(dataset=train_data)

22/02/15 11:03:16 WARN Instrumentation: [ccff367b] regParam is zero, which might cause numerical instability and overfitting.
22/02/15 11:03:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/02/15 11:03:17 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/02/15 11:03:17 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [20]:
# Score the held-out split, then list the per-row residuals (first 20 rows).
test_results = lr_model.evaluate(dataset=test_data)
test_results.residuals.show(n=20)



+--------------------+
|           residuals|
+--------------------+
|  -4.901993767480633|
|  5.1535251639049875|
|  -7.389621343808187|
|  -8.684406321629467|
| -1.4957723895812478|
|  -4.707883212055776|
| -1.6883052194291395|
| -10.444731503972548|
|  -6.569694779126223|
|  -7.260808808522711|
|  2.6077204738078876|
| -4.9936552364245586|
|  -9.192026178810352|
| -2.7615391064959454|
| -12.606629496784535|
|  -4.509603166714442|
|  -3.368420849125812|
|   5.632629136383628|
|-0.36596679533954557|
|   7.686120039471064|
+--------------------+
only showing top 20 rows



In [21]:
# Root mean squared error of the fitted model on the held-out test split.
test_results.rootMeanSquaredError

9.298702845848846

In [22]:
# R^2 (coefficient of determination) of the fitted model on the held-out test split.
test_results.r2

0.9865394631203264

In [23]:
# Simulate scoring new, unlabelled customers: keep only the feature vectors
# and let the fitted model append a 'prediction' column.
unlablelled_data = test_data.select(['features'])

predictions = lr_model.transform(dataset=unlablelled_data)
predictions.show(n=20, truncate=False)

+-----------------------------------------------------------------------------+------------------+
|features                                                                     |prediction        |
+-----------------------------------------------------------------------------+------------------+
|[30.879484344127498,13.280432242922114,36.93615937845674,3.5851606351613436] |495.1085937523353 |
|[30.971675643887767,11.731364294077403,36.07455114016891,4.4263640805293125] |489.48508459298773|
|[31.061325156716126,12.357638107209013,36.16604163340145,4.089330841235956]  |494.9450794017098 |
|[31.128090049616628,13.278956228597714,37.38718052656558,4.626075291951958]  |565.9370930686841 |
|[31.260646869879523,13.266760352944493,36.971195097457155,2.267251114447051] |422.8224036465326 |
|[31.268104210750717,12.132509111641538,35.4567981489283,3.0720761414868827]  |428.1784163858797 |
|[31.389585480664397,10.994223919350974,38.074452419704535,3.4288599039280125]|411.75791627941203|
|[31.44744

In [24]:
# Isn't this so cool?