In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('linear-regression').getOrCreate()

In [3]:
data = spark.read.format('libsvm').load('../data/sample_linear_regression_data.txt')

In [4]:
train_data, test_data = data.randomSplit([.7, .3], seed=42)

In [5]:
_ = train_data.describe().show(), test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                375|
|   mean|  0.724113021004657|
| stddev|  9.982212130320981|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                126|
|   mean|-1.1336593885480712|
| stddev| 11.184561422877946|
|    min|-26.805483428483072|
|    max|  23.52945433069272|
+-------+-------------------+



In [6]:
train_data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-28.046018037776633|(10,[0,1,2,3,4,5,...|
|-26.736207182601724|(10,[0,1,2,3,4,5,...|
| -23.51088409032297|(10,[0,1,2,3,4,5,...|
|-23.487440120936512|(10,[0,1,2,3,4,5,...|
|-22.837460416919342|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-19.884560774273424|(10,[0,1,2,3,4,5,...|
|-19.872991038068406|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|-18.845922472898582|(10,[0,1,2,3,4,5,...|
| -18.27521356600463|(10,[0,1,2,3,4,5,...|
|-17.494200356883344|(10,[0,1,2,3,4,5,...|
| -17.32672073267595|(10,[0,1,2,3,4,5,...|
| -16.71909683360509|(10,[0,1,2,3,4,5,...|
|-16.692207021311106|(10,[0,1,2,3,4,5,...|
| -16.26143027545273|(10,[0,1,2,3,4,5,...|
| -15.86200932757056|(10,[0,1,2,3,4,5,...|
|-15.732088272239245|(10,[0,1,2,3,4,5,...|
|-15.375857723312297|(10,[0,1,2,3,4,5,...|
+----------

In [None]:
from pyspark.ml.regression import LinearRegression

In [7]:
lr = LinearRegression(
    featuresCol='features', labelCol='label', predictionCol='prediction'
)


In [8]:
lr_model = lr.fit(train_data)

In [9]:
print(f'Model coefficients: {lr_model.coefficients}')
print(f'Model intercept: {lr_model.intercept}')

Model coefficients: [0.5918059154882541,1.5074485921726686,-2.092983672586816,3.2111914129063464,0.8256578885505846,1.8989010267402955,-0.06002242466251009,-0.922117578233245,-0.5657647948846599,1.147004045732437]
Model intercept: 0.5292901287039765


In [10]:
training_summary = lr_model.summary

In [11]:
print(f'Training iterations: {training_summary.totalIterations}')
print(f'Training objectiveHistory: {training_summary.objectiveHistory}')


Training iterations: 0
Training objectiveHistory: [0.0]


In [12]:
test_results = lr_model.evaluate(test_data)

In [13]:
test_results.residuals.show()
print(f'RMSE: {test_results.rootMeanSquaredError}')
print(f'R2: {test_results.r2}')

+-------------------+
|          residuals|
+-------------------+
|-28.305902730922302|
|-29.490547492772325|
| -22.86936529151847|
|-21.527682553818114|
|-19.687660427789638|
| -19.79380269286442|
|-18.994876037916928|
|-16.420875732937652|
|  -20.1251816195632|
|-19.488242333300025|
|-16.690287207468383|
|-17.732540358670345|
| -15.17297252570881|
|-13.517777209767612|
|-20.504549034361794|
| -17.52072428950006|
| -15.06463157411349|
|-18.728361362879223|
|-15.995976443402697|
| -15.16368146542394|
+-------------------+
only showing top 20 rows

RMSE: 11.929738804585622
R2: -0.14679155085585793


In [14]:
unlabeled_data = test_data.select('features')
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [15]:
predictions = lr_model.transform(unlabeled_data)
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|   1.500419302439231|
|(10,[0,1,2,3,4,5,...|   6.540721556576252|
|(10,[0,1,2,3,4,5,...|  1.4369775273526635|
|(10,[0,1,2,3,4,5,...|  1.3156052948594428|
|(10,[0,1,2,3,4,5,...|-0.09510236182489817|
|(10,[0,1,2,3,4,5,...| 0.12648407749270263|
|(10,[0,1,2,3,4,5,...|-0.40745999229762575|
|(10,[0,1,2,3,4,5,...| -1.3827504557268635|
|(10,[0,1,2,3,4,5,...|  2.6965070486236957|
|(10,[0,1,2,3,4,5,...|    2.42284270742401|
|(10,[0,1,2,3,4,5,...|-0.33620505674116263|
|(10,[0,1,2,3,4,5,...|  1.5811910073932323|
|(10,[0,1,2,3,4,5,...| -0.9126865153126812|
|(10,[0,1,2,3,4,5,...| -2.4337353560269603|
|(10,[0,1,2,3,4,5,...|  4.7238640017384945|
|(10,[0,1,2,3,4,5,...|  1.7972086764514907|
|(10,[0,1,2,3,4,5,...| -0.3727532193177282|
|(10,[0,1,2,3,4,5,...|   3.393593882956883|
|(10,[0,1,2,3,4,5,...|   1.173823533651508|
|(10,[0,1,2,3,4,5,...| 0.4009232

In [16]:
e_customers = spark.read.csv('../data/Ecommerce_Customers.csv', inferSchema=True, header=True)

In [17]:
e_customers.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [20]:
e_customers.describe().show()

+-------+-----------------+--------------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|summary|            Email|             Address|     Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+-------+-----------------+--------------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|  count|              500|                 500|        500|               500|               500|               500|                 500|                500|
|   mean|             null|                null|       null| 33.05319351819619|12.052487937166134| 37.06044542094859|   3.533461555915055|  499.3140382585909|
| stddev|             null|                null|       null|0.9925631110845354|0.9942156084725424|1.0104889067564033|  0.9992775024112585|   79.3147815497068|
|    min|aaron04@yahoo.com|0001 Mack MillNor..

In [18]:
e_customers.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [21]:
from pyspark.ml.feature import VectorAssembler

In [22]:
e_customers.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [23]:
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
        'Yearly Amount Spent'
    ], outputCol='features'
)

In [25]:
output = assembler.transform(e_customers)
output.printSchema()
output.show(5)

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Viol

In [26]:
data = output.select('features', 'Yearly Amount Spent')
data.show(5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



In [40]:
train_data, test_data = data.randomSplit([.7, .3], seed=42)

In [41]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [42]:
lr_model = lr.fit(train_data)

In [43]:
test_results = lr_model.evaluate(test_data)
print(f'RMSE: {test_results.rootMeanSquaredError}')
print(f'R2: {test_results.r2}')

RMSE: 3.759048757435703e-12
R2: 1.0
