# Spark Machine Learning

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [2]:
spark = SparkSession.builder.appName('lrex').getOrCreate()

In [3]:
CPATH = "/home/bm/spark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/"

### Data

In [54]:
all_data = spark.read.format('libsvm').load(os.path.join(CPATH,'Linear_Regression/sample_linear_regression_data.txt'))
ec = spark.read.csv(os.path.join(CPATH,'Linear_Regression/Ecommerce_Customers.csv'),inferSchema=True,header=True)
csi = spark.read.csv(os.path.join(CPATH,'Linear_Regression/cruise_ship_info.csv'),inferSchema=True,header=True)

### Linear Regression

In [5]:
training = all_data
training.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [6]:
lr = LinearRegression(featuresCol='features',labelCol='label',predictionCol='prediction')

In [7]:
lrModel = lr.fit(training)

In [8]:
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [9]:
lrModel.intercept

0.14228558260358093

In [10]:
training_summary = lrModel.summary

In [11]:
training_summary.r2

0.027839179518600154

In [12]:
training_summary.rootMeanSquaredError

10.16309157133015

In [13]:
train_data, test_data = all_data.randomSplit([0.7,0.3])

In [14]:
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                332|
|   mean|-0.3959838480376637|
| stddev|   9.79394967602426|
|    min|-28.571478869743427|
|    max| 24.290551295953957|
+-------+-------------------+



In [15]:
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                169|
|   mean| 1.5394552512910975|
| stddev| 11.196261004757154|
|    min|-28.046018037776633|
|    max|  27.78383192005107|
+-------+-------------------+



In [16]:
correct_model = lr.fit(train_data)

In [17]:
test_results = correct_model.evaluate(test_data)

In [18]:
test_results.rootMeanSquaredError

11.406683248194838

In [19]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-26.881082286102746|
|-25.046669375790852|
| -22.42480334943294|
| -22.70123569706494|
| -20.26014374753564|
| -22.17641770252379|
|-18.508255303203832|
|  -17.7547368414282|
| -16.76988031327234|
|-14.749327023147748|
| -18.23494993281964|
| -14.94870358459412|
|-13.578047164720573|
|-12.376791432724321|
|-13.124109378750571|
|-15.328469392448605|
|-15.998521322849411|
|-11.758314737361154|
| -7.731053534575116|
| -10.58786611524042|
+-------------------+
only showing top 20 rows



In [20]:
unlabeled_data = test_data.select('features')

In [21]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [22]:
predictions = correct_model.transform(unlabeled_data)

In [23]:
predictions.show()

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -1.1649357516738874|
|(10,[0,1,2,3,4,5,...| -1.7588140526922182|
|(10,[0,1,2,3,4,5,...|  -4.311403833168786|
|(10,[0,1,2,3,4,5,...| -0.8096483932580283|
|(10,[0,1,2,3,4,5,...|  -2.577316669383703|
|(10,[0,1,2,3,4,5,...|  0.7440299383579823|
|(10,[0,1,2,3,4,5,...| -1.3647357348645723|
|(10,[0,1,2,3,4,5,...|  0.3260622704886962|
|(10,[0,1,2,3,4,5,...|-0.29551931260367836|
|(10,[0,1,2,3,4,5,...| -1.1126823044228125|
|(10,[0,1,2,3,4,5,...|  2.5028616605803924|
|(10,[0,1,2,3,4,5,...|-0.42715413871817653|
|(10,[0,1,2,3,4,5,...|  -1.732933424695715|
|(10,[0,1,2,3,4,5,...| -1.5993394984283817|
|(10,[0,1,2,3,4,5,...|  0.0841813146459573|
|(10,[0,1,2,3,4,5,...|  2.3506206670565017|
|(10,[0,1,2,3,4,5,...|   3.076298219478991|
|(10,[0,1,2,3,4,5,...| -0.7093416436717068|
|(10,[0,1,2,3,4,5,...| -4.6798928682650445|
|(10,[0,1,2,3,4,5,...| -1.542487

### ECommerce Data

In [24]:
ec.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [25]:
ec.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [26]:
ec.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)]

In [27]:
for item in ec.head(1)[0]:
    print(item)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [28]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [29]:
ec.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [32]:
assembler = VectorAssembler(inputCols=['Avg Session Length','Time on App',
                                       'Time on Website', 'Length of Membership',
                                       'Yearly Amount Spent'],
                            outputCol='features')

In [33]:
output = assembler.transform(ec)

In [34]:
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [35]:
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826, 587.9511]))]

In [36]:
final_data = output.select('features','Yearly Amount Spent')

In [37]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [38]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [39]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                353|
|   mean| 496.76758100679666|
| stddev|  75.53785092011447|
|    min| 256.67058229005585|
|    max|  725.5848140556806|
+-------+-------------------+



In [40]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                147|
|   mean| 505.42900023058644|
| stddev|  87.69656397120761|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [41]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [42]:
lr_model = lr.fit(train_data)

In [43]:
test_results = lr_model.evaluate(test_data)

In [44]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|2.046363078989088...|
|-2.50111042987555...|
|-1.13686837721616...|
|-3.69482222595252...|
|-8.52651282912120...|
|5.684341886080801...|
|6.821210263296962...|
|3.979039320256561...|
|-1.64845914696343...|
|-3.41060513164848...|
|3.410605131648481...|
|1.193711796076968...|
|-2.84217094304040...|
|-5.40012479177676...|
|-1.47792889038100...|
|-5.11590769747272...|
|-1.30739863379858...|
|-1.98951966012828...|
|-3.52429196937009...|
|-1.02318153949454...|
+--------------------+
only showing top 20 rows



In [45]:
test_results.rootMeanSquaredError

2.0651281750740257e-12

In [46]:
test_results.r2

1.0

In [47]:
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [48]:
unlabeled_data = test_data.select('features')

In [49]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.3931845423455...|
|[30.4925366965402...|
|[30.8162006488763...|
|[30.8364326747734...|
|[31.3091926408918...|
|[31.3662121671876...|
|[31.4459724827577...|
|[31.4474464941278...|
|[31.5761319713222...|
|[31.6098395733896...|
|[31.6548096756927...|
|[31.6610498227460...|
|[31.6739155032749...|
|[31.7207699002873...|
|[31.7216523605090...|
|[31.7242025238451...|
|[31.8279790554652...|
|[31.9048571310136...|
|[31.9453957483445...|
+--------------------+
only showing top 20 rows



In [50]:
predictions = lr_model.transform(unlabeled_data)

In [51]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|408.64035107262544|
|[30.3931845423455...| 319.9288698031961|
|[30.4925366965402...|282.47124571991566|
|[30.8162006488763...| 266.0863409484727|
|[30.8364326747734...|467.50190042699046|
|[31.3091926408918...|432.72071783993306|
|[31.3662121671876...|430.58888255648424|
|[31.4459724827577...|484.87696493512817|
|[31.4474464941278...|418.60274209522566|
|[31.5761319713222...| 541.2265839893287|
|[31.6098395733896...|444.54554965110475|
|[31.6548096756927...| 475.2634237275473|
|[31.6610498227460...| 416.3583535799011|
|[31.6739155032749...| 475.7250679098866|
|[31.7207699002873...| 538.7749334780244|
|[31.7216523605090...|347.77692663187315|
|[31.7242025238451...| 503.3878872879618|
|[31.8279790554652...| 440.0027475469435|
|[31.9048571310136...|473.94985742281966|
|[31.9453957483445...|  657.019923937653|
+--------------------+------------

### Consulting Project

In [55]:
csi.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|      NaN|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   