#  linear regression documentation example

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('linearreg').getOrCreate()

In [5]:
from pyspark.ml.regression import LinearRegression

In [9]:
#load training data which is in libsvm format
trainingdata=spark.read.format('libsvm').load('linear_reg.txt')


In [11]:
trainingdata.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [13]:
#create instance of our model
#parameters are taken from dataframe
lr=LinearRegression(featuresCol='features',labelCol='label',predictionCol='prediction')

In [14]:
#training the model
lrModel=lr.fit(trainingdata)

In [15]:
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [16]:
lrModel.intercept

0.14228558260358093

In [17]:
training_summary=lrModel.summary

In [18]:
training_summary.r2

0.027839179518600154

In [19]:
training_summary.rootMeanSquaredError

10.16309157133015

In [20]:
 #train and test data split
all_data=spark.read.format('libsvm').load('linear_reg.txt')


In [25]:
#split dataframe into two dataframes randomly
#first one has 70% and 30% is in next
#below syntax is called tuple unpcking
train_data,test_data=all_data.randomSplit([0.7,0.3])

In [32]:
train_data

DataFrame[label: double, features: vector]

In [27]:
test_data

DataFrame[label: double, features: vector]

In [33]:
train_data.describe().show()
test_data.describe().show()

+-------+--------------------+
|summary|               label|
+-------+--------------------+
|  count|                 368|
|   mean|-0.15255365144501226|
| stddev|   10.60547469600228|
|    min| -28.571478869743427|
|    max|   27.78383192005107|
+-------+--------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                133|
|   mean| 1.3897822830936388|
| stddev|  9.422027665735516|
|    min|-26.805483428483072|
|    max|  21.57719950299147|
+-------+-------------------+



In [34]:
correct_model=lr.fit(train_data)

In [35]:
test_results=correct_model.evaluate(test_data)

In [36]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  -28.0351359311489|
|-21.809636371011994|
| -22.94460689514251|
| -15.71438906173832|
| -16.46688944079381|
|-17.119280191358346|
|-17.054320433642093|
|-14.090518355694984|
|  -11.6379968246146|
|-11.150176901464677|
| -7.566570965467918|
|-10.124612469339102|
|-13.360173261469543|
| -6.787453707737729|
|-12.213877803161719|
| -9.538474069320621|
| -7.633908748995232|
| -8.403431293698436|
|-10.347696729391814|
| -8.856116765612189|
+-------------------+
only showing top 20 rows



In [37]:
test_results.rootMeanSquaredError

9.552687168650573

In [38]:
#using evaluate on test data,we are comparing predictions on labels that are already assigned in test data

In [39]:
#deploying model on unlabeld data
unlabeled_data= test_data.select('features')

In [40]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [41]:
predictions=correct_model.transform(unlabeled_data)

In [42]:
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|  1.229652502665829|
|(10,[0,1,2,3,4,5,...|-1.7012477193109778|
|(10,[0,1,2,3,4,5,...|  2.732529636183837|
|(10,[0,1,2,3,4,5,...|-1.6123316709376287|
|(10,[0,1,2,3,4,5,...|-0.5596028234157371|
|(10,[0,1,2,3,4,5,...| 0.8578499159056137|
|(10,[0,1,2,3,4,5,...| 1.3222321614028472|
|(10,[0,1,2,3,4,5,...| 1.5319425668387947|
|(10,[0,1,2,3,4,5,...|-0.8412833868368981|
|(10,[0,1,2,3,4,5,...|-1.3174794795681823|
|(10,[0,1,2,3,4,5,...| -4.844375437372243|
|(10,[0,1,2,3,4,5,...| -2.005740742948827|
|(10,[0,1,2,3,4,5,...| 1.4820057615027018|
|(10,[0,1,2,3,4,5,...| -4.853095970151097|
|(10,[0,1,2,3,4,5,...|  0.598102538146092|
|(10,[0,1,2,3,4,5,...| -1.500873738933207|
|(10,[0,1,2,3,4,5,...|-2.6617571870346874|
|(10,[0,1,2,3,4,5,...|-1.8902827469574892|
|(10,[0,1,2,3,4,5,...| 0.5960741216067309|
|(10,[0,1,2,3,4,5,...|-0.5530809540084042|
+----------

In [43]:
#our model predicted the labels

# --------------------------------
# Linear regression custom example

In [44]:
from pyspark.sql import SparkSession

In [46]:
spark=SparkSession.builder.appName('customlinearreg').getOrCreate()

In [47]:
from pyspark.ml.regression import LinearRegression

In [48]:
data=spark.read.csv('linear_reg.csv',inferSchema=True,header=True)

In [49]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [51]:
data.head(1)[0].asDict()

{'Email': 'mstephenson@fernandez.com',
 'Address': '835 Frank TunnelWrightmouth, MI 82180-9605',
 'Avatar': 'Violet',
 'Avg Session Length': 34.49726772511229,
 'Time on App': 12.65565114916675,
 'Time on Website': 39.57766801952616,
 'Length of Membership': 4.0826206329529615,
 'Yearly Amount Spent': 587.9510539684005}

In [53]:
#setup data frame for ML
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [55]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [56]:
#you take list of input column which are a list of columns.
#you have a output column which is a feature ,can be named anything.
#VectorAssembler grabs all the columns and turns them into a single vector.
assembler=VectorAssembler(inputCols=[
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership'],
 outputCol='features')

In [57]:
output=assembler.transform(data)

In [59]:
#has new features column
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [61]:
output.head(1)[0].asDict()

{'Email': 'mstephenson@fernandez.com',
 'Address': '835 Frank TunnelWrightmouth, MI 82180-9605',
 'Avatar': 'Violet',
 'Avg Session Length': 34.49726772511229,
 'Time on App': 12.65565114916675,
 'Time on Website': 39.57766801952616,
 'Length of Membership': 4.0826206329529615,
 'Yearly Amount Spent': 587.9510539684005,
 'features': DenseVector([34.4973, 12.6557, 39.5777, 4.0826])}

In [62]:
final_data=output.select('features','Yearly Amount Spent')

In [63]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [64]:
train_data,test_data=final_data.randomSplit([0.7,0.3])

In [66]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                361|
|   mean|  495.2924043802644|
| stddev|  79.56617520569341|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [67]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                139|
|   mean|  509.7587132951081|
| stddev|  77.97457207266814|
|    min|  282.4712457199145|
|    max|  700.9170916173961|
+-------+-------------------+



In [68]:
lr=LinearRegression(labelCol='Yearly Amount Spent')

In [69]:
lr_model=lr.fit(train_data)

In [70]:
test_results=lr_model.evaluate(test_data)

In [73]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 10.458298461197046|
|-11.533451641452189|
| -5.060727241152392|
| 10.634169803048394|
| -7.161117224733744|
| 10.737596923290312|
|  0.915556018097277|
|  3.387865313181635|
| -8.273959567596137|
|-14.334598829151673|
| 18.222842356029105|
|  -3.91437556018883|
| -26.31096898139765|
|  1.715439902201581|
| 7.5506590936724365|
|-1.4910120774760571|
| -9.882499117263364|
| -8.758652655878848|
|-1.8309523886602506|
| 12.122981198921764|
+-------------------+
only showing top 20 rows



In [75]:
test_results.rootMeanSquaredError

9.998543955609648

In [76]:
test_results.r2

0.983438364040229

In [77]:
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [78]:
unlabeled=test_data.select('features')

In [79]:
unlabeled.show()

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.3931845423455...|
|[30.4925366965402...|
|[30.7377203726281...|
|[31.1280900496166...|
|[31.1695067987115...|
|[31.3895854806643...|
|[31.4459724827577...|
|[31.5261978982398...|
|[31.5741380228732...|
|[31.6005122003032...|
|[31.6253601348306...|
|[31.6739155032749...|
|[31.7366356860502...|
|[31.8209982016720...|
|[31.8627411090001...|
|[31.8648325480987...|
|[31.8854062999117...|
|[31.9120759292006...|
|[31.9262720263601...|
+--------------------+
only showing top 20 rows



In [83]:
predictions=lr_model.transform(unlabeled_data)

In [89]:
predictions.show()

Py4JJavaError: An error occurred while calling o548.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 59.0 failed 1 times, most recent failure: Lost task 0.0 in stage 59.0 (TID 58, localhost, executor driver): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes: x.size = 10, y.size = 4
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.linalg.BLAS$.dot(BLAS.scala:104)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:708)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:645)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:215)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:214)
	... 22 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor87.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$1: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.IllegalArgumentException: requirement failed: BLAS.dot(x: Vector, y:Vector) was given Vectors with non-matching sizes: x.size = 10, y.size = 4
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.linalg.BLAS$.dot(BLAS.scala:104)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:708)
	at org.apache.spark.ml.regression.LinearRegressionModel.predict(LinearRegression.scala:645)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:215)
	at org.apache.spark.ml.PredictionModel$$anonfun$1.apply(Predictor.scala:214)
	... 22 more
