In [65]:
import os
import findspark
findspark.init(os.getenv('SPARK_HOME'))
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [66]:
spark = SparkSession.builder.appName("lrex").getOrCreate()

In [67]:
train = spark.read.format("libsvm").load("sample_linear_regression_data.txt")
train.show()

22/08/12 15:09:10 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [68]:
lr = LinearRegression(featuresCol="features", labelCol="label", predictionCol="prediction")

In [69]:
lrModel = lr.fit(train)

22/08/12 15:09:11 WARN Instrumentation: [970256ad] regParam is zero, which might cause numerical instability and overfitting.


In [70]:
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [71]:
lrModel.intercept

0.14228558260358096

In [72]:
train_summary = lrModel.summary

In [73]:
train_summary.r2, train_summary.rootMeanSquaredError, train_summary.meanSquaredError, train_summary.meanAbsoluteError

(0.027839179518600154,
 10.16309157133015,
 103.28843028724194,
 8.145215527783876)

In [74]:
all_data = spark.read.format("libsvm").load("sample_linear_regression_data.txt")
train, test = all_data.randomSplit([0.85, 0.15])

22/08/12 15:09:13 WARN LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


In [75]:
train.describe().show()
test.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                426|
|   mean| 0.4988886547410205|
| stddev| 10.324168874923872|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                 75|
|   mean|-1.1176702266664653|
| stddev| 10.242079671544493|
|    min|-28.046018037776633|
|    max| 27.111027963108548|
+-------+-------------------+



In [76]:
correct_model = lr.fit(train)

22/08/12 15:09:14 WARN Instrumentation: [15dfa6ed] regParam is zero, which might cause numerical instability and overfitting.


In [77]:
test_results = correct_model.evaluate(test)

In [78]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-27.021328751219254|
| -22.15181474182574|
|-21.250237281801724|
|-15.953050696294376|
|-17.852037714360545|
|-18.855318225440328|
|-15.329101106684911|
| -16.56833152214717|
|-15.450383884022568|
|-14.002603052043648|
|-13.760579074178464|
| -14.92715487127733|
| -7.650983452579414|
| -10.70048686393485|
| -7.030742360491198|
| -8.808054481439154|
|-10.086263982601125|
| -6.609471570970441|
|  -9.21112230013844|
| -9.768025492946204|
+-------------------+
only showing top 20 rows





In [79]:
test_results.r2, test_results.rootMeanSquaredError, test_results.meanSquaredError, test_results.meanAbsoluteError

(0.006323623538848411,
 10.141352080843342,
 102.84702202762558,
 7.98643613917675)

In [82]:
unlabeled_data = test.select("features")

In [84]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [85]:
predictions = correct_model.transform(unlabeled_data)

In [86]:
predictions.show()

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...|-1.0246892865573785|
|(10,[0,1,2,3,4,5,...|  2.094332126036525|
|(10,[0,1,2,3,4,5,...|  1.377246243733316|
|(10,[0,1,2,3,4,5,...|-1.5411496605889676|
|(10,[0,1,2,3,4,5,...| 0.7866380884845297|
|(10,[0,1,2,3,4,5,...|  2.163111204129222|
|(10,[0,1,2,3,4,5,...|-0.8222482445922006|
|(10,[0,1,2,3,4,5,...| 0.6168189563525955|
|(10,[0,1,2,3,4,5,...| 0.0745261607102708|
|(10,[0,1,2,3,4,5,...|-1.3462681033356045|
|(10,[0,1,2,3,4,5,...| 1.2020032853222755|
|(10,[0,1,2,3,4,5,...| 3.0489873713104885|
|(10,[0,1,2,3,4,5,...| -2.390369660001042|
|(10,[0,1,2,3,4,5,...| 1.0527620092708005|
|(10,[0,1,2,3,4,5,...|-2.2682794936348984|
|(10,[0,1,2,3,4,5,...|-0.1840011595091155|
|(10,[0,1,2,3,4,5,...|  1.577600582046262|
|(10,[0,1,2,3,4,5,...|-0.8719337004847976|
|(10,[0,1,2,3,4,5,...| 1.8365013299912105|
|(10,[0,1,2,3,4,5,...| 2.6179263341334784|
+----------