In [1]:
from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [12]:
# Create a SparkSession (Note, the config section is only for Windows!)
spark = SparkSession.builder.appName("LinearRegression").getOrCreate()

# Load up our data and convert it to the format MLLib expects.
inputLines = spark.sparkContext.textFile("regression.txt")
data = inputLines.map(lambda x: x.split(",")).map(lambda x: (float(x[0]), Vectors.dense(float(x[1]))))

In [15]:
data.collect()

[(-1.74, DenseVector([1.66])),
 (1.24, DenseVector([-1.18])),
 (0.29, DenseVector([-0.4])),
 (-0.13, DenseVector([0.09])),
 (-0.39, DenseVector([0.38])),
 (-1.79, DenseVector([1.73])),
 (0.71, DenseVector([-0.77])),
 (1.39, DenseVector([-1.48])),
 (1.15, DenseVector([-1.43])),
 (0.13, DenseVector([-0.07])),
 (0.05, DenseVector([-0.07])),
 (1.9, DenseVector([-1.8])),
 (1.48, DenseVector([-1.42])),
 (0.32, DenseVector([-0.3])),
 (-1.11, DenseVector([1.0])),
 (0.51, DenseVector([-0.62])),
 (-1.58, DenseVector([1.45])),
 (-0.46, DenseVector([0.44])),
 (-0.49, DenseVector([0.37])),
 (0.31, DenseVector([-0.3])),
 (0.85, DenseVector([-0.77])),
 (0.61, DenseVector([-0.69])),
 (-0.66, DenseVector([0.65])),
 (0.99, DenseVector([-0.94])),
 (-0.73, DenseVector([0.62])),
 (0.43, DenseVector([-0.42])),
 (-0.5, DenseVector([0.35])),
 (-1.77, DenseVector([1.66])),
 (-0.65, DenseVector([0.88])),
 (-2.29, DenseVector([2.35])),
 (0.74, DenseVector([-0.66])),
 (0.44, DenseVector([-0.44])),
 (-0.41, DenseV

In [14]:
# Convert this RDD to a DataFrame
colNames = ["label", "features"]
df = data.toDF(colNames)
df.show()

+-----+--------+
|label|features|
+-----+--------+
|-1.74|  [1.66]|
| 1.24| [-1.18]|
| 0.29|  [-0.4]|
|-0.13|  [0.09]|
|-0.39|  [0.38]|
|-1.79|  [1.73]|
| 0.71| [-0.77]|
| 1.39| [-1.48]|
| 1.15| [-1.43]|
| 0.13| [-0.07]|
| 0.05| [-0.07]|
|  1.9|  [-1.8]|
| 1.48| [-1.42]|
| 0.32|  [-0.3]|
|-1.11|   [1.0]|
| 0.51| [-0.62]|
|-1.58|  [1.45]|
|-0.46|  [0.44]|
|-0.49|  [0.37]|
| 0.31|  [-0.3]|
+-----+--------+
only showing top 20 rows



In [16]:
trainTest = df.randomSplit([0.5, 0.5])
trainingDF = trainTest[0]
testDF = trainTest[1]

In [17]:
# Now create our linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

In [19]:
# Now see if we can predict values in our test data.
# Generate predictions using our linear regression model for all features in our
# test dataframe:
fullPredictions = model.transform(testDF).cache()

In [20]:
# Extract the predictions and the "known" correct labels.
predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
labels = fullPredictions.select("label").rdd.map(lambda x: x[0])

In [21]:
# Zip them together
predictionAndLabel = predictions.zip(labels).collect()

In [22]:
# Print out the predicted and actual values for each point
for prediction in predictionAndLabel:
  print(prediction)

(-2.65901584070991, -3.74)
(-1.82125458917774, -2.58)
(-1.594065097236813, -2.26)
(-1.551467067497889, -2.17)
(-1.3455765904264236, -2.12)
(-1.3952742917885015, -2.09)
(-1.4449719931505796, -2.07)
(-1.3739752769190396, -1.94)
(-1.4023739634116557, -1.94)
(-1.295878889064346, -1.91)
(-1.4023739634116557, -1.87)
(-1.3029785606875, -1.8)
(-1.224882172832806, -1.79)
(-1.1751844714707282, -1.77)
(-1.19648348634019, -1.75)
(-1.1609851282244203, -1.66)
(-1.3029785606875, -1.64)
(-1.2177825012096521, -1.61)
(-1.1467857849781125, -1.59)
(-1.1680847998475743, -1.58)
(-1.1041877552391886, -1.57)
(-0.9763936660224167, -1.48)
(-1.0331910390076486, -1.47)
(-0.9976926808918786, -1.46)
(-0.9337956362834929, -1.4)
(-0.9337956362834929, -1.39)
(-0.8840979349214151, -1.37)
(-0.9976926808918786, -1.36)
(-0.8060015470667212, -1.3)
(-0.8344002335593372, -1.3)
(-0.8060015470667212, -1.29)
(-1.0402907106308026, -1.29)
(-0.8273005619361832, -1.26)
(-0.8485995768056451, -1.22)
(-0.8485995768056451, -1.2)
(-0.88

In [24]:
fullPredictions.collect()

[Row(label=-3.74, features=DenseVector([3.75]), prediction=-2.65901584070991),
 Row(label=-2.58, features=DenseVector([2.57]), prediction=-1.82125458917774),
 Row(label=-2.26, features=DenseVector([2.25]), prediction=-1.594065097236813),
 Row(label=-2.17, features=DenseVector([2.19]), prediction=-1.551467067497889),
 Row(label=-2.12, features=DenseVector([1.9]), prediction=-1.3455765904264236),
 Row(label=-2.09, features=DenseVector([1.97]), prediction=-1.3952742917885015),
 Row(label=-2.07, features=DenseVector([2.04]), prediction=-1.4449719931505796),
 Row(label=-1.94, features=DenseVector([1.94]), prediction=-1.3739752769190396),
 Row(label=-1.94, features=DenseVector([1.98]), prediction=-1.4023739634116557),
 Row(label=-1.91, features=DenseVector([1.83]), prediction=-1.295878889064346),
 Row(label=-1.87, features=DenseVector([1.98]), prediction=-1.4023739634116557),
 Row(label=-1.8, features=DenseVector([1.84]), prediction=-1.3029785606875),
 Row(label=-1.79, features=DenseVector([