In [1]:
from pyspark.sql import SparkSession

# Create the SparkSession used by every later cell (getOrCreate reuses a
# session if one is already running in this kernel).
spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/03 11:48:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load the advertising dataset from CSV.
# - header: the first line of the file holds the column names.
# - inferSchema: let Spark detect column types instead of reading everything
#   as strings (the schema printed in the next cell shows all columns as double).
# The header option is set once here; repeating it in load() was redundant.
df = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load("data/Advertising.csv")
)

In [3]:
# Preview the first five rows (long cell values truncated), then list the
# column names and types.
df.show(n=5, truncate=True)
df.printSchema()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [4]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|               0.0|               0.3|               1.6|
|    max|            296.4|              49.6|             114.0|              27.0|
+-------+-----------------+------------------+------------------+------------------+



## Convert the data to dense vectors (features and label)

In [5]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Reshape every row into [label, features]: the last column (Sales) becomes
# the label and all preceding columns are packed into one dense vector.
transformed = (
    df.rdd
    .map(lambda row: [row[-1], Vectors.dense(row[:-1])])
    .toDF(['label', 'features'])
)
transformed.show(n=5)


                                                                                

+-----+-----------------+
|label|         features|
+-----+-----------------+
| 22.1|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|
+-----+-----------------+
only showing top 5 rows



[Stage 7:>                                                          (0 + 1) / 1]                                                                                

You will find that all of the machine learning algorithms in Spark work on the features and label. That is to say, once the features and label are ready, you can try any of the machine learning algorithms in Spark.

Deal with categorical variables

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Detect categorical features automatically and index them.
# With maxCategories=4, any feature that has more than 4 distinct values
# is treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(transformed)

# Append the indexedFeatures column produced by the fitted indexer.
data = featureIndexer.transform(transformed)

                                                                                

In [7]:
# Preview the first five rows, now including the indexedFeatures column.
data.show(n=5)

+-----+-----------------+-----------------+
|label|         features|  indexedFeatures|
+-----+-----------------+-----------------+
| 22.1|[230.1,37.8,69.2]|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|[180.8,10.8,58.4]|
+-----+-----------------+-----------------+
only showing top 5 rows



Fit Decision Tree Regression Model

In [8]:
from pyspark.ml.regression import DecisionTreeRegressor

# Configure the decision-tree regressor to read the indexed feature vectors.
# Nothing is trained here; fitting happens when the pipeline is fitted.
dt = DecisionTreeRegressor(
    featuresCol="indexedFeatures",
)

In [9]:
# Split the rows into training and test sets with weights 0.8 / 0.2;
# the fixed seed keeps the split repeatable between runs.
trainingData, testData = data.randomSplit(weights=[0.8, 0.2], seed=1234)
trainingData.show(n=5)


[Stage 11:>                                                         (0 + 1) / 1]

+-----+----------------+----------------+
|label|        features| indexedFeatures|
+-----+----------------+----------------+
|  1.6|  [0.7,39.6,8.7]|  [0.7,39.6,8.7]|
|  4.8|   [8.6,2.1,1.0]|   [8.6,2.1,1.0]|
|  5.3|  [5.4,29.9,9.4]|  [5.4,29.9,9.4]|
|  5.5| [7.3,28.1,41.4]| [7.3,28.1,41.4]|
|  5.6|[13.2,15.9,49.6]|[13.2,15.9,49.6]|
+-----+----------------+----------------+
only showing top 5 rows



                                                                                

Pipeline Architecture

In [10]:
# Chain the feature indexer and the decision tree into one Pipeline.
# NOTE(review): trainingData was split from `data`, which already carries an
# indexedFeatures column, so the indexer stage recomputes that column here —
# confirm whether splitting `transformed` instead was intended.
pipeline = Pipeline(
    stages=[
        featureIndexer,  # the indexer already fitted on `transformed`
        dt,              # decision tree, trained by pipeline.fit below
    ]
)

# Fit the pipeline on the training split only.
model = pipeline.fit(trainingData)

Make predictions

In [21]:
# Apply the fitted pipeline to the held-out test split; the result gains a
# `prediction` column next to the existing label and feature columns.
predictions = model.transform(testData)

In [23]:
# Preview the first five scored test rows.
predictions.show(n=5)

+-----+----------------+----------------+-----------------+
|label|        features| indexedFeatures|       prediction|
+-----+----------------+----------------+-----------------+
|  3.2|  [4.1,11.6,5.7]|  [4.1,11.6,5.7]|              5.7|
|  5.3| [13.1,0.4,25.6]| [13.1,0.4,25.6]|5.466666666666666|
|  8.1| [53.5,2.0,21.4]| [53.5,2.0,21.4]|              8.4|
|  8.6| [66.1,5.8,24.2]| [66.1,5.8,24.2]|             10.1|
|  8.7|[16.9,43.7,89.4]|[16.9,43.7,89.4]|             8.96|
+-----+----------------+----------------+-----------------+
only showing top 5 rows



[Stage 46:>                                                         (0 + 1) / 1]                                                                                

In [24]:
# Show the feature vector next to the true label and the predicted value
# for the first five test rows.
(
    predictions
    .select("features", "label", "prediction")
    .show(n=5)
)

+----------------+-----+-----------------+
|        features|label|       prediction|
+----------------+-----+-----------------+
|  [4.1,11.6,5.7]|  3.2|              5.7|
| [13.1,0.4,25.6]|  5.3|5.466666666666666|
| [53.5,2.0,21.4]|  8.1|              8.4|
| [66.1,5.8,24.2]|  8.6|             10.1|
|[16.9,43.7,89.4]|  8.7|             8.96|
+----------------+-----+-----------------+
only showing top 5 rows



Evaluation

In [25]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the `prediction` column with the true `label` column and report
# the test error as root mean squared error (RMSE).
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


Root Mean Squared Error (RMSE) on test data = 1.24168


[Stage 48:>                                                         (0 + 1) / 1]                                                                                

In [26]:
import sklearn.metrics

# Collect label and prediction with a single toPandas() call so both columns
# come from the same Spark job and are guaranteed to stay aligned row by row
# (two separate collects run the job twice and do not guarantee row order).
results_pdf = predictions.select("label", "prediction").toPandas()

# Keep y_true / y_pred as one-column pandas DataFrames, as before.
y_true = results_pdf[["label"]]
y_pred = results_pdf[["prediction"]]

# Coefficient of determination (R^2) on the test split, via scikit-learn.
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))


r2_score: 0.9454161819062926
