In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark regression example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [44]:
# load data
df = spark.read.format('com.databricks.spark.csv').\
                       options(header='true', \
                       inferschema='true').\
            load("Advertising.csv",header=True).drop('_c0')
df.show(5,True)
df.printSchema()
df.describe().show()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows

root
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-----------------+------------------+------------------+------------------+
|summary|               TV|             Radio|         Newspaper|             Sales|
+-------+-----------------+------------------+------------------+------------------+
|  count|              200|               200|               200|               200|
|   mean|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|              0.7|             

In [45]:
df.select("TV").show(5)

+-----+
|   TV|
+-----+
|230.1|
| 44.5|
| 17.2|
|151.5|
|180.8|
+-----+
only showing top 5 rows



In [6]:
def get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol,dropLast=False):

    '''
    Get dummy variables and concat with continuous variables for ml modeling.
    :param df: the dataframe
    :param categoricalCols: the name list of the categorical data
    :param continuousCols:  the name list of the numerical data
    :param labelCol:  the name of label column
    :param dropLast:  the flag of drop last column
    :return: feature matrix

    :author: Wenqiang Feng
    :email:  von198@gmail.com

    >>> df = spark.createDataFrame([
                  (0, "a"),
                  (1, "b"),
                  (2, "c"),
                  (3, "a"),
                  (4, "a"),
                  (5, "c")
              ], ["id", "category"])

    >>> indexCol = 'id'
    >>> categoricalCols = ['category']
    >>> continuousCols = []
    >>> labelCol = []

    >>> mat = get_dummy(df,indexCol,categoricalCols,continuousCols,labelCol)
    >>> mat.show()

    >>>
        +---+-------------+
        | id|     features|
        +---+-------------+
        |  0|[1.0,0.0,0.0]|
        |  1|[0.0,0.0,1.0]|
        |  2|[0.0,1.0,0.0]|
        |  3|[1.0,0.0,0.0]|
        |  4|[1.0,0.0,0.0]|
        |  5|[0.0,1.0,0.0]|
        +---+-------------+
    '''

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
                 for c in categoricalCols ]

    # default setting: dropLast=True
    encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()),dropLast=dropLast)
                 for indexer in indexers ]

    assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features")

    pipeline = Pipeline(stages=indexers + encoders + [assembler])

    model=pipeline.fit(df)
    data = model.transform(df)

    if indexCol and labelCol:
        # for supervised learning
        data = data.withColumn('label',col(labelCol))
        return data.select(indexCol,'features','label')
    elif not indexCol and labelCol:
        # for supervised learning
        data = data.withColumn('label',col(labelCol))
        return data.select('features','label')
    elif indexCol and not labelCol:
        # for unsupervised learning
        return data.select(indexCol,'features')
    elif not indexCol and not labelCol:
        # for unsupervised learning
        return data.select('features')

In [8]:
# convert the data to dense vector
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

def transData(data):
    return data.rdd.map(lambda r: [r[-1], Vectors.dense(r[:-1])]).\
           toDF(['label','features'])

transformed = transData(df)
transformed.show(5)

+-----+-----------------+
|label|         features|
+-----+-----------------+
| 22.1|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|
+-----+-----------------+
only showing top 5 rows



In [11]:
# Deal with the Categorical variables
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4
# distinct values are treated as continuous.

featureIndexer = VectorIndexer(inputCol="features", \
                               outputCol="indexedFeatures",\
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)
data.show(5)

+-----+-----------------+-----------------+
|label|         features|  indexedFeatures|
+-----+-----------------+-----------------+
| 22.1|[230.1,37.8,69.2]|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|[180.8,10.8,58.4]|
+-----+-----------------+-----------------+
only showing top 5 rows



In [12]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])
trainingData.show(5)
testData.show(5)

+-----+---------------+
|label|       features|
+-----+---------------+
|  1.6| [0.7,39.6,8.7]|
|  3.2| [4.1,11.6,5.7]|
|  4.8|  [8.6,2.1,1.0]|
|  5.3|[13.1,0.4,25.6]|
|  5.7| [8.4,27.2,2.1]|
+-----+---------------+
only showing top 5 rows

+-----+----------------+
|label|        features|
+-----+----------------+
|  5.3|  [5.4,29.9,9.4]|
|  5.5| [7.3,28.1,41.4]|
|  5.6|[13.2,15.9,49.6]|
|  6.7|[18.7,12.1,23.4]|
|  7.2|[25.0,11.0,29.7]|
+-----+----------------+
only showing top 5 rows



## decision tree regressor

In [16]:
from pyspark.ml.regression import DecisionTreeRegressor

# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures",labelCol='label')
#dt = DecisionTreeRegressor(featuresCol="features",labelCol='label')

In [18]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])
model = pipeline.fit(trainingData)

In [21]:
# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("features","label","prediction").show(5)

+----------------+-----+-----------------+
|        features|label|       prediction|
+----------------+-----+-----------------+
|  [5.4,29.9,9.4]|  5.3|7.033333333333334|
| [7.3,28.1,41.4]|  5.5|7.033333333333334|
|[13.2,15.9,49.6]|  5.6|5.599999999999999|
|[18.7,12.1,23.4]|  6.7|5.599999999999999|
|[25.0,11.0,29.7]|  7.2|5.599999999999999|
+----------------+-----+-----------------+
only showing top 5 rows



In [23]:
# evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

Root Mean Squared Error (RMSE) on test data = 1.21457
r2_score: 0.9353409382354951


In [24]:
# check feature importance
model.stages[1].featureImportances

SparseVector(3, {0: 0.6594, 1: 0.3269, 2: 0.0137})

## Random Forest Regression

In [25]:
data.show(5)
trainingData.show(5)
testData.show(5)

+-----+-----------------+-----------------+
|label|         features|  indexedFeatures|
+-----+-----------------+-----------------+
| 22.1|[230.1,37.8,69.2]|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|[180.8,10.8,58.4]|
+-----+-----------------+-----------------+
only showing top 5 rows

+-----+---------------+
|label|       features|
+-----+---------------+
|  1.6| [0.7,39.6,8.7]|
|  3.2| [4.1,11.6,5.7]|
|  4.8|  [8.6,2.1,1.0]|
|  5.3|[13.1,0.4,25.6]|
|  5.7| [8.4,27.2,2.1]|
+-----+---------------+
only showing top 5 rows

+-----+----------------+
|label|        features|
+-----+----------------+
|  5.3|  [5.4,29.9,9.4]|
|  5.5| [7.3,28.1,41.4]|
|  5.6|[13.2,15.9,49.6]|
|  6.7|[18.7,12.1,23.4]|
|  7.2|[25.0,11.0,29.7]|
+-----+----------------+
only showing top 5 rows



In [33]:
# Import RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressor

# Define RandomForestRegressor
rf = RandomForestRegressor(featuresCol="indexedFeatures", numTrees=100) # featuresCol="indexedFeatures",numTrees=2, maxDepth=2, seed=42

In [34]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])
rfmodel = pipeline.fit(trainingData)

rfpredictions = rfmodel.transform(testData)

# Select example rows to display.
rfpredictions.select("features","label", "prediction").show(5)

+----------------+-----+-----------------+
|        features|label|       prediction|
+----------------+-----+-----------------+
|  [5.4,29.9,9.4]|  5.3|8.629162991753926|
| [7.3,28.1,41.4]|  5.5| 9.21473664048392|
|[13.2,15.9,49.6]|  5.6|8.601744734138068|
|[18.7,12.1,23.4]|  6.7|8.531689803043868|
|[25.0,11.0,29.7]|  7.2| 8.83378249789345|
+----------------+-----+-----------------+
only showing top 5 rows



In [35]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(rfpredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

y_true = rfpredictions.select("label").toPandas()
y_pred = rfpredictions.select("prediction").toPandas()
import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {:4.3f}'.format(r2_score))

#feature importance
print(model.stages[-1].featureImportances)


Root Mean Squared Error (RMSE) on test data = 1.86435
r2_score: 0.848
(3,[0,1,2],[0.6593810382832179,0.32688190371226084,0.013737058004521293])


In [37]:
rfmodel.stages[-1].trees

[DecisionTreeRegressionModel (uid=dtr_d5bd15d8bc14) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_b3feb87d2e57) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_c6da2ec3b018) of depth 5 with 57 nodes,
 DecisionTreeRegressionModel (uid=dtr_8a54e3a98911) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_372481cbad79) of depth 5 with 51 nodes,
 DecisionTreeRegressionModel (uid=dtr_ee82c19f1172) of depth 5 with 55 nodes,
 DecisionTreeRegressionModel (uid=dtr_852177c1a3e4) of depth 5 with 45 nodes,
 DecisionTreeRegressionModel (uid=dtr_fbdbf549037f) of depth 5 with 49 nodes,
 DecisionTreeRegressionModel (uid=dtr_3bb86d1a678a) of depth 5 with 57 nodes,
 DecisionTreeRegressionModel (uid=dtr_5d74d9574bfa) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_86c6e91582b7) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_e904f315a28e) of depth 5 with 53 nodes,
 DecisionTreeRegressionModel (uid=dtr_f7934fb0f64c) of depth 5 w

## Gradient-boosted tree regression

In [38]:
data.show(5)
trainingData.show(5)
testData.show(5)

+-----+-----------------+-----------------+
|label|         features|  indexedFeatures|
+-----+-----------------+-----------------+
| 22.1|[230.1,37.8,69.2]|[230.1,37.8,69.2]|
| 10.4| [44.5,39.3,45.1]| [44.5,39.3,45.1]|
|  9.3| [17.2,45.9,69.3]| [17.2,45.9,69.3]|
| 18.5|[151.5,41.3,58.5]|[151.5,41.3,58.5]|
| 12.9|[180.8,10.8,58.4]|[180.8,10.8,58.4]|
+-----+-----------------+-----------------+
only showing top 5 rows

+-----+---------------+
|label|       features|
+-----+---------------+
|  1.6| [0.7,39.6,8.7]|
|  3.2| [4.1,11.6,5.7]|
|  4.8|  [8.6,2.1,1.0]|
|  5.3|[13.1,0.4,25.6]|
|  5.7| [8.4,27.2,2.1]|
+-----+---------------+
only showing top 5 rows

+-----+----------------+
|label|        features|
+-----+----------------+
|  5.3|  [5.4,29.9,9.4]|
|  5.5| [7.3,28.1,41.4]|
|  5.6|[13.2,15.9,49.6]|
|  6.7|[18.7,12.1,23.4]|
|  7.2|[25.0,11.0,29.7]|
+-----+----------------+
only showing top 5 rows



In [40]:
# Import GBTRegressor class
from pyspark.ml.regression import GBTRegressor

# Define LinearRegression algorithm
gbt = GBTRegressor(featuresCol="indexedFeatures") #numTrees=2, maxDepth=2, seed=42

In [41]:
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])
gbtmodel = pipeline.fit(trainingData)

gbtpredictions = model.transform(testData)
# Select example rows to display.
gbtpredictions.select("features","label", "prediction").show(5)

+----------------+-----+-----------------+
|        features|label|       prediction|
+----------------+-----+-----------------+
|  [5.4,29.9,9.4]|  5.3|7.033333333333334|
| [7.3,28.1,41.4]|  5.5|7.033333333333334|
|[13.2,15.9,49.6]|  5.6|5.599999999999999|
|[18.7,12.1,23.4]|  6.7|5.599999999999999|
|[25.0,11.0,29.7]|  7.2|5.599999999999999|
+----------------+-----+-----------------+
only showing top 5 rows



In [43]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(gbtpredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

y_true = gbtpredictions.select("label").toPandas()
y_pred = gbtpredictions.select("prediction").toPandas()
import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {:4.3f}'.format(r2_score))

#feature importance
print(model.stages[-1].featureImportances)

gbtmodel.stages[-1].trees

Root Mean Squared Error (RMSE) on test data = 1.21457
r2_score: 0.935
(3,[0,1,2],[0.6593810382832179,0.32688190371226084,0.013737058004521293])


[DecisionTreeRegressionModel (uid=dtr_004e1b87cfc3) of depth 5 with 63 nodes,
 DecisionTreeRegressionModel (uid=dtr_db551e0d5533) of depth 5 with 29 nodes,
 DecisionTreeRegressionModel (uid=dtr_1fc6e9e75426) of depth 5 with 29 nodes,
 DecisionTreeRegressionModel (uid=dtr_60af44e46472) of depth 5 with 35 nodes,
 DecisionTreeRegressionModel (uid=dtr_66ec86d2ba2b) of depth 5 with 35 nodes,
 DecisionTreeRegressionModel (uid=dtr_5459a18679c2) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_984a843e0e5c) of depth 5 with 45 nodes,
 DecisionTreeRegressionModel (uid=dtr_f2c211a42219) of depth 5 with 45 nodes,
 DecisionTreeRegressionModel (uid=dtr_ba98405e0ce1) of depth 5 with 45 nodes,
 DecisionTreeRegressionModel (uid=dtr_74df0d570125) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_a6d5bf960893) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_30603aa11f13) of depth 5 with 33 nodes,
 DecisionTreeRegressionModel (uid=dtr_72a44624edd4) of depth 5 w