In [16]:
import findspark
findspark.init('/usr/local/spark23/')

In [17]:
from pyspark.sql import SparkSession

In [18]:
spark = SparkSession \
        .builder \
        .appName('decisiontree-regression') \
        .getOrCreate()

In [19]:
df = spark.read.format('csv') \
        .options(header='true', inferSchema='true') \
        .load('/home/hadoop/project/pydone/spark/dataset/Advertising.csv')

In [20]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [21]:
df.count()

200

In [22]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|               _c0|               TV|             Radio|         Newspaper|             Sales|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|               200|              200|               200|               200|               200|
|   mean|             100.5|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|57.879184513951124|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|                 1|              0.7|               0.0|               0.3|               1.6|
|    max|               200|            296.4|              49.6|             114.0|              27.0|
+-------+------------------+-----------------+------------------+------------------+------------------+



Convert the data to dense vector (features and label)

In [23]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [24]:
#function to convert data to dense vector

def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [25]:
##convert data to dense vector
transformed= transData(df)
transformed.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,230.1,37.8,6...| 22.1|
|[2.0,44.5,39.3,45.1]| 10.4|
|[3.0,17.2,45.9,69.3]|  9.3|
|[4.0,151.5,41.3,5...| 18.5|
|[5.0,180.8,10.8,5...| 12.9|
+--------------------+-----+
only showing top 5 rows



In [26]:
#split data
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])

In [27]:
trainingData.show(5)
testData.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,230.1,37.8,6...| 22.1|
|[3.0,17.2,45.9,69.3]|  9.3|
|[5.0,180.8,10.8,5...| 12.9|
|[7.0,57.5,32.8,23.5]| 11.8|
|[8.0,120.2,19.6,1...| 13.2|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[2.0,44.5,39.3,45.1]| 10.4|
|[4.0,151.5,41.3,5...| 18.5|
| [6.0,8.7,48.9,75.0]|  7.2|
|   [9.0,8.6,2.1,1.0]|  4.8|
|[11.0,66.1,5.8,24.2]|  8.6|
+--------------------+-----+
only showing top 5 rows



In [31]:
# import decision tree library
from pyspark.ml.regression import DecisionTreeRegressor

dc = DecisionTreeRegressor()

In [32]:
#train model
model = dc.fit(trainingData)

In [33]:
#make prediction
predictions = model.transform(testData)

In [34]:
#show prediction columns
predictions.columns

['features', 'label', 'prediction']

In [35]:
# Select few rows to display.
predictions.select("features","label","prediction").show()

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[2.0,44.5,39.3,45.1]| 10.4|              9.45|
|[4.0,151.5,41.3,5...| 18.5|            20.125|
| [6.0,8.7,48.9,75.0]|  7.2|               5.5|
|   [9.0,8.6,2.1,1.0]|  4.8|               5.5|
|[11.0,66.1,5.8,24.2]|  8.6|               9.4|
|[13.0,23.8,35.1,6...|  9.2| 8.624999999999998|
| [14.0,97.5,7.6,7.2]|  9.7|               9.4|
|[19.0,69.2,20.5,1...| 11.3|12.036363636363633|
|[22.0,237.4,5.1,2...| 12.5|11.871428571428572|
|[23.0,13.2,15.9,4...|  5.6|               5.5|
|[25.0,62.3,12.6,1...|  9.7|12.036363636363633|
|[26.0,262.9,3.5,1...| 12.0|11.871428571428572|
|[28.0,240.1,16.7,...| 15.9|             15.88|
|[29.0,248.8,27.1,...| 18.9|             20.85|
|[31.0,292.9,28.3,...| 21.4|             20.85|
|[32.0,112.9,17.4,...| 11.9|12.036363636363633|
|[33.0,97.2,1.5,30.0]|  9.6|               9.4|
|[36.0,290.7,4.1,8.5]| 12.8|11.871428571

In [36]:
#regresion evaluator

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("RMSE = ", rmse)

RMSE =  1.4588326224649573


In [37]:
#compare prediction label and datatest label
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))


r2_score: 0.9039338084569278
