In [24]:
import findspark
findspark.init('/usr/local/spark23/')

In [25]:
from pyspark.sql import SparkSession

In [26]:
spark = SparkSession \
        .builder \
        .appName('Gradient-Boosted-Tree') \
        .getOrCreate()

In [27]:
df = spark.read.format('csv') \
        .options(header='true', inferSchema='true') \
        .load('/home/hadoop/project/pydone/spark/dataset/Advertising.csv')

In [28]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [29]:
df.count()

200

In [30]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|               _c0|               TV|             Radio|         Newspaper|             Sales|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|               200|              200|               200|               200|               200|
|   mean|             100.5|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|57.879184513951124|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|                 1|              0.7|               0.0|               0.3|               1.6|
|    max|               200|            296.4|              49.6|             114.0|              27.0|
+-------+------------------+-----------------+------------------+------------------+------------------+



Convert the data to dense vector (features and label)

In [31]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [32]:
#function to convert data to dense vector

def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [33]:
##convert data to dense vector
transformed= transData(df)
transformed.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,230.1,37.8,6...| 22.1|
|[2.0,44.5,39.3,45.1]| 10.4|
|[3.0,17.2,45.9,69.3]|  9.3|
|[4.0,151.5,41.3,5...| 18.5|
|[5.0,180.8,10.8,5...| 12.9|
+--------------------+-----+
only showing top 5 rows



In [34]:
#split data
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])

In [35]:
trainingData.show(5)
testData.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,230.1,37.8,6...| 22.1|
|[3.0,17.2,45.9,69.3]|  9.3|
|[4.0,151.5,41.3,5...| 18.5|
|[5.0,180.8,10.8,5...| 12.9|
| [6.0,8.7,48.9,75.0]|  7.2|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[2.0,44.5,39.3,45.1]| 10.4|
|[7.0,57.5,32.8,23.5]| 11.8|
|[8.0,120.2,19.6,1...| 13.2|
|[16.0,195.4,47.7,...| 22.4|
|[23.0,13.2,15.9,4...|  5.6|
+--------------------+-----+
only showing top 5 rows



In [36]:
# import gbt 
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor()

In [37]:
#train model
model = gbt.fit(trainingData)

In [38]:
#make prediction
predictions = model.transform(testData)

In [39]:
#show prediction columns
predictions.columns

['features', 'label', 'prediction']

In [40]:
# Select few rows to display.
predictions.select("features","label","prediction").show()

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[2.0,44.5,39.3,45.1]| 10.4|14.379826704342744|
|[7.0,57.5,32.8,23.5]| 11.8|12.848286218158684|
|[8.0,120.2,19.6,1...| 13.2|13.114437015214182|
|[16.0,195.4,47.7,...| 22.4|24.269452680239983|
|[23.0,13.2,15.9,4...|  5.6| 5.628785290241136|
|[30.0,70.6,16.0,4...| 10.5|10.502066905437204|
|[31.0,292.9,28.3,...| 21.4|18.641254954748042|
|[33.0,97.2,1.5,30.0]|  9.6|10.713785006679453|
|[36.0,290.7,4.1,8.5]| 12.8|  13.0980007953511|
|[41.0,202.5,22.3,...| 16.6|15.793327469616942|
|[42.0,177.0,33.4,...| 17.1|14.939945349631397|
|[43.0,293.6,27.7,...| 20.7|18.145196022142954|
|[44.0,206.9,8.4,2...| 12.9|11.787802948974255|
|[45.0,25.1,25.7,4...|  8.5| 6.711110391027804|
|[46.0,175.1,22.5,...| 14.9|  13.8939849673923|
|[49.0,227.2,15.8,...| 14.8|15.185048166354944|
|[50.0,66.9,11.7,3...|  9.7|10.267455235128368|
|[54.0,182.6,46.2,...| 21.2|17.948563730

In [41]:
#regresion evaluator

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("RMSE = ", rmse)

RMSE =  1.4904998896228807


In [42]:
#compare prediction label and datatest label
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))


r2_score: 0.9201903939689893
