In [2]:
import findspark
findspark.init('/usr/local/spark23/')

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession \
        .builder \
        .appName('Gradient-Boosted-Tree') \
        .getOrCreate()

In [5]:
df = spark.read.format('csv') \
        .options(header='true', inferSchema='true') \
        .load('/home/hadoop/project/pydone/spark/dataset/Advertising.csv')

In [6]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [7]:
df.count()

200

In [8]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|               _c0|               TV|             Radio|         Newspaper|             Sales|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|               200|              200|               200|               200|               200|
|   mean|             100.5|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|57.879184513951124|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|                 1|              0.7|               0.0|               0.3|               1.6|
|    max|               200|            296.4|              49.6|             114.0|              27.0|
+-------+------------------+-----------------+------------------+------------------+------------------+



Convert the data to dense vector (features and label)

In [9]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

In [10]:
#function to convert data to dense vector

def transData(data):
    return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])

In [11]:
##convert data to dense vector
transformed= transData(df)
transformed.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,230.1,37.8,6...| 22.1|
|[2.0,44.5,39.3,45.1]| 10.4|
|[3.0,17.2,45.9,69.3]|  9.3|
|[4.0,151.5,41.3,5...| 18.5|
|[5.0,180.8,10.8,5...| 12.9|
+--------------------+-----+
only showing top 5 rows



In [13]:
#split data
(trainingData, testData) = transformed.randomSplit([0.6, 0.4])

In [14]:
trainingData.show(5)
testData.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[4.0,151.5,41.3,5...| 18.5|
|[5.0,180.8,10.8,5...| 12.9|
| [6.0,8.7,48.9,75.0]|  7.2|
|   [9.0,8.6,2.1,1.0]|  4.8|
|[10.0,199.8,2.6,2...| 10.6|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,230.1,37.8,6...| 22.1|
|[2.0,44.5,39.3,45.1]| 10.4|
|[3.0,17.2,45.9,69.3]|  9.3|
|[7.0,57.5,32.8,23.5]| 11.8|
|[8.0,120.2,19.6,1...| 13.2|
+--------------------+-----+
only showing top 5 rows



In [17]:
# import gbt 
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor()

In [18]:
#train model
model = gbt.fit(trainingData)

In [19]:
#make prediction
predictions = model.transform(testData)

In [20]:
#show prediction columns
predictions.columns

['features', 'label', 'prediction']

In [21]:
# Select few rows to display.
predictions.select("features","label","prediction").show()

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[1.0,230.1,37.8,6...| 22.1|21.253329294377615|
|[2.0,44.5,39.3,45.1]| 10.4| 8.865646051676359|
|[3.0,17.2,45.9,69.3]|  9.3|7.2118139550325555|
|[7.0,57.5,32.8,23.5]| 11.8| 8.601102731517921|
|[8.0,120.2,19.6,1...| 13.2|12.535456859159291|
|[12.0,214.7,24.0,...| 17.4|15.830376236508691|
| [14.0,97.5,7.6,7.2]|  9.7|10.499749531773967|
|[19.0,69.2,20.5,1...| 11.3|10.558131674696384|
|[21.0,218.4,27.7,...| 18.0|19.676421455890996|
|[22.0,237.4,5.1,2...| 12.5|12.482726185901605|
|[25.0,62.3,12.6,1...|  9.7| 8.701136399802087|
|[29.0,248.8,27.1,...| 18.9| 19.67306010951847|
|[32.0,112.9,17.4,...| 11.9|12.991110608684256|
|[33.0,97.2,1.5,30.0]|  9.6|  8.56741708756139|
| [35.0,95.7,1.4,7.4]|  9.5| 8.262491844315893|
|[38.0,74.7,49.4,4...| 14.7| 12.97589672898006|
|[39.0,43.1,26.7,3...| 10.1| 8.654229789173908|
|[42.0,177.0,33.4,...| 17.1| 18.92510196

In [22]:
#regresion evaluator

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("RMSE = ", rmse)

RMSE =  1.1696235306402385


In [23]:
#compare prediction label and datatest label
y_true = predictions.select("label").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))


r2_score: 0.9397059684644741
