In [1]:
from __future__ import print_function

from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

In [2]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("DecisionTree")
    .getOrCreate()
)

In [3]:
# Load the real-estate CSV into a DataFrame; the first row supplies the column
# names and Spark infers each column's type from the data.
data = spark.read.csv("realestate.csv", header=True, inferSchema=True)

# Pack the three predictor columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=["HouseAge", "DistanceToMRT", "NumberConvenienceStores"],
    outputCol="features",
)

# Keep only the label column and the assembled feature vector.
df = assembler.transform(data).select("PriceOfUnitArea", "features")

# Split roughly 50/50 into training and test data. The seed is fixed so that
# re-running the notebook on the same data reproduces the same split instead of
# drawing a new random one on every run.
SPLIT_SEED = 42
trainTest = df.randomSplit([0.5, 0.5], seed=SPLIT_SEED)
trainingDF, testDF = trainTest

In [4]:
# Preview the first 20 rows of the label / feature-vector DataFrame.
df.show(n=20)

+---------------+--------------------+
|PriceOfUnitArea|            features|
+---------------+--------------------+
|           37.9|[32.0,84.87882,10.0]|
|           42.2| [19.5,306.5947,9.0]|
|           47.3| [13.3,561.9845,5.0]|
|           54.8| [13.3,561.9845,5.0]|
|           43.1|  [5.0,390.5684,5.0]|
|           32.1|   [7.1,2175.03,3.0]|
|           40.3| [34.5,623.4731,7.0]|
|           46.7| [20.3,287.6025,6.0]|
|           18.8| [31.7,5512.038,1.0]|
|           22.1|  [17.9,1783.18,3.0]|
|           41.4| [34.8,405.2134,1.0]|
|           58.1|  [6.3,90.45606,9.0]|
|           39.3| [13.0,492.2313,5.0]|
|           23.8| [20.4,2469.645,4.0]|
|           34.3| [13.2,1164.838,4.0]|
|           50.5| [35.7,579.2083,2.0]|
|           70.1|  [0.0,292.9978,6.0]|
|           37.4| [17.7,350.8515,1.0]|
|           42.3| [16.9,368.1363,8.0]|
|           47.7|  [1.5,23.38284,7.0]|
+---------------+--------------------+
only showing top 20 rows



In [5]:
# Configure the decision-tree regressor: read inputs from the "features"
# vector column and learn to predict the "PriceOfUnitArea" label column.
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="PriceOfUnitArea",
)

# Fit the tree on the training split only.
model = dtr.fit(trainingDF)