# Decision Tree Regression with Spark

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Set up the environment for using pyspark. Run this cell before the cells
# below that import from pyspark.
import findspark

# NOTE(review): presumably this locates the local Spark installation and makes
# the pyspark package importable -- confirm against the findspark docs.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Obtain the Spark session for this notebook (an existing one is reused if
# present), expose its SparkContext as `sc`, and set the context's log level.
spark = (
    SparkSession.builder
    .appName("Decision Tree Example")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

## Load Data

In [None]:
# Load the dataset, a CSV (comma-separated values) file, treating the first
# line as the header and letting Spark infer the column types.
sdf = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../datasets/Position_Salaries.csv')
)

In [None]:
# Print a preview of the DataFrame that was just loaded from the CSV file
sdf.show()

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack the single predictor column 'Level' into a vector column named
# 'features', then keep only that vector together with the 'Salary' column.
vassemb = VectorAssembler(inputCols=['Level'], outputCol='features')
ndf = vassemb.transform(sdf).select(['features', 'Salary'])

# Display the first three assembled rows
ndf.show(3)

In [None]:
# Print the schema of the assembled DataFrame (the 'features' and 'Salary'
# columns selected above)
ndf.printSchema()

## Decision Tree Regressor

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

In [None]:
# Randomly partition the assembled rows into a training set and a test set
# using weights 0.7 and 0.3; the seed is pinned to a fixed value.
trainingData, testData = ndf.randomSplit(weights=[0.7, 0.3], seed=2345)

In [None]:
# Configure the (not yet fitted) decision tree regressor: it reads its inputs
# from the 'features' vector column and its target from the 'Salary' column.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="Salary",
)

In [None]:
# Fit the decision tree regressor on the training split
dt_model = dt.fit(trainingData)

In [None]:
# Apply the fitted model to the held-out test split; the result is evaluated
# and displayed in the cells below via its 'prediction' column
dt_predictions = dt_model.transform(testData)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# Evaluator that compares the 'prediction' column against the 'Salary' label
# using the "rmse" metric
dt_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Salary",
    predictionCol="prediction",
)

In [None]:
# Score the test-set predictions with the evaluator configured above
rmse = dt_evaluator.evaluate(dt_predictions)

# Report the resulting error value
print(
    "Root Mean Squared Error (RMSE) on test data = %g" % (rmse,)
)

In [None]:
# Show the first five predictions next to the actual salaries and the input
# features. The test-set predictions already exist as `dt_predictions`, so
# they are reused here instead of transforming `testData` a second time.
# `lr_predictions` (a misleading name in a decision-tree notebook) is kept
# only as an alias so that any later cell still referring to it keeps working.
lr_predictions = dt_predictions
dt_predictions.select("prediction", "Salary", "features").show(5)