In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Create the Spark session for this notebook, or reuse one that is already
# running. The application name only needs to be set once on the builder;
# the original chained `.appName('taxi')` twice, which was redundant.
spark = (
    SparkSession.builder
    .appName('taxi')
    .getOrCreate()
)

In [4]:
# GCS location of the trip-record CSV (file name: yellow_tripdata_2019-01.csv).
data = 'gs://dataproc-staging-us-central1-100317476977-28dz6rb8/data/yellow_tripdata_2019-01.csv'

# Read the CSV (first row is the header, column types are inferred by Spark)
# and keep only the three predictor columns plus the `total_amount` label.
df = (
    spark.read
    .csv(data, header=True, inferSchema=True)
    .select('passenger_count', 'pulocationid', 'dolocationid', 'total_amount')
)

# Preview the first 10 rows of the projected frame.
df.show(10)

                                                                                

+---------------+------------+------------+------------+
|passenger_count|pulocationid|dolocationid|total_amount|
+---------------+------------+------------+------------+
|              1|         151|         239|        9.95|
|              1|         239|         246|        16.3|
|              3|         236|         236|         5.8|
|              5|         193|         193|        7.55|
|              5|         193|         193|       55.55|
|              5|         193|         193|       13.31|
|              5|         193|         193|       55.55|
|              1|         163|         229|        9.05|
|              1|         229|           7|        18.5|
|              2|         141|         234|        13.0|
+---------------+------------+------------+------------+
only showing top 10 rows



In [5]:
# Seeded random split of the rows with weights 0.8 (train) / 0.2 (test).
split_weights = [0.8, 0.2]
trainDF, testDF = df.randomSplit(split_weights, seed=42)

# Combine the three predictor columns into one vector column named 'features'.
feature_cols = ['passenger_count', 'pulocationid', 'dolocationid']
vectorAssembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

# Assembled view of the training split, used to fit the standalone regressor.
vecTrainDF = vectorAssembler.transform(trainDF)

In [6]:
# Decision-tree regressor that reads the assembled 'features' vector and
# predicts 'total_amount'; maxBins is set explicitly to 32.
# NOTE(review): the location-id columns are fed in as plain numeric features;
# they look like categorical zone ids — confirm whether indexing them is wanted.
reg = DecisionTreeRegressor(
    featuresCol='features',
    labelCol='total_amount',
    maxBins=32,
)

# Fit the regressor directly on the pre-assembled training frame.
model = reg.fit(vecTrainDF)

                                                                                

In [7]:
# Chain feature assembly and the regressor so the same steps run on any raw
# (un-assembled) frame.
pipeline_stages = [vectorAssembler, reg]
pipeline = Pipeline(stages=pipeline_stages)

# Fit on the raw training split, then score the held-out test split.
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)

# Preview the inputs alongside the model's prediction for the first 10 rows.
preview_cols = ['passenger_count', 'pulocationid', 'dolocationid', 'prediction']
predDF.select(*preview_cols).show(10)


[Stage 31:>                                                         (0 + 1) / 1]

+---------------+------------+------------+------------------+
|passenger_count|pulocationid|dolocationid|        prediction|
+---------------+------------+------------+------------------+
|              0|           1|           1| 17.06950532338697|
|              0|           4|           4| 17.06950532338697|
|              0|           4|           4| 17.06950532338697|
|              0|           4|          68|17.044953350547864|
|              0|           4|          79|17.044953350547864|
|              0|           4|          90|17.044953350547864|
|              0|           4|         107|17.044953350547864|
|              0|           4|         144|17.044953350547864|
|              0|           4|         232|17.044953350547864|
|              0|           4|         233|17.044953350547864|
+---------------+------------+------------+------------------+
only showing top 10 rows



                                                                                

In [8]:
# Evaluator configured for the 'rmse' metric, comparing the 'prediction'
# column against the 'total_amount' label column.
regressorEvaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='total_amount',
    predictionCol='prediction',
)

# Compute the metric on the test-split predictions and print it.
rmse = regressorEvaluator.evaluate(predDF)
print(rmse)



504.11353655916656


                                                                                