In [0]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql.functions import col
from pyspark.sql.functions import year
from pyspark.sql.functions import to_date
from pyspark.sql.functions import month
from pyspark.sql.functions import hour
from pyspark.sql.functions import lit
from pyspark.sql.functions import date_format


In [0]:
#
#
# Loading data and performing transformations
#
#

In [0]:
# Read the taxi trips dataset (dftaxis, written as parquet by part 1).
df_taxis_ml = spark.read.format("parquet").load('/mnt/output/dftaxis')

In [0]:
# Include additional fields

In [0]:
# Add "week_day": the dropoff timestamp formatted with the "EEEE" pattern (day name).
df_taxis_ml = df_taxis_ml.withColumn(
    "week_day",
    date_format("lpep_dropoff_datetime", "EEEE"),
)


In [0]:
# Add "trip_month": the month number extracted from the dropoff timestamp.
df_taxis_ml = df_taxis_ml.withColumn(
    "trip_month",
    month(col("lpep_dropoff_datetime")),
)


In [0]:
# Add "trip_hour": the hour of day extracted from the dropoff timestamp.
df_taxis_ml = df_taxis_ml.withColumn(
    "trip_hour",
    hour(col("lpep_dropoff_datetime")),
)


In [0]:
# Build the April 2022 frame used for the final predictions: keep trips whose
# dropoff timestamp is on or after 2022-04-01 00:00 (+0000).
# NOTE(review): the filter has no upper bound, so any later rows present in
# df_taxis_ml are kept as well — confirm the source data ends in April 2022.
apr_2022_start = '2022-04-01T00:00:00.000+0000'
apr_2022_df = df_taxis_ml.filter(col("lpep_dropoff_datetime") >= apr_2022_start)


In [0]:
# Columns fed to the model as plain numeric features.
num_col = [
    'VendorID',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'Duration',
    'Speed',
    'trip_month',
    'trip_hour',
]


In [0]:
# Every column the model touches: numeric features, categorical features, label.
cols_list = [
    # numeric features
    'VendorID',
    'RatecodeID',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'Duration',
    'Speed',
    'trip_month',
    'trip_hour',
    # categorical features
    'taxi_colour',
    'week_day',
    # label
    'total_amount',
]

In [0]:
# Start with no pipeline stages; later cells add to this list.
stages = list()


In [0]:
# Discard every row that has a null in any column the model uses
# (cols_list holds the thirteen feature and label column names).
apr_2022_df = apr_2022_df.dropna(subset=cols_list)


In [0]:
# Categorical (string) columns that get indexed and one-hot encoded.
cat_cols = [
    'taxi_colour',
    'week_day',
]

In [0]:
# Build the categorical preprocessing stages: for each column in cat_cols, a
# StringIndexer ("<col>" -> "<col>_ind") followed by a OneHotEncoder
# ("<col>_ind" -> "<col>_ohe").
# The list is rebuilt here instead of appended to across cells, so re-running
# this cell does not stack duplicate indexer/encoder stages that write the same
# output columns a second time.
stages = []
for cat_col in cat_cols:
    col_indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_ind")
    col_encoder = OneHotEncoder(inputCols=[f"{cat_col}_ind"], outputCols=[f"{cat_col}_ohe"])
    stages.extend([col_indexer, col_encoder])

In [0]:
# Names of the one-hot encoded columns: each entry of cat_cols with "_ohe" appended.
cat_cols_ohe = [name + "_ohe" for name in cat_cols]

In [0]:
# Assemble the encoded categorical columns followed by the numeric columns
# into a single vector column named "features".
vector_assembler = VectorAssembler(
    outputCol="features",
    inputCols=[*cat_cols_ohe, *num_col],
)

In [0]:
# The assembler goes last, after the indexer/encoder stages already in the list.
stages.append(vector_assembler)


In [0]:
# Wrap the collected stages in a single Pipeline estimator.
pipeline = Pipeline().setStages(stages)


In [0]:
# Fit all pipeline stages on the April 2022 frame.
pipeline_model = pipeline.fit(dataset=apr_2022_df)


In [0]:
# Run the fitted pipeline over the same frame; this adds the indexed, encoded
# and assembled "features" columns.
apr_2022_df = pipeline_model.transform(dataset=apr_2022_df)

In [0]:
# Keep the assembled feature vector plus the original model columns, then preview.
apr_2022_df = apr_2022_df.select("features", *cols_list)
apr_2022_df.show()

+--------------------+--------+----------+------------+------------+---------------+-------------+-------------------+------------------+----------+---------+-----------+---------+------------+
|            features|VendorID|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|           Duration|             Speed|trip_month|trip_hour|taxi_colour| week_day|total_amount|
+--------------------+--------+----------+------------+------------+---------------+-------------+-------------------+------------------+----------+---------+-----------+---------+------------+
|[1.0,0.0,0.0,0.0,...|       2|       1.0|         234|         246|            4.0|         2.26|0.17277777777777778| 13.08038585209003|         4|        2|     Yellow|  Tuesday|       16.56|
|[1.0,0.0,0.0,0.0,...|       2|       1.0|         246|         163|            3.0|         1.69|0.11472222222222223|14.731234866828085|         4|        2|     Yellow|  Tuesday|       13.56|
|[1.0,0.0,1.0,0.0,...|       2

In [0]:
### Write delta table for apr 2022
# mode("overwrite") replaces whatever is already stored at this path. Without an
# explicit mode the writer uses its default, which raises once the path exists,
# so the notebook could not be re-run from the top.
apr_2022_df.write.format("delta").mode("overwrite").save('/dbfs/predtest')


In [0]:
# Load the April 2022 set back from the Delta table written above.
apr_2022_df = spark.read.load('/dbfs/predtest', format="delta")

In [0]:
# Decision tree regressor: predicts "total_amount" from the "features" vector.
dt = DecisionTreeRegressor(
    labelCol='total_amount',
    featuresCol='features',
)

In [0]:
# Fit the decision tree on the April 2022 data.
dt_model = dt.fit(dataset=apr_2022_df)

In [0]:
# Score the April 2022 trips with the fitted tree; adds a "prediction" column.
dt_apr_predictions = dt_model.transform(dataset=apr_2022_df)

In [0]:
# Preview the first 20 scored rows.
dt_apr_predictions.show(n=20)

+--------------------+--------+----------+------------+------------+---------------+-------------+-------------------+------------------+----------+---------+-----------+--------+------------+------------------+
|            features|VendorID|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|           Duration|             Speed|trip_month|trip_hour|taxi_colour|week_day|total_amount|        prediction|
+--------------------+--------+----------+------------+------------+---------------+-------------+-------------------+------------------+----------+---------+-----------+--------+------------+------------------+
|[1.0,0.0,0.0,1.0,...|       2|       1.0|         229|         211|            5.0|         3.49|0.25472222222222224|13.701199563794983|         4|       22|     Yellow|Thursday|       20.76| 20.45931810470086|
|[1.0,0.0,0.0,1.0,...|       2|       1.0|         211|         113|            4.0|         0.79|0.07027777777777777| 11.24110671936759|         4|    

In [0]:
# Evaluator that compares "prediction" against "total_amount" using RMSE.
dt_evaluate_apr = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="total_amount",
)

In [0]:
# Compute the RMSE over the scored April 2022 rows.
rmse_apr = dt_evaluate_apr.evaluate(dataset=dt_apr_predictions)

In [0]:
# Report the RMSE; the message labels it as a training-data metric.
rmse_message = "Root Mean Squared Error (RMSE) on train data = %g" % rmse_apr
print(rmse_message)

Root Mean Squared Error (RMSE) on train data = 6.48655
