In [0]:
# df_ride_info = spark.read \
# .parquet("s3://dsoaws/nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files/ride-info/") 

# df_ride_fare = spark.read \
#   .parquet("s3://dsoaws/nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files/ride-fare/")

# df_model_training = df_ride_info.join(df_ride_fare, on="ride_id") \
#                                 .drop(df_ride_fare.ride_id) \
#                                 .drop(df_ride_fare.year)

# df_model_training = df_model_training \
#   .drop("ride_id") \
#   .drop("pickup_at") \
#   .drop("dropoff_at") \
#   .drop("store_and_fwd_flag")

# df_train, df_test = df_model_training.randomSplit([0.70, 0.30], seed = 0)

df_train = spark.read.option("recursiveFileLookup", "true").parquet('s3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/training/')

df_test = spark.read.option("recursiveFileLookup", "true").parquet('s3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/validation/')

print("There are %d training and %d test examples." % (df_train.count(), df_test.count()))  

There are 1054038765 training and 1054038765 test examples.


In [0]:
# adjust for too much data above ^^
# df_train, _ = df_train.randomSplit([0.70, 0.30], seed = 0)

# _, df_test = df_test.randomSplit([0.70, 0.30], seed = 0)

# print("There are %d training and %d test examples." % (df_train.count(), df_test.count()))  

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
import xgboost
from xgboost.spark import SparkXGBRegressor

featuresCols = df_train.columns
featuresCols.remove('total_amount')

vectorAssembler = VectorAssembler(inputCols=featuresCols, 
                                  outputCol="rawFeatures", 
                                  handleInvalid="skip")

vectorIndexer = VectorIndexer(inputCol="rawFeatures", 
                              outputCol="features", 
                              maxCategories=100, 
                              handleInvalid="skip")

xgb_regressor = SparkXGBRegressor(num_workers=480, # adjust this
                                  label_col="total_amount", 
                                  missing=0.0,
                                  eta=0.2,
                                  gamma=4,
                                  max_depth=5,
                                  min_child_weight=6,
                                  num_round=50,
                                  objective='reg:squarederror',
                                  subsample=0.7)

pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, xgb_regressor])      

# m5.24xlarge (6 incl leader node) - 4 minutes
pipelineModel = pipeline.fit(df_train)

In [0]:
featuresCols

Out[7]: ['ride_id_0',
 'passenger_count',
 'trip_distance',
 'rate_code_id',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount']

## Predict

In [0]:
predictions = pipelineModel.transform(df_test)

display(predictions.select("total_amount", "prediction", *featuresCols))

total_amount,prediction,ride_id_0,passenger_count,trip_distance,rate_code_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount
11.0,11.887359619140623,2671470621003,1,2.5999999046325684,1,1,8.5,0.0,0.5,2.0,0.0
6.300000190734863,6.321783542633057,523986452745,6,1.090000033378601,1,2,5.300000190734863,0.5,0.5,0.0,0.0
9.0,9.285104751586914,197570240624,1,1.2999999523162842,1,1,7.0,0.0,0.5,1.5,0.0
7.599999904632568,7.6643218994140625,2671470622219,2,1.1100000143051147,1,2,6.099999904632568,1.0,0.5,0.0,0.0
8.199999809265137,8.239815711975098,2671470622496,1,2.0899999141693115,1,2,7.699999809265137,0.0,0.5,0.0,0.0
16.8700008392334,17.905780792236328,197570241479,1,3.0999999046325684,1,1,13.0,0.0,0.5,3.369999885559082,0.0
12.800000190734863,12.222943305969238,523986453649,1,2.6600000858306885,1,1,11.300000190734863,0.5,0.5,0.5,0.0
4.599999904632568,5.017699241638184,3590594500655,1,0.5099999904632568,1,2,4.099999904632568,0.0,0.5,0.0,0.0
16.200000762939453,16.293556213378906,523986454030,1,3.0,1,1,12.5,0.5,0.5,2.700000047683716,0.0
8.779999732971191,8.275099754333496,523986454283,1,1.1100000143051147,1,1,6.900000095367432,0.0,0.5,1.3799999952316284,0.0


## Evaluate root mean squared error (`rmse`)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=xgb_regressor.getLabelCol(),
                                predictionCol=xgb_regressor.getPredictionCol())
rmse = evaluator.evaluate(predictions)

print("RMSE on our test set: %g" % rmse)

RMSE on our test set: 163.828


## Save and reload the model

In [0]:
%sh
rm -rf /dbfs/tmp/xgboost/pipeline_001
rm -rf /dbfs/tmp/xgboost/pipelineModel_001

In [0]:
# Save the pipeline that created the model
pipeline.save('/tmp/xgboost/pipeline_001')

# Save the model itself
pipelineModel.save('/tmp/xgboost/pipelineModel_001')

# Load the pipeline
loaded_pipeline = Pipeline.load('/tmp/xgboost/pipeline_001')

## Predict from loaded pipeline

In [0]:
# Load and use the model
from pyspark.ml import PipelineModel

loaded_pipelineModel = PipelineModel.load('/tmp/xgboost/pipelineModel_001')

# To represent new data, use the first 3 rows of the test dataset
new_data = df_test.limit(3)

# Make predictions with the loaded model
new_preds = loaded_pipelineModel.transform(new_data)
display(new_preds.select("total_amount", "prediction", *featuresCols))

total_amount,prediction,ride_id_0,passenger_count,trip_distance,rate_code_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount
8.0,8.363066673278809,2001456646806,1,0.8999999761581421,1,1,7.0,0.5,0.5,0.0,0.0
14.399999618530272,14.732670783996582,2001456647378,1,2.5,1,1,11.0,0.5,0.5,2.400000095367432,0.0
27.600000381469727,27.33540153503418,2001456648018,1,5.699999809265137,1,1,22.0,0.5,0.5,4.599999904632568,0.0
