In [0]:
# df_ride_info = spark.read \
#   .parquet("s3://dsoaws/nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files/ride-info/") 

# df_ride_fare = spark.read \
#   .parquet("s3://dsoaws/nyc-taxi-orig-cleaned-split-parquet-per-year-multiple-files/ride-fare/")

# df_model_training = df_ride_info.join(df_ride_fare, on="ride_id") \
#                                 .drop(df_ride_fare.ride_id) \
#                                 .drop(df_ride_fare.year)

# df_model_training = df_model_training \
#   .drop("ride_id") \
#   .drop("pickup_at") \
#   .drop("dropoff_at") \
#   .drop("store_and_fwd_flag")

# df_train = df_model_training # for now, we keep them the same as we want all 1 billion rows to be used for training
# df_test = df_model_training # for now, we keep them the same as we are not actually comparing RMSE between the models

# df_train, df_test = df_model_training.randomSplit([0.70, 0.30], seed = 0)

# df_train = spark.read.option("recursiveFileLookup", "true").parquet('s3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/training/')

# df_test = spark.read.option("recursiveFileLookup", "true").parquet('s3://dsoaws/gsml-nyc-taxi-full-etl-ml-test-4-custompyspark-export-s3-via-notebook/export-flow-2023-03-02-03-32-10-53926e35/output/validation/')

# print("There are %d training and %d test examples." % (df_train.count(), df_test.count()))  

In [0]:
df_train = spark.read \
  .parquet('s3://dsoaws/nyc-taxi-orig-cleaned-dropped-parquet-per-year-multiple-files/')

In [0]:
# adjust for too much data above ^^
# df_train, _ = df_train.randomSplit([0.70, 0.30], seed = 0)

# _, df_test = df_test.randomSplit([0.70, 0.30], seed = 0)

# print("There are %d training and %d test examples." % (df_train.count(), df_test.count()))  

In [0]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
import xgboost
from xgboost.spark import SparkXGBRegressor

featuresCols = df_train.columns
featuresCols.remove('total_amount')

vectorAssembler = VectorAssembler(inputCols=featuresCols, 
                                  outputCol="rawFeatures", 
                                  handleInvalid="skip")

vectorIndexer = VectorIndexer(inputCol="rawFeatures", 
                              outputCol="features", 
                              maxCategories=100, 
                              handleInvalid="skip")

xgb_regressor = SparkXGBRegressor(num_workers=480, # adjust this
                                  label_col="total_amount", 
                                  missing=0.0,
                                  eta=0.2,
                                  gamma=4,
                                  max_depth=5,
                                  min_child_weight=6,
                                  num_round=50,
                                  objective='reg:squarederror',
                                  subsample=0.7)

pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, xgb_regressor])      

# m5.24xlarge (6 incl leader node) - 4 minutes
pipelineModel = pipeline.fit(df_train)

In [0]:
featuresCols

Out[16]: ['vendor_id',
 'passenger_count',
 'trip_distance',
 'rate_code_id',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'year']

## Predict

In [0]:
# predictions = pipelineModel.transform(df_test)

# display(predictions.select("total_amount", "prediction", *featuresCols))

total_amount,prediction,vendor_id,passenger_count,trip_distance,rate_code_id,year,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount
6.4,6.244748115539551,1,2,0.7,1,2012,2,4.9,1.0,0.5,0.0,0.0
6.2,6.185119152069092,1,1,1.2,1,2012,2,5.7,0.0,0.5,0.0,0.0
12.8,12.315897941589355,1,2,3.3,1,2012,2,11.3,1.0,0.5,0.0,0.0
7.8,7.7153778076171875,1,1,1.7,1,2012,2,7.3,0.0,0.5,0.0,0.0
6.2,6.185119152069092,1,1,1.1,1,2012,2,5.7,0.0,0.5,0.0,0.0
7.4,7.245301246643066,1,1,2.1,1,2012,2,6.9,0.0,0.5,0.0,0.0
5.4,5.487746238708496,1,1,0.4,1,2012,2,4.9,0.0,0.5,0.0,0.0
5.1,5.319957256317139,1,2,0.3,1,2012,2,4.1,0.5,0.5,0.0,0.0
13.9,13.58199691772461,1,1,3.7,1,2012,2,12.9,0.5,0.5,0.0,0.0
12.3,11.815232276916504,1,1,3.8,1,2012,2,11.3,0.5,0.5,0.0,0.0


## Evaluate root mean squared error (`rmse`)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=xgb_regressor.getLabelCol(),
                                predictionCol=xgb_regressor.getPredictionCol())
rmse = evaluator.evaluate(predictions)

print("RMSE on our test set: %g" % rmse)

RMSE on our test set: 163.508


## Save and reload the model

In [0]:
%sh
rm -rf /dbfs/tmp/xgboost/pipeline_001
rm -rf /dbfs/tmp/xgboost/pipelineModel_001

In [0]:
# Save the pipeline that created the model
pipeline.save('/tmp/xgboost/pipeline_001')

# Save the model itself
pipelineModel.save('/tmp/xgboost/pipelineModel_001')

# Load the pipeline
loaded_pipeline = Pipeline.load('/tmp/xgboost/pipeline_001')

## Predict from loaded pipeline

In [0]:
# # Load and use the model
# from pyspark.ml import PipelineModel

# loaded_pipelineModel = PipelineModel.load('/tmp/xgboost/pipelineModel_001')

# # To represent new data, use the first 3 rows of the test dataset
# new_data = df_test.limit(3)

# # Make predictions with the loaded model
# new_preds = loaded_pipelineModel.transform(new_data)
# display(new_preds.select("total_amount", "prediction", *featuresCols))

total_amount,prediction,vendor_id,passenger_count,trip_distance,rate_code_id,year,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount
6.8,6.8296074867248535,1,1,1.1,1,2012,1,5.3,0.0,0.5,1.0,0.0
8.04,7.893260955810547,1,1,1.5,1,2012,1,5.7,0.5,0.5,1.34,0.0
21.9,21.11377716064453,1,2,6.2,1,2012,2,20.9,0.5,0.5,0.0,0.0
