# Generating Models for Each Airport Destination 

---

We tune the hyperparameters for the GBTRegressor

For a more comprehsneive tuning process, we utilize k fold cross validation, with various tree sizes and maximum depths

---

## Load Spark & Data

In [0]:
# %pip install xgboost 

In [0]:
# %pip install hyperopt

In [0]:
dbutils.library.restartPython()

In [0]:

# from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, SparkTrials
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from xgboost.spark import SparkXGBRegressor
from pyspark.sql import SparkSession

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
from pyspark.ml.linalg import VectorUDT

In [0]:
spark = SparkSession.builder \
.appName("xgboost_large") \
.config("spark.executor.memory", "8g") \
.config("spark.driver.memory", "4g") \
.config("spark.executor.cores", "2") \
.config("spark.executor.instances", "4") \
.getOrCreate()

In [0]:
# REPLACE WITH PROCESSED DATA FILEPATH
DATA_PATH = "/mnt/nguyen1/itineraries_processed_0.1.parquet"

In [0]:
# display(dbutils.fs.ls("/mnt/nguyen1/itineraries_processed_0.1.parquet/"))

In [0]:
# display(df)

In [0]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [0]:
train_data.count()

6029346

In [0]:
test_data.count()

1508468

## Create Vector Assembler

In [0]:
feature_columns = df.columns[:-1]
feature_columns.remove('totalFare')

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# df_ass = assembler.transform(df)

## XGboost Tuning

In [0]:
xgb_regressor = SparkXGBRegressor(
    features_col="features", 
    label_col="totalFare", 
    prediction_col="prediction",
    objective="reg:squarederror",
)

# Define the pipeline with stages
pipeline = Pipeline(stages=[assembler, xgb_regressor])


paramGrid = ParamGridBuilder() \
    .addGrid(xgb_regressor.max_depth, [3, 5, 7, 9, 10]) \
    .addGrid(xgb_regressor.n_estimators, [10, 15, 20, 40, 30]) \
    .build()

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(
        labelCol="totalFare",
        predictionCol="prediction",
        metricName="rmse"
    ),
    numFolds=3
)

# Fit the model
cv_model = crossval.fit(train_data)

# Make predictions
prediction = cv_model.transform(test_data)
# prediction.show()


2024-04-12 22:10:05,539 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 3, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 10}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-04-12 22:11:03,787 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-04-12 22:11:51,706 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 3, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 15}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-04-12 22:12:26,944 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-04-12 22:12:34,662 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 3, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbos

In [0]:
# Define evaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="totalFare", metricName="rmse")

# Evaluate the best model
rmse = evaluator.evaluate(prediction)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator.setMetricName("mae")
mae = evaluator.evaluate(prediction)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

Root Mean Squared Error (RMSE) on test data = 106.358
Mean Absolute Error (MAE) on test data = 64.2874


In [0]:
# Assuming 'cvModel' is the fitted CrossValidator model
avg_metrics = cv_model.avgMetrics  # This holds the average RMSE for each combination

# Iterate over each combination of parameters and their corresponding metric
for i, params in enumerate(paramGrid):
    # Extract the specific parameters
    max_depth = params[xgb_regressor.max_depth]
    num_round = params[xgb_regressor.n_estimators]
    rmse = avg_metrics[i]  # Assuming RMSE is the metric used in CrossValidator setup

    # Print each combination with its RMSE
    print(f"Max Depth: {max_depth}, Num Rounds: {num_round}, RMSE: {rmse}")


Max Depth: 3, Num Rounds: 10, RMSE: 137.9636213333039
Max Depth: 3, Num Rounds: 15, RMSE: 133.5397413595651
Max Depth: 3, Num Rounds: 20, RMSE: 131.87884605262445
Max Depth: 3, Num Rounds: 40, RMSE: 128.3961015423497
Max Depth: 3, Num Rounds: 30, RMSE: 129.82932742041135
Max Depth: 5, Num Rounds: 10, RMSE: 131.49060953939204
Max Depth: 5, Num Rounds: 15, RMSE: 128.14050110122835
Max Depth: 5, Num Rounds: 20, RMSE: 126.26344183465767
Max Depth: 5, Num Rounds: 40, RMSE: 122.01823033996921
Max Depth: 5, Num Rounds: 30, RMSE: 123.65954390702053
Max Depth: 7, Num Rounds: 10, RMSE: 126.19766141715927
Max Depth: 7, Num Rounds: 15, RMSE: 122.16652700354892
Max Depth: 7, Num Rounds: 20, RMSE: 120.3240025228376
Max Depth: 7, Num Rounds: 40, RMSE: 115.66238708143037
Max Depth: 7, Num Rounds: 30, RMSE: 117.7248216610373
Max Depth: 9, Num Rounds: 10, RMSE: 120.52302556843563
Max Depth: 9, Num Rounds: 15, RMSE: 116.374211620966
Max Depth: 9, Num Rounds: 20, RMSE: 114.31391543661759
Max Depth: 9, Num

In [0]:
# Save the model to a specified path
modelPath = "/mnt/nguyen1/xgb_model_param"
cv_model.save(modelPath)

In [0]:
# spark.stop()