# Generating Models for Each Airport Destination 

---

We tune the hyperparameters for the GBTRegressor

For a more comprehsneive tuning process, we utilize k fold cross validation, with various tree sizes and maximum depths

---

## Load Spark & Data

In [0]:
%pip install xgboost 

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting xgboost
  Using cached xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
dbutils.library.restartPython()

In [0]:

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, SparkTrials
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from xgboost.spark import SparkXGBRegressor
from pyspark.sql import SparkSession

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-3535140287728786>, line 1[0m
[0;32m----> 1[0m [38;5;28;01mfrom[39;00m [38;5;21;01mhyperopt[39;00m [38;5;28;01mimport[39;00m fmin, tpe, hp, Trials, STATUS_OK, SparkTrials
[1;32m      2[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01mml[39;00m [38;5;28;01mimport[39;00m Pipeline
[1;32m      3[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01mml[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfeature[39;00m [38;5;28;01mimport[39;00m VectorAssembler

[0;31mModuleNotFoundError[0m: No module named 'hyperopt'

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
from pyspark.ml.linalg import VectorUDT

In [0]:
spark = SparkSession.builder \
.appName("airport_models") \
.config("spark.executor.memory", "8g") \
.config("spark.driver.memory", "4g") \
.config("spark.executor.cores", "2") \
.config("spark.executor.instances", "4") \
.getOrCreate()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-3535140287728788>, line 1[0m
[0;32m----> 1[0m spark [38;5;241m=[39m [43mSparkSession[49m[38;5;241m.[39mbuilder \
[1;32m      2[0m [38;5;241m.[39mappName([38;5;124m"[39m[38;5;124mairport_models[39m[38;5;124m"[39m) \
[1;32m      3[0m [38;5;241m.[39mconfig([38;5;124m"[39m[38;5;124mspark.executor.memory[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124m8g[39m[38;5;124m"[39m) \
[1;32m      4[0m [38;5;241m.[39mconfig([38;5;124m"[39m[38;5;124mspark.driver.memory[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124m4g[39m[38;5;124m"[39m) \
[1;32m      5[0m [38;5;241m.[39mconfig([38;5;124m"[39m[38;5;124mspark.executor.cores[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124m2[39m[38;5;124m"[39m) \
[1;32m      6[0m [38;5;241m.[39mconfig([38;5;124m"[39

In [0]:
# REPLACE WITH PROCESSED DATA FILEPATH
DATA_PATH = "/mnt/nguyen1/itineraries_processed_0.1.parquet"

In [0]:
df = spark.read.parquet('dbfs:/mnt/nguyen1/itineraries_processed_0.1.parquet')
df = df.sample(fraction=0.1, withReplacement=False, seed=42)

In [0]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

## Create Vector Assembler

In [0]:
feature_columns = df.columns[:-1]
feature_columns.remove('totalFare')

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# df_ass = assembler.transform(df)

## XGBoost

In [0]:
xgbRegressor = SparkXGBRegressor(
    features_col="features", 
    label_col="totalFare", 
    prediction_col="prediction",
    objective="reg:squarederror",
    numRound=4
    #numWorkers=3  # Adjust based on your Spark setup
)

# Define the pipeline with stages
pipeline = Pipeline(stages=[assembler, xgbRegressor])

# Define evaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="totalFare", metricName="rmse")

# Fit the pipeline to the training data
pipeline_model = pipeline.fit(train_data)




INFO:XGBoost-PySpark:Running xgboost-2.0.3 on 1 workers with
	booster params: {'objective': 'reg:squarederror', 'device': 'cpu', 'numRound': 4, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!


In [0]:
# Make predictions on the test set
predictions = pipeline_model.transform(test_data)

# Evaluate the best model
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)





Root Mean Squared Error (RMSE) on test data = 113.306
Root Mean Squared Error (RMSE) on test data = 113.306


[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-860581649654263>, line 15[0m
[1;32m     11[0m     [38;5;28;01mreturn[39;00m predictions[38;5;241m.[39mwithColumn([38;5;124m"[39m[38;5;124mabsolute_error[39m[38;5;124m"[39m, [38;5;28mabs[39m(col(labelCol) [38;5;241m-[39m col(predictionCol)))
[1;32m     14[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mRoot Mean Squared Error (RMSE) on test data = [39m[38;5;132;01m%g[39;00m[38;5;124m"[39m [38;5;241m%[39m rmse)
[0;32m---> 15[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mMean Absolute Error (MAE) on test data = [39m[38;5;132;01m%g[39;00m[38;5;124m"[39m [38;5;241m%[39m mae)

[0;31mNameError[0m: name 'mae' is not defined

In [0]:
evaluator.setMetricName("mae")
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)


Mean Absolute Error (MAE) on test data = 70.0779


## XGboost Tuning

In [0]:
xgb_regressor = SparkXGBRegressor(
    features_col="features", 
    label_col="totalFare", 
    prediction_col="prediction",
    objective="reg:squarederror",
)

# Define the pipeline with stages
pipeline = Pipeline(stages=[assembler, xgb_regressor])


paramGrid = ParamGridBuilder() \
    .addGrid(xgb_regressor.max_depth, [3, 5, 7, 9, 10]) \
    .addGrid(xgb_regressor.n_estimators, [10, 15, 20, 40, 30]) \
    .build()

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(
        labelCol="totalFare",
        predictionCol="prediction",
        metricName="rmse"
    ),
    numFolds=3
)

# Fit the model
cv_model = crossval.fit(train_data)

# Make predictions
prediction = cv_model.transform(test_data)
prediction.show()


2024-04-12 06:43:20,789 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 3, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 10}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-04-12 06:43:52,845 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-04-12 06:44:29,498 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 3, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 15}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-04-12 06:45:05,805 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-04-12 06:45:12,925 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 1 workers with
	booster params: {'device': 'cpu', 'max_depth': 3, 'objective': 'reg:squarederror', 'nthread': 1}
	train_call_kwargs_params: {'verbos

+--------------+------------+---------+---------+------------------+---+-----------------------+--------------------------+--------+--------+--------------------+---------+--------------+-----------------+------------+---------------+--------------------+------------------+
|isBasicEconomy|isRefundable|isNonStop|totalFare|days_before_flight|day|startingAirport_encoded|destinationAirport_encoded|num_legs|All_Same|airline_name_encoded| distance|departure_hour|departure_dow_idx|starting_pop|destination_pop|            features|        prediction|
+--------------+------------+---------+---------+------------------+---+-----------------------+--------------------------+--------+--------+--------------------+---------+--------------+-----------------+------------+---------------+--------------------+------------------+
|             0|           0|        0|    43.08|                16|  3|         (15,[6],[1.0])|            (15,[8],[1.0])|       2|       1|      (13,[4],[1.0])| 1049.121|   

In [0]:
# Define evaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="totalFare", metricName="rmse")

# Evaluate the best model
rmse = evaluator.evaluate(prediction)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator.setMetricName("mae")
mae = evaluator.evaluate(prediction)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

Root Mean Squared Error (RMSE) on test data = 120.632
Mean Absolute Error (MAE) on test data = 76.205


In [0]:
# Get the best model
best_model = cv_model.bestModel

# Access the stages of the pipeline
stages = best_model.stages

# Access the parameters of the RandomForestRegressor stage
rf_params = stages[-1].extractParamMap()

# Print the parameters
print("Best Model Parameters:")
for param, value in rf_params.items():
    print(param.name, ":", value)

Best Model Parameters:
enable_sparse_data_optim : False
featuresCol : features
features_cols : []
labelCol : totalFare
predictionCol : prediction
arbitrary_params_dict : {}
base_score : None
booster : None
callbacks : None
colsample_bylevel : None
colsample_bynode : None
colsample_bytree : None
device : cpu
early_stopping_rounds : None
eval_metric : None
feature_names : None
feature_types : None
feature_weights : None
force_repartition : False
gamma : None
grow_policy : None
importance_type : None
interaction_constraints : None
iteration_range : None
learning_rate : None
max_bin : None
max_cat_threshold : None
max_cat_to_onehot : None
max_delta_step : None
max_depth : 7
max_leaves : None
min_child_weight : None
missing : nan
monotone_constraints : None
multi_strategy : None
n_estimators : 20
num_parallel_tree : None
num_workers : 1
objective : reg:squarederror
random_state : None
reg_alpha : None
reg_lambda : None
repartition_random_shuffle : False
sampling_method : None
scale_pos_weig

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Create an evaluator for RMSE and MAE
evaluatorRMSE = RegressionEvaluator(
    labelCol="totalFare",
    predictionCol="prediction",
    metricName="rmse"
)

evaluatorMAE = RegressionEvaluator(
    labelCol="totalFare",
    predictionCol="prediction",
    metricName="mae"
)

# Evaluate the best model on a test set if you have separated one
# Assuming 'testData' is available
predictions = best_model.transform(test_data)

# Calculate RMSE and MAE
rmse = evaluatorRMSE.evaluate(predictions)
mae = evaluatorMAE.evaluate(predictions)
print("RMSE on test data:", rmse)
print("MAE on test data:", mae)



RMSE on test data: 120.63204640429475
MAE on test data: 76.20495934636185


In [0]:
# Assuming 'cvModel' is the fitted CrossValidator model
avg_metrics = cv_model.avgMetrics  # This holds the average RMSE for each combination

# Iterate over each combination of parameters and their corresponding metric
for i, params in enumerate(paramGrid):
    # Extract the specific parameters
    max_depth = params[xgb_regressor.max_depth]
    num_round = params[xgb_regressor.n_estimators]
    rmse = avg_metrics[i]  # Assuming RMSE is the metric used in CrossValidator setup

    # Print each combination with its RMSE
    print(f"Max Depth: {max_depth}, Num Rounds: {num_round}, RMSE: {rmse}")


Max Depth: 3, Num Rounds: 10, RMSE: 137.97848986734505
Max Depth: 3, Num Rounds: 15, RMSE: 134.32959671717995
Max Depth: 3, Num Rounds: 20, RMSE: 132.60990374977555
Max Depth: 5, Num Rounds: 10, RMSE: 131.45368530305385
Max Depth: 5, Num Rounds: 15, RMSE: 128.41338851141504
Max Depth: 5, Num Rounds: 20, RMSE: 126.58390977948856
Max Depth: 7, Num Rounds: 10, RMSE: 126.06244793316499
Max Depth: 7, Num Rounds: 15, RMSE: 122.84104203360154
Max Depth: 7, Num Rounds: 20, RMSE: 120.57143460060239


In [0]:
# Save the model to a specified path
modelPath = "/mnt/nguyen1/xgb_model_small"
cv_model.save(modelPath)

In [0]:
# spark.stop()