# Hyperparameter Tuning

---

We tune the hyperparameters for the GBTRegressor

For a more comprehsneive tuning process, we utilize k fold cross validation, with various tree sizes and maximum depths

---

## Load Spark & Data

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder \
.appName("gbt_tune") \
.config("spark.executor.memory", "8g") \
.config("spark.driver.memory", "4g") \
.config("spark.executor.cores", "2") \
.config("spark.executor.instances", "4") \
.getOrCreate()

In [0]:
# display(dbutils.fs.ls("/mnt/nguyen1/starting_paths/startingAirport=ATL"))

In [0]:
# REPLACE WITH PROCESSED DATA FILEPATH
DATA_PATH = "/mnt/nguyen1/starting_paths/startingAirport=ATL"

In [0]:
df = spark.read.parquet(DATA_PATH)

In [0]:
df.show()

+--------------+------------+---------+---------+------------------+---+-----------------------+--------------------------+--------+--------+--------------------+--------------+-----------------+------------+---------------+
|isBasicEconomy|isRefundable|isNonStop|totalFare|days_before_flight|day|startingAirport_encoded|destinationAirport_encoded|num_legs|All_Same|airline_name_encoded|departure_hour|departure_dow_idx|starting_pop|destination_pop|
+--------------+------------+---------+---------+------------------+---+-----------------------+--------------------------+--------+--------+--------------------+--------------+-----------------+------------+---------------+
|             0|           0|        0|    696.6|                33| 22|         (15,[7],[1.0])|            (15,[3],[1.0])|       2|       1|      (13,[0],[1.0])|            10|              6.0|   24.780424|       13.12786|
|             0|           0|        1|    608.6|                33| 22|         (15,[7],[1.0])|    

In [0]:
df.show()

+--------------+------------+---------+---------+------------------+---+-----------------------+--------------------------+--------+--------+--------------------+---------+--------------+-----------------+------------+---------------+
|isBasicEconomy|isRefundable|isNonStop|totalFare|days_before_flight|day|startingAirport_encoded|destinationAirport_encoded|num_legs|All_Same|airline_name_encoded| distance|departure_hour|departure_dow_idx|starting_pop|destination_pop|
+--------------+------------+---------+---------+------------------+---+-----------------------+--------------------------+--------+--------+--------------------+---------+--------------+-----------------+------------+---------------+
|             0|           0|        0|    239.6|                35| 17|         (15,[1],[1.0])|            (15,[2],[1.0])|       2|       1|      (13,[0],[1.0])|2231.2368|            15|              4.0|       100.0|      30.146616|
|             0|           0|        0|    239.6|           

In [0]:
df.count()

5141232

In [0]:
df = df.drop(*["startingAirport", "destinationAirport"])

In [0]:
# train test split
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

## Create Vector Assembler

In [0]:
feature_columns = df.columns[:-1]
feature_columns.remove('totalFare')

# Assemble features into a vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
# df_ass = assembler.transform(df)

## Run Model

In [0]:
# Initialize the GBTRegressor
gbt = GBTRegressor(featuresCol="features", labelCol="totalFare")

# Define the pipeline with the stages
pipeline = Pipeline(stages=[assembler, gbt])

# Define evaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="totalFare", metricName="rmse")

# Create ParamGrid for Cross Validation
paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [2, 4  ]) \
    .addGrid(gbt.maxIter, [10, 20]) \
    .build()

# Create CrossValidator

In [0]:
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# Run cross-validation
cvModel = cv.fit(train_data)

## Investigate the best performing model

In [0]:
# Get the best model
best_model = cvModel.bestModel

# Access the stages of the pipeline
stages = best_model.stages

# Access the parameters of the RandomForestRegressor stage
rf_params = stages[-1].extractParamMap()

# Print the parameters
print("Best Model Parameters:")
for param, value in rf_params.items():
    print(param.name, ":", value)

Best Model Parameters:
cacheNodeIds : False
checkpointInterval : 10
featureSubsetStrategy : all
featuresCol : features
impurity : variance
labelCol : totalFare
leafCol : 
lossType : squared
maxBins : 32
maxDepth : 4
maxIter : 20
maxMemoryInMB : 256
minInfoGain : 0.0
minInstancesPerNode : 1
minWeightFractionPerNode : 0.0
predictionCol : prediction
seed : -6682481135904123338
stepSize : 0.1
subsamplingRate : 1.0
validationTol : 0.01


## Evaluation

In [0]:
# Make predictions on test data using the best model
predictions = best_model.transform(test_data)

# Evaluate the model on test data
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data:", rmse)

Root Mean Squared Error (RMSE) on test data: 99.76132813694421


In [0]:
# Save the model to a specified path
modelPath = "/mnt/nguyen1/gbt_atl_model"
cvModel.save(modelPath)