# Set Up

In [None]:
# !pip install pyspark==3.3.1

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [None]:
# Build (or reuse) a SparkSession running in local mode
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Flight Data Analysis')
    .getOrCreate()
)

# Read Data

In [None]:
# Load the flights CSV: the first row is the header, column types are
# inferred, and the literal string 'NA' is read as null
flights = spark.read.csv(
    'flights.csv',
    header=True,
    inferSchema=True,
    sep=',',
    nullValue='NA',
)

In [None]:
# Report how many rows were loaded
print("The data contain %d records." % (flights.count(),))

The data contain 50000 records.


In [None]:
# Peek at the first five rows
flights.show(n=5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [None]:
# Print each column's name, inferred type and nullability
flights.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: integer (nullable = true)



In [None]:
# Show summary statistics (count, mean, stddev, ...) for every column;
# a count below the total row count reveals columns that contain nulls
flights.describe().show()

+-------+-----------------+-----------------+-----------------+-------+------------------+-----+----------------+------------------+-----------------+------------------+
|summary|              mon|              dom|              dow|carrier|            flight|  org|            mile|            depart|         duration|             delay|
+-------+-----------------+-----------------+-----------------+-------+------------------+-----+----------------+------------------+-----------------+------------------+
|  count|            50000|            50000|            50000|  50000|             50000|50000|           50000|             50000|            50000|             47022|
|   mean|           5.2351|         15.66196|          2.95236|   null|        2054.31344| null|       882.40112|14.130952600000064|        151.76582|28.663795670111863|
| stddev|3.437758623534696|8.772488135606777|1.966033503314405|   null|2182.4715300582875| null|701.232785607705| 4.694052286573998|87.04507290261697|

# Clean Data

In [None]:
# Remove the 'flight' column (it is not among the feature columns assembled later)
flights = flights.drop('flight')

In [None]:
# Keep only the rows whose 'delay' value is present
flights = flights.where('delay IS NOT NULL')

In [None]:
# Drop every row that still has a null in any column, then report what is left
flights = flights.na.drop()
print("After cleaning, the data contain %d records." % (flights.count(),))

After cleaning, the data contain 47022 records.


# Pipeline

In [None]:
# Step 0: Convert 'mile' to 'km' and drop 'mile' column
# Conversion factor: kilometres per mile.
KM_PER_MILE = 1.60934

# Only convert while 'mile' is still present. This keeps the cell idempotent:
# the original version dropped 'mile' and then failed when the cell was run a
# second time, because `flights.mile` no longer existed.
# NOTE: `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), not the Python builtin.
if 'mile' in flights.columns:
    flights = flights.withColumn('km', round(flights['mile'] * KM_PER_MILE, 0)).drop('mile')

In [None]:
# Step 1: Map each string value of 'org' to a numeric category index in 'org_idx'
indexer = StringIndexer(
    inputCol='org',
    outputCol='org_idx',
)

In [None]:
# Step 2: Turn the category indices in 'org_idx' and 'dow' into one-hot (dummy) vectors
onehot = OneHotEncoder(inputCols=['org_idx', 'dow'], outputCols=['org_dummy', 'dow_dummy'])

In [None]:
# Step 3: Concatenate 'km' and the two dummy vectors into the single 'features' vector column
assembler = VectorAssembler(outputCol='features', inputCols=['km', 'org_dummy', 'dow_dummy'])

In [None]:
# Step 4: Linear regression estimator whose target (label) is 'duration'
regression = LinearRegression(
    labelCol='duration',
)

In [None]:
# Step 5: Chain the stages in the order they must run
pipeline = Pipeline(
    stages=[
        indexer,     # 'org' -> 'org_idx'
        onehot,      # 'org_idx', 'dow' -> 'org_dummy', 'dow_dummy'
        assembler,   # 'km' + dummies -> 'features'
        regression,  # 'features' -> prediction of 'duration'
    ]
)

In [None]:
# Step 6: Split the data 80/20 into train and test sets (seeded so the split is
# repeatable), then fit the whole pipeline on the training portion
flights_train, flights_test = flights.randomSplit(weights=[0.8, 0.2], seed=42)
model = pipeline.fit(dataset=flights_train)

In [None]:
# Step 7: Run the fitted pipeline over the held-out test split to obtain predictions
predictions = model.transform(dataset=flights_test)

In [None]:
# Step 8: Measure the test-set error as RMSE against the 'duration' label
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='duration',
)
rmse = evaluator.evaluate(dataset=predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

Root Mean Squared Error (RMSE) on test data = 10.958803623058985


In [None]:
# Step 9: Build the hyper-parameter grid to search:
# 4 regularisation strengths x 3 elastic-net mixing values = 12 combinations
paramGrid = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [None]:
# Step 10: Cross-validator to find the best model parameters
# The pipeline is refit for every parameter combination on each of the 5 folds
# and scored with `evaluator` (RMSE).
# Rows are assigned to folds at random, so the seed is fixed to make the
# selected parameters and reported scores reproducible across runs.
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=42)

In [None]:
# Step 11: Run the cross-validated grid search on the training split
cvModel = cv.fit(dataset=flights_train)

In [None]:
# Step 12: Score the cross-validation winner on the held-out test split
bestModel = cvModel.bestModel
predictions = bestModel.transform(dataset=flights_test)
bestRmse = evaluator.evaluate(dataset=predictions)
print(f"Best RMSE on test data = {bestRmse}")

# The last pipeline stage is the fitted regression model; print its full parameter map
print(bestModel.stages[-1].extractParamMap())

Best RMSE on test data = 10.958772686128164
{Param(parent='LinearRegression_edee7ebcdf87', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LinearRegression_edee7ebcdf87', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LinearRegression_edee7ebcdf87', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35, Param(parent='LinearRegression_edee7ebcdf87', name='featuresCol', doc='features column name.'): 'features', Param(parent='LinearRegression_edee7ebcdf87', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LinearRegression_edee7ebcdf87', name='labelCol', doc='label column name.'): 'duration', Param(parent='LinearRegression_edee7ebcdf87', name='loss', doc='The loss function to be optimized. Supported optio