# Set Up

In [None]:
# Optional one-time setup: uncomment the next line to install PySpark if it is
# not already available in this environment.
#!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Bucketizer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse, if one already exists) a Spark session running in local
# mode with the 'local[*]' master, labelled 'Flight Data Analysis'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Flight Data Analysis')
    .getOrCreate()
)

# Read Data

In [None]:
# Load the flight records from 'flights.csv'.
#   header=True      -> the first row supplies the column names
#   inferSchema=True -> column types are inferred from the data
#   nullValue='NA'   -> the literal string 'NA' is read as a null
flights = spark.read.csv(
    'flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

In [None]:
# Count the loaded rows (this triggers a Spark job) and report the total
n_loaded = flights.count()
print("The data contain %d records." % n_loaded)

The data contain 50000 records.


In [None]:
# Preview the data: print only the first five rows as a text table
flights.show(n=5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [None]:
# Print every column's name, its (inferred) data type, and whether it is nullable
flights.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: integer (nullable = true)



In [None]:
# Show summary statistics (count, mean, stddev, ...) for every column.
# A count below the total row count for a column indicates missing values.
flights.describe().show()

+-------+-----------------+-----------------+-----------------+-------+------------------+-----+----------------+------------------+-----------------+------------------+
|summary|              mon|              dom|              dow|carrier|            flight|  org|            mile|            depart|         duration|             delay|
+-------+-----------------+-----------------+-----------------+-------+------------------+-----+----------------+------------------+-----------------+------------------+
|  count|            50000|            50000|            50000|  50000|             50000|50000|           50000|             50000|            50000|             47022|
|   mean|           5.2351|         15.66196|          2.95236|   NULL|        2054.31344| NULL|       882.40112|14.130952600000064|        151.76582|28.663795670111863|
| stddev|3.437758623534696|8.772488135606777|1.966033503314405|   NULL|2182.4715300582875| NULL|701.232785607705| 4.694052286573998|87.04507290261697|

# Clean Data

In [None]:
# Remove the 'flight' column; it is not used by any later step in this notebook
# (presumably because a flight number is an identifier, not a predictor).
flights = flights.drop('flight')

In [None]:
# Keep only the rows in which 'delay' is present (non-null)
flights = flights.where('delay IS NOT NULL')

In [None]:
# Discard every row that still has a null in at least one column,
# then report how many rows survive the cleaning steps.
flights = flights.dropna(how='any')
n_clean = flights.count()
print("After cleaning, the data contain %d records." % n_clean)

After cleaning, the data contain 47022 records.


# Feature Engineering

In [None]:
# 0. Derive 'km' from 'mile' (multiply by 1.60934 and round to 0 decimals),
#    then discard the original 'mile' column.
#    Note: `round` is pyspark.sql.functions.round (see the imports cell),
#    which operates on Spark columns -- it is not Python's builtin round.
km_column = round(flights['mile'] * 1.60934, 0)
flights = flights.withColumn('km', km_column).drop('mile')

In [None]:
# 1. Map the categorical string columns 'carrier' and 'org' to numeric index
#    columns ('carrier_idx' and 'org_idx').
# NOTE(review): 'carrier_idx' is not listed in the VectorAssembler inputCols
# used in this notebook, so the carrier does not reach the model -- confirm
# whether that is intentional.
indexer_carrier = StringIndexer(inputCol='carrier', outputCol='carrier_idx')
indexer_org = StringIndexer(inputCol='org', outputCol='org_idx')

# Fit each indexer on the data, then append its index column.
model_carrier = indexer_carrier.fit(flights)
flights = model_carrier.transform(flights)

model_org = indexer_org.fit(flights)
flights = model_org.transform(flights)

In [None]:
# 2. Turn the numeric origin index 'org_idx' into the one-hot vector column
#    'org_dummy'.
encoder_org = OneHotEncoder(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
)
org_encoder_model = encoder_org.fit(flights)
flights = org_encoder_model.transform(flights)

In [None]:
# 3. Bin the departure time 'depart' into 3-hour intervals; the bucket number
#    is written to 'depart_bucket'.
depart_splits = list(range(0, 25, 3))  # boundaries 0, 3, 6, ..., 24
bucketizer = Bucketizer(
    splits=depart_splits,
    inputCol='depart',
    outputCol='depart_bucket',
)
flights = bucketizer.transform(flights)

In [None]:
# 4. Turn the departure bucket number 'depart_bucket' into the one-hot vector
#    column 'depart_dummy'.
encoder_depart = OneHotEncoder(
    inputCols=['depart_bucket'],
    outputCols=['depart_dummy'],
)
depart_encoder_model = encoder_depart.fit(flights)
flights = depart_encoder_model.transform(flights)

In [None]:
# 5. One-hot encode the calendar columns: 'dow' (day of week) -> 'dow_dummy'
#    and 'mon' (month) -> 'mon_dummy'. Both are already integer-coded, so they
#    are fed to the encoder directly without a StringIndexer step.
encoder_dow = OneHotEncoder(inputCols=['dow'], outputCols=['dow_dummy'])
encoder_mon = OneHotEncoder(inputCols=['mon'], outputCols=['mon_dummy'])

# Fit and apply the encoders one after the other, day of week first.
for calendar_encoder in (encoder_dow, encoder_mon):
    flights = calendar_encoder.fit(flights).transform(flights)

In [None]:
# 6. Concatenate the numeric 'km' column and the four one-hot vectors into the
#    single vector column 'features' consumed by the regression models.
feature_columns = ['km', 'org_dummy', 'depart_dummy', 'dow_dummy', 'mon_dummy']
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
flights = assembler.transform(flights)

# Build Linear Regression Model

In [None]:
# Randomly partition the rows into roughly 80% training / 20% test data;
# the fixed seed keeps the split reproducible between runs.
train_data, test_data = flights.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Fit an ordinary (unregularized) linear regression that predicts 'duration'
# from the assembled 'features' vector, using the training split only.
regression = LinearRegression(
    labelCol='duration',
    featuresCol='features',
)
model = regression.fit(train_data)

In [None]:
# Make predictions: score the held-out test split with the fitted model
predictions = model.transform(test_data)

# Evaluate Model

In [None]:
# Measure test-set accuracy as the root mean square error between the
# predictions and the true 'duration' values.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='duration',
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Square Error (RMSE) on test data =", rmse)

# Show the fitted parameters: one coefficient per slot of the 'features'
# vector, followed by the intercept.
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

Root Mean Square Error (RMSE) on test data = 10.554263427234297
Coefficients: [0.07444147376085378,27.476492289083048,20.31885115059569,51.823424122389596,45.89694430098321,15.295887358001956,17.71215150132268,17.35511231496498,-15.117129298334262,1.5041021032109734,3.9491015790392745,6.891971690056981,4.563688708470956,8.720145274986374,8.657433176655813,0.46418453532851983,0.25366681707506505,-0.06993764706617553,0.353176161751262,0.5382243945078051,0.27174673946852906,-2.0069758486591036,-2.3183604639142095,-2.196580694340696,-3.5981266246763735,-4.463505870693827,-4.3785957245265354,-4.693367973095902,-4.5338012134263135,-4.051231426350411,-3.0499862274969116,-1.1243430652560114]
Intercept: 13.037680850267934


# Enhancing the model with Lasso regularization

In [None]:
# Fit the same regression with Lasso regularization:
#   regParam=0.1       -> regularization strength
#   elasticNetParam=1  -> pure L1 penalty (Lasso), no L2 (ridge) component
lasso_regression = LinearRegression(
    labelCol='duration',
    featuresCol='features',
    regParam=0.1,
    elasticNetParam=1,
)
lasso_model = lasso_regression.fit(train_data)

In [None]:
# Make predictions with the Lasso model on the same held-out test split
lasso_predictions = lasso_model.transform(test_data)

In [None]:
# Evaluate the Lasso model with the same RMSE evaluator used for the
# unregularized model, so the two test-set scores are directly comparable
lasso_rmse = evaluator.evaluate(lasso_predictions)
print("Lasso RMSE on test data =", lasso_rmse)

Lasso RMSE on test data = 10.71063539020168


In [None]:
# Print Lasso coefficients and intercept for interpretation.
# Coefficients printed as exactly 0.0 are features the L1 penalty has
# removed from the model.
print("Lasso Coefficients:", lasso_model.coefficients)
print("Lasso Intercept:", lasso_model.intercept)

Lasso Coefficients: [0.07426817733029681,17.184711091156284,9.83153244819494,41.42335140494034,35.36935057230134,4.621911085273369,7.062042940049097,6.487205861832949,-13.022688634100811,0.0,1.5499011765161845,4.486505196250478,1.9266849624651667,6.190241452445093,6.163164451019345,0.0,0.0,-0.04845823132271837,0.0,0.0,0.0,0.0,0.0,0.0,-1.1078550426425793,-1.9437459232211398,-1.9283390637615994,-2.269419616922474,-2.0663779889838327,-1.4876332588464753,-0.5561872969405196,0.6767355902439556]
Lasso Intercept: 23.938784192625178
