In [6]:
# Fetch only the 2018 file of the Kaggle airline-delay dataset, extract it into
# the working directory, and delete the archive afterwards.
# NOTE(review): the Kaggle CLI must be installed and have API credentials
# configured for the download to succeed -- confirm for your environment.
!kaggle datasets download -d sherrytp/airline-delay-analysis -f "airline delay analysis/2018.csv"
# -o overwrites an existing 2018.csv without prompting, so the cell can be
# re-run (e.g. Restart & Run All) without waiting on an interactive question.
!unzip -o 2018.csv.zip
!rm 2018.csv.zip

Downloading 2018.csv.zip to /workspace/pyspark-airline-delay-classification
 87%|███████████████████████████████████▋     | 207M/238M [00:00<00:00, 432MB/s]
100%|█████████████████████████████████████████| 238M/238M [00:00<00:00, 420MB/s]
Archive:  2018.csv.zip
  inflating: 2018.csv                


In [7]:
# Initialize PySpark: create the Spark session (or reuse one that is already
# running) that is used below to read the data.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("airline-delay-regression")
    .getOrCreate()
)

In [8]:
# Plotly's default renderer is switched to the iframe-based one; this is
# needed to make Jupyter work with Gitpod.
from plotly import io as pio

pio.renderers.default = "iframe_connected"

In [9]:
# Load the extracted CSV into a Spark dataframe. The first row supplies the
# column names and Spark infers each column's type from the data; the
# resulting schema is printed for inspection.
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv("2018.csv")
df.printSchema()



root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: double (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- TAXI_OUT: double (nullable = true)
 |-- WHEELS_OFF: double (nullable = true)
 |-- WHEELS_ON: double (nullable = true)
 |-- TAXI_IN: double (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- ARR_TIME: double (nullable = true)
 |-- ARR_DELAY: double (nullable = true)
 |-- CANCELLED: double (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: double (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: double (nullable = true)
 |-- AIR_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- CARRIER_DELAY: double (nullable = true)
 |-- WEATHER_DELAY: do

                                                                                

In [56]:
# import matplotlib.pyplot as plt

# df.plot.scatter(x='FL_DATE', y='ARR_DELAY')

ValueError: Unable to parse datatype from schema. [Errno 111] Connection refused

In [10]:
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

# Columns removed from the dataframe before feature engineering.
columns_to_drop = [
    "Unnamed: 27",
    "LATE_AIRCRAFT_DELAY",
    "SECURITY_DELAY",
    "NAS_DELAY",
    "WEATHER_DELAY",
    "CARRIER_DELAY",
    "AIR_TIME",
    "ACTUAL_ELAPSED_TIME",
    "DIVERTED",
    "CANCELLATION_CODE",
    "CANCELLED",
    "ARR_TIME",
    "TAXI_IN",
    "WHEELS_ON",
    "WHEELS_OFF",
    "TAXI_OUT",
    "DEP_TIME",
    "OP_CARRIER_FL_NUM",
    "ARR_DELAY",
]
df = df.drop(*columns_to_drop)

# Show which columns are left
df.printSchema()

@udf(returnType=IntegerType())
def get_month(date):
    """Extract the month from a dash-separated date string.

    Args:
        date: Date string whose second ``-``-separated field is the month,
            or ``None`` for a missing value.

    Returns:
        The month as an ``int``, or ``None`` when ``date`` is ``None``.
    """
    # FL_DATE is nullable and this UDF runs before the later dropna(), so a
    # NULL date arrives here as None; return None instead of raising on .split.
    if date is None:
        return None
    return int(date.split("-")[1])

@udf(returnType=IntegerType())
def get_day(date):
    """Extract the day of the month from a dash-separated date string.

    Args:
        date: Date string whose third ``-``-separated field is the day,
            or ``None`` for a missing value.

    Returns:
        The day as an ``int``, or ``None`` when ``date`` is ``None``.
    """
    # FL_DATE is nullable and this UDF runs before the later dropna(), so a
    # NULL date arrives here as None; return None instead of raising on .split.
    if date is None:
        return None
    return int(date.split("-")[2])

@udf(returnType=IntegerType())
def get_year(date):
    """Extract the year from a dash-separated date string.

    Args:
        date: Date string whose first ``-``-separated field is the year,
            or ``None`` for a missing value.

    Returns:
        The year as an ``int``, or ``None`` when ``date`` is ``None``.
    """
    # FL_DATE is nullable and this UDF runs before the later dropna(), so a
    # NULL date arrives here as None; return None instead of raising on .split.
    if date is None:
        return None
    return int(date.split("-")[0])

# Derive integer YEAR, MONTH and DAY columns from the FL_DATE string, in that
# order, using the UDFs defined above.
# NOTE(review): Spark's built-in split()/cast() could replace these Python
# UDFs and avoid per-row Python overhead -- consider if this cell is slow.
date_part_udfs = {"YEAR": get_year, "MONTH": get_month, "DAY": get_day}
for part_name, extract_part in date_part_udfs.items():
    df = df.withColumn(part_name, extract_part(df["FL_DATE"]).cast(IntegerType()))

# The raw date string is no longer needed once its parts exist; afterwards
# discard every row that still has a null in any remaining column.
df = df.drop("FL_DATE").dropna()

df.printSchema()

# Preview the first 5 rows, then report how many rows are left
df.show(5)
print(df.count())

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)

root
 |-- OP_CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)

+----------+------+----+------------+---------+------------+----------------+--------+----+-----+---+
|OP_CARRIER|ORIGIN|DEST|CRS_DEP_TIME|DEP_DELAY|CRS_ARR_TIME|CRS_EL



7096202


                                                                                

In [16]:
# Preparing the data
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder

# Split the rows roughly 80/20 into training and evaluation sets. A fixed seed
# is passed so that re-running the notebook on the same data yields the same
# split, which keeps the metrics reported below comparable between runs.
SPLIT_SEED = 42
train, test = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Encode the categorical features using StringIndexer.
# handleInvalid="keep" assigns values that were not seen while fitting (for
# example an airport that only occurs in the test split) to an extra index
# instead of raising an error when the fitted pipeline transforms new data.
indexer = StringIndexer(
    inputCols=["OP_CARRIER", "ORIGIN", "DEST", "DAY", "MONTH"],
    outputCols=["OP_CARRIER_INDEX", "ORIGIN_INDEX", "DEST_INDEX", "DAY_INDEX", "MONTH_INDEX"],
    handleInvalid="keep",
)

# Use one hot encoding to turn each category index into a sparse vector
encoder = OneHotEncoder(
    inputCols=["OP_CARRIER_INDEX", "ORIGIN_INDEX", "DEST_INDEX", "DAY_INDEX", "MONTH_INDEX"],
    outputCols=["OP_CARRIER_VEC", "ORIGIN_VEC", "DEST_VEC", "DAY_VEC", "MONTH_VEC"],
)

# Create the assembler that concatenates the encoded vectors and the numeric
# columns into the single "features" vector consumed by the models
assembler = VectorAssembler(
    inputCols=["OP_CARRIER_VEC", "ORIGIN_VEC", "DEST_VEC", "CRS_DEP_TIME", "CRS_ARR_TIME", "DAY_VEC", "MONTH_VEC", "YEAR"],
    outputCol="features",
)

# Count the distinct values of each categorical column (origin, destination,
# carrier) and keep the largest of those counts.
num_of_origins, num_of_destinations, num_of_carriers = (
    df.select(column_name).distinct().count()
    for column_name in ("ORIGIN", "DEST", "OP_CARRIER")
)
max_num_of_categorical_features = max(
    num_of_origins,
    num_of_destinations,
    num_of_carriers,
)
print("max categories:", max_num_of_categorical_features)



max categories: 358


                                                                                

# First Model: Linear Regression

In [48]:
# Linear Regression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator


# Linear regressor predicting DEP_DELAY from the assembled feature vector,
# with a regularization parameter of 0.3
lr = LinearRegression(
    featuresCol="features",
    labelCol="DEP_DELAY",
    regParam=0.3,
)

# Chain the feature-preparation stages with the regressor
feature_stages = [indexer, encoder, assembler]
pipeline = Pipeline(stages=feature_stages + [lr])

# Fit the whole pipeline on the training split
model = pipeline.fit(train)

# Score the held-out split
predictions = model.transform(test)

# Report the root-mean-squared error of the predictions
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="DEP_DELAY",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("RMSE:", rmse)



RMSE: 44.274917223028034


                                                                                

In [44]:
# Mean and standard deviation of the label on the test split, printed as a
# reference point for the RMSE values reported by the models
delay_column = test.select("DEP_DELAY")
for title, aggregate in (("Mean:", "mean"), ("Std:", "stddev")):
    print(title, delay_column.agg({"DEP_DELAY": aggregate}).collect()[0][0])

                                                                                

Mean: 9.93221958518358




Std: 44.739548581249586


                                                                                

# Second Model: Decision Tree Regressor

In [None]:
# Decision Tree Regression Model
from pyspark.ml.regression import DecisionTreeRegressor

# Depth-limited decision tree predicting DEP_DELAY; maxBins is set to the
# largest category count computed earlier
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="DEP_DELAY",
    maxDepth=5,
    maxBins=max_num_of_categorical_features,
)

# Chain the feature-preparation stages with the tree
tree_stages = [indexer, encoder, assembler, dtr]
pipeline = Pipeline(stages=tree_stages)

# Fit the whole pipeline on the training split
model = pipeline.fit(train)

# Score the held-out split
predictions = model.transform(test)

# Report the root-mean-squared error of the predictions
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="DEP_DELAY",
    predictionCol="prediction",
)
tree_rmse = evaluator.evaluate(predictions)
print("RMSE (Decision Tree Regression):", tree_rmse)