### Create a model to predict whether a flight arrives more than 15 minutes late (ARR_DEL15)


In [4]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or attach to one that is
# already running (getOrCreate).
spark = (
    SparkSession.builder
    .appName("Python Spark ML Class Assisgnment")
    .getOrCreate()
)

In [18]:
# read the file
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType
# Load the CSV: the first line is a header, the literal "NA" is read as null,
# and Spark infers each column's type from the data.
df = (
    spark.read.format("csv")
    .options(header="true", nullValue="NA", inferSchema=True)
    .load("flight_weather_small.csv")
)

In this dataset,

ARR_DEL15 : 1 when the flight is delayed over 15 minutes, 0 otherwise.
XXXOrigin : Weather conditions in departure airport.
XXXDest : Weather conditions in destination airport.

In [20]:
# Report the row count, then display the first two rows as the cell output.
print(f"There are {df.count()} rows in df. This is:")
df.take(2)

There are 99999 rows in df. This is:


[Row(X.1=1, YEAR=2012, MONTH=1, DAY_OF_MONTH=4, DAY_OF_WEEK=3, FL_DATE=datetime.date(2012, 1, 4), UNIQUE_CARRIER='AA', TAIL_NUM='N320AA', FL_NUM=1, ORIGIN_AIRPORT_ID=12478, ORIGIN='JFK', ORIGIN_STATE_ABR='NY', DEST_AIRPORT_ID=12892, DEST='LAX', DEST_STATE_ABR='CA', CRS_DEP_TIME=9, DEP_TIME=904, DEP_DELAY=4, DEP_DELAY_NEW=4, DEP_DEL15=0, DEP_DELAY_GROUP=0, TAXI_OUT=18, WHEELS_OFF=922, WHEELS_ON=1131, TAXI_IN=20, CRS_ARR_TIME=12, ARR_TIME=1151, ARR_DELAY=-34, ARR_DELAY_NEW=0, ARR_DEL15=0, ARR_DELAY_GROUP=-2, CANCELLED=0, CANCELLATION_CODE='', DIVERTED=0, CRS_ELAPSED_TIME=385, ACTUAL_ELAPSED_TIME=347, AIR_TIME=309, FLIGHTS=1, DISTANCE=2475, DISTANCE_GROUP=10, CARRIER_DELAY=None, WEATHER_DELAY=None, NAS_DELAY=None, SECURITY_DELAY=None, LATE_AIRCRAFT_DELAY=None, X=None, VisibilityOrigin=10.0, DryBulbCelsiusOrigin=-3.9, DewPointCelsiusOrigin=-19.4, RelativeHumidityOrigin=29.0, WindSpeedOrigin=7.0, AltimeterOrigin=30.1, VisibilityDest=10.0, DryBulbCelsiusDest=17.2, DewPointCelsiusDest=10.6, R

In [11]:
# Treat cancelled flights as delayed over 15 minutes (ARR_DEL15 = 1).

In [12]:
from pyspark.sql.functions import when
# Flag cancelled flights (CANCELLED == 1) as delayed by setting ARR_DEL15 to 1;
# every other row keeps its existing ARR_DEL15 value.
df = df.withColumn(
    "ARR_DEL15",
    when(df.CANCELLED == 1, 1).otherwise(df.ARR_DEL15),
)

In [14]:
# Keep only the flights that were not diverted (DIVERTED != 1).
# NOTE(review): rows with a null DIVERTED would presumably be removed by this
# comparison as well - confirm that is intended.
df = df.filter(df.DIVERTED != 1)

In [17]:
# Number of rows currently in df (shown as the cell output).
df.count()

99821

#### Narrow to required columns.

"ARR_DEL15",
  "MONTH",
  "DAY_OF_WEEK",
  "UNIQUE_CARRIER",
  "ORIGIN",
  "DEST",
  "CRS_DEP_TIME",
  "CRS_ARR_TIME",
  "RelativeHumidityOrigin",
  "AltimeterOrigin",
  "DryBulbCelsiusOrigin",
  "WindSpeedOrigin",
  "VisibilityOrigin",
  "DewPointCelsiusOrigin",
  "RelativeHumidityDest",
  "AltimeterDest",
  "DryBulbCelsiusDest",
  "WindSpeedDest",
  "VisibilityDest",
  "DewPointCelsiusDest"

In [21]:
# Keep only the label, the schedule/route columns and the weather readings
# for the origin and destination airports.
df = df.select(
    "ARR_DEL15",
    "MONTH", "DAY_OF_WEEK",
    "UNIQUE_CARRIER", "ORIGIN", "DEST",
    "CRS_DEP_TIME", "CRS_ARR_TIME",
    "RelativeHumidityOrigin", "AltimeterOrigin", "DryBulbCelsiusOrigin",
    "WindSpeedOrigin", "VisibilityOrigin", "DewPointCelsiusOrigin",
    "RelativeHumidityDest", "AltimeterDest", "DryBulbCelsiusDest",
    "WindSpeedDest", "VisibilityDest", "DewPointCelsiusDest",
)

In [22]:
# Discard every row that has a null in any of the remaining columns.
df = df.na.drop(how="any")

In [23]:
# Randomly split the rows into a training set (weight 0.8) and an evaluation
# set (weight 0.2); the fixed seed keeps the split repeatable.
train, test = df.randomSplit([0.8, 0.2], seed=123)

#### Convert categorical values to index values (0, 1, ...) for the following columns.

Carrier code (UNIQUE_CARRIER)
Airport code in departure (ORIGIN)
Airport code in destination (DEST)
Flag (0 or 1) for delay over 15 minutes (ARR_DEL15)

Hint: see `StringIndexer` in `pyspark.ml.feature`.

In [25]:
from pyspark.ml.feature import StringIndexer
# Fit one StringIndexer per categorical column; each writes its indices to a
# new "Indexed_<column>" column.
# NOTE(review): the indexers are fitted on the full df (training and
# evaluation rows together), not on `train` alone - confirm this is intended.
uniqueCarrierIndexer, originIndexer, destIndexer, arrDel15Indexer = (
    StringIndexer(inputCol=column, outputCol=f"Indexed_{column}").fit(df)
    for column in ("UNIQUE_CARRIER", "ORIGIN", "DEST", "ARR_DEL15")
)

#### In Spark machine learning, the feature columns must be wrapped as a single vector value.

So create a new vector column named "features".

Hint: see `VectorAssembler` in `pyspark.ml.feature`.

In [26]:
from pyspark.ml.feature import VectorAssembler
# Pack every model input into a single vector column called "features".
# The order of inputCols fixes the position of each feature in that vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        # calendar
        "MONTH", "DAY_OF_WEEK",
        # indexed categorical columns
        "Indexed_UNIQUE_CARRIER", "Indexed_ORIGIN", "Indexed_DEST",
        # scheduled departure / arrival times
        "CRS_DEP_TIME", "CRS_ARR_TIME",
        # weather at the origin airport
        "RelativeHumidityOrigin", "AltimeterOrigin", "DryBulbCelsiusOrigin",
        "WindSpeedOrigin", "VisibilityOrigin", "DewPointCelsiusOrigin",
        # weather at the destination airport
        "RelativeHumidityDest", "AltimeterDest", "DryBulbCelsiusDest",
        "WindSpeedDest", "VisibilityDest", "DewPointCelsiusDest",
    ],
)

#### Generate classifier. Here we use Decision Tree classifier.

Hint: see `DecisionTreeClassifier` in `pyspark.ml.classification`.

In [32]:
from pyspark.ml.classification import DecisionTreeClassifier
# Decision tree that learns the indexed delay flag from the default
# "features" column.
# NOTE(review): maxBins=250 is presumably sized to cover the number of
# distinct values in the indexed categorical columns - confirm against the data.
classifier = DecisionTreeClassifier(
    labelCol="Indexed_ARR_DEL15",
    maxBins=250,
    seed=123,
)

#### Generate SparkML pipeline and run training.
The fitted pipeline, including the trained decision tree, is stored in the variable "model".

In [33]:
from pyspark.ml import Pipeline
# Chain the fitted indexers, the feature assembler and the classifier into a
# single pipeline, then fit it on the training split.
pipeline = Pipeline(
    stages=[
        uniqueCarrierIndexer,
        originIndexer,
        destIndexer,
        arrDel15Indexer,
        assembler,
        classifier,
    ]
)
model = pipeline.fit(train)

#### Predict with evaluation data.

In [34]:
# Run the fitted pipeline over the evaluation split and preview the result,
# which carries the indexed, feature and prediction columns alongside the inputs.
pred = model.transform(test)
pred.show()

+---------+-----+-----------+--------------+------+----+------------+------------+----------------------+---------------+--------------------+---------------+----------------+---------------------+--------------------+-------------+------------------+-------------+--------------+-------------------+----------------------+--------------+------------+-----------------+--------------------+----------------+--------------------+----------+
|ARR_DEL15|MONTH|DAY_OF_WEEK|UNIQUE_CARRIER|ORIGIN|DEST|CRS_DEP_TIME|CRS_ARR_TIME|RelativeHumidityOrigin|AltimeterOrigin|DryBulbCelsiusOrigin|WindSpeedOrigin|VisibilityOrigin|DewPointCelsiusOrigin|RelativeHumidityDest|AltimeterDest|DryBulbCelsiusDest|WindSpeedDest|VisibilityDest|DewPointCelsiusDest|Indexed_UNIQUE_CARRIER|Indexed_ORIGIN|Indexed_DEST|Indexed_ARR_DEL15|            features|   rawPrediction|         probability|prediction|
+---------+-----+-----------+--------------+------+----+------------+------------+----------------------+---------------

#### Show evaluation result.

Hint: see `MulticlassClassificationEvaluator` in `pyspark.ml.evaluation`.

In [39]:
# Evaluate results
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the predictions against the indexed label. The metric is spelled out
# so the reported number is unambiguous; "f1" is the evaluator's default, so
# the result is unchanged.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Indexed_ARR_DEL15",
    metricName="f1",
)
evaluator.evaluate(pred)

0.7601886270228939

In [40]:
# Persist the fitted pipeline model, overwriting any earlier save at this path.
(
    model.write()
    .overwrite()
    .save("spark-ml-assignment-dt-model")
)