In [1]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark cluster job.

    Calls ``sc.stop()`` and then ``sj.stop()``. A failure in either call is
    reported on stdout instead of being silently discarded, and a failure
    while stopping the context never prevents the attempt to stop the job.

    Args:
        sj: Object exposing ``stop()``; the Spark job handle.
        sc: Object exposing ``stop()``; the Spark context.

    Returns:
        None.
    """
    try:
        print('Trapped Exit cleaning up Spark Context')
        sc.stop()
    except Exception as exc:
        # Still best-effort, but leave a trace of what went wrong.
        print('Could not stop Spark Context: %r' % (exc,))
    finally:
        # Runs even if stopping the context was interrupted (e.g. by
        # KeyboardInterrupt), so the job's stop() is always attempted.
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as exc:
            print('Could not stop Spark Job: %r' % (exc,))

# NOTE(review): pyspark is already imported at the top of this cell, so this
# call runs after the imports it is usually placed in front of -- confirm
# whether it is still needed at this position.
findspark.init()

#Parameters for the Spark cluster
nodes=15
tasks_per_node=8 # multiplied by nodes for ncores; also passed as cores_per_executor
memory_per_task=1024 #1 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting
# idle so that others may use the resources.
walltime="8:00" # NOTE(review): was annotated "1 hour", which does not match "8:00" -- confirm the expected format and intended duration
os.environ['SBATCH_PARTITION']='lattice' #Set the appropriate ARC partition

# Describe the cluster job: total cores, cores per executor, memory per core
# and walltime all come from the parameters above.
sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

# Wait for the job to start, then obtain the Spark context from it.
sj.wait_to_start()
sc = sj.start_spark()

#Register the exit handler
# It is registered only after both sj and sc exist, because exitHandler
# receives both as arguments.
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)

INFO:sparkhpc.sparkjob:Submitted batch job 676269

INFO:sparkhpc.sparkjob:Submitted cluster 1


# Load data

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.sql import functions as F
# Get (or reuse) the session named 'dfTest', then read every CSV matching the
# airOT20* pattern, treating the first row as a header and inferring types.
session_builder = SparkSession.builder.appName('dfTest')
spark = session_builder.getOrCreate()
df = spark.read.csv(
    'Data/AirOnTimeCSV/airOT20*',
    header=True,
    inferSchema=True,
)

In [3]:
# Keep only the first 44 columns (this drops the empty last column), then
# display the schema of what remains.
kept_columns = df.columns[:44]
df = df.select(kept_columns)
df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY_OF_MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- FL_DATE: timestamp (nullable = true)
 |-- UNIQUE_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- FL_NUM: integer (nullable = true)
 |-- ORIGIN_AIRPORT_ID: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_STATE_ABR: string (nullable = true)
 |-- DEST_AIRPORT_ID: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_STATE_ABR: string (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEP_DELAY: double (nullable = true)
 |-- DEP_DELAY_NEW: double (nullable = true)
 |-- DEP_DEL15: double (nullable = true)
 |-- DEP_DELAY_GROUP: integer (nullable = true)
 |-- TAXI_OUT: double (nullable = true)
 |-- WHEELS_OFF: string (nullable = true)
 |-- WHEELS_ON: string (nullable = true)
 |-- TAXI_IN: 

In [4]:
# Restrict the data to flights whose ORIGIN is Houston (IAH) and preview it.
departs_from_iah = df["ORIGIN"] == "IAH"
df = df.filter(departs_from_iah)
df.show(n=5)

+----+-----+------------+-----------+-------------------+--------------+--------+------+-----------------+------+----------------+---------------+----+--------------+------------+--------+---------+-------------+---------+---------------+--------+----------+---------+-------+------------+--------+---------+-------------+---------+---------------+---------+-----------------+--------+----------------+-------------------+--------+-------+--------+--------------+-------------+-------------+---------+--------------+-------------------+
|YEAR|MONTH|DAY_OF_MONTH|DAY_OF_WEEK|            FL_DATE|UNIQUE_CARRIER|TAIL_NUM|FL_NUM|ORIGIN_AIRPORT_ID|ORIGIN|ORIGIN_STATE_ABR|DEST_AIRPORT_ID|DEST|DEST_STATE_ABR|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|DEP_DELAY_NEW|DEP_DEL15|DEP_DELAY_GROUP|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|ARR_DELAY_NEW|ARR_DEL15|ARR_DELAY_GROUP|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|FLIGHTS|DISTANCE|DISTANCE_GROUP|

In [5]:
# Number of rows left in df after the ORIGIN == "IAH" filter.
iah_row_count = df.count()
print(iah_row_count)

2306830


# Arrival delays of 15+ minutes, with delay types

In [18]:
# Keep only the delayed flights: rows whose ARR_DEL15 value exceeds 0.5.
is_delayed = df["ARR_DEL15"] > 0.5
del15df = df.filter(is_delayed)
del15df.show(n=5)
#print(del15df.count())

+----+-----+------------+-----------+-------------------+--------------+--------+------+-----------------+------+----------------+---------------+----+--------------+------------+--------+---------+-------------+---------+---------------+--------+----------+---------+-------+------------+--------+---------+-------------+---------+---------------+---------+-----------------+--------+----------------+-------------------+--------+-------+--------+--------------+-------------+-------------+---------+--------------+-------------------+
|YEAR|MONTH|DAY_OF_MONTH|DAY_OF_WEEK|            FL_DATE|UNIQUE_CARRIER|TAIL_NUM|FL_NUM|ORIGIN_AIRPORT_ID|ORIGIN|ORIGIN_STATE_ABR|DEST_AIRPORT_ID|DEST|DEST_STATE_ABR|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|DEP_DELAY_NEW|DEP_DEL15|DEP_DELAY_GROUP|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|ARR_DELAY_NEW|ARR_DEL15|ARR_DELAY_GROUP|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|FLIGHTS|DISTANCE|DISTANCE_GROUP|

In [19]:
# Narrow the frame to the feature columns followed by the five delay-type
# columns that serve as labels, then preview the result.
feature_columns = [
    "YEAR", "MONTH", "DAY_OF_WEEK", "UNIQUE_CARRIER",
    "DEST", "DEST_STATE_ABR", "CRS_DEP_TIME", "CRS_ARR_TIME",
    "DISTANCE",
]
delay_columns = [
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY",
    "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
]
del15df = del15df.select(*feature_columns, *delay_columns)
del15df.show(n=10)

+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|YEAR|MONTH|DAY_OF_WEEK|UNIQUE_CARRIER|DEST|DEST_STATE_ABR|CRS_DEP_TIME|CRS_ARR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|2006|    8|          2|            AA| ORD|            IL|        1546|        1830|   925.0|          0.0|          0.0|      0.0|           0.0|               47.0|
|2006|    8|          3|            AA| ORD|            IL|        1546|        1830|   925.0|          0.0|          0.0|     29.0|           0.0|                0.0|
|2006|    8|          4|            AA| ORD|            IL|        1546|        1830|   925.0|          0.0|          2.0|      0.0|           0.0|             

In [8]:
# Report how many rows del15df currently holds.
delayed_row_count = del15df.count()
print(delayed_row_count)

456815


In [20]:
# Delay types only started being recorded at 6/2003, so keep just the rows
# with YEAR > 2003 and preview them.
after_2003 = del15df["YEAR"] > 2003
del15df = del15df.filter(after_2003)
del15df.show(n=10)

+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|YEAR|MONTH|DAY_OF_WEEK|UNIQUE_CARRIER|DEST|DEST_STATE_ABR|CRS_DEP_TIME|CRS_ARR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|2006|    8|          2|            AA| ORD|            IL|        1546|        1830|   925.0|          0.0|          0.0|      0.0|           0.0|               47.0|
|2006|    8|          3|            AA| ORD|            IL|        1546|        1830|   925.0|          0.0|          0.0|     29.0|           0.0|                0.0|
|2006|    8|          4|            AA| ORD|            IL|        1546|        1830|   925.0|          0.0|          2.0|      0.0|           0.0|             

In [10]:
# Report how many rows del15df currently holds.
recorded_row_count = del15df.count()
print(recorded_row_count)

358823


In [21]:
# Convert each delay-type column from a delay duration into a 0/1 label:
# 1 when the recorded delay is > 0, otherwise 0.
#
# Each label is computed and aliased straight back to its original column
# name inside one select. This replaces five copy-pasted withColumn calls and
# the temporary *_LABEL columns; the resulting column names and their order
# are the same as before.
feature_columns = [
    "YEAR", "MONTH", "DAY_OF_WEEK", "UNIQUE_CARRIER",
    "DEST", "DEST_STATE_ABR", "CRS_DEP_TIME", "CRS_ARR_TIME",
    "DISTANCE",
]
delay_columns = [
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY",
    "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
]
delay_labels = [
    when(F.col(name) > 0, 1).otherwise(0).alias(name)
    for name in delay_columns
]

del15df = del15df.select(*feature_columns, *delay_labels)

del15df.show(10)

+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|YEAR|MONTH|DAY_OF_WEEK|UNIQUE_CARRIER|DEST|DEST_STATE_ABR|CRS_DEP_TIME|CRS_ARR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|2006|    8|          2|            AA| ORD|            IL|        1546|        1830|   925.0|            0|            0|        0|             0|                  1|
|2006|    8|          3|            AA| ORD|            IL|        1546|        1830|   925.0|            0|            0|        1|             0|                  0|
|2006|    8|          4|            AA| ORD|            IL|        1546|        1830|   925.0|            0|            1|        0|             0|             

In [25]:
# Save features and labels to one CSV file per year.
output_dir = "./Data/preprocR2-1"
# to_csv does not create missing directories, so make sure the target exists
# before the first write.
os.makedirs(output_dir, exist_ok=True)

years = list(range(2012, 2003, -1))  # 2012 down to 2004, newest first

for year in years:
    filename = "FeaturesLabels" + str(year) + ".csv"
    print(filename)
    # Each year's rows are pulled to the driver as a pandas DataFrame.
    yearly_rows = del15df.where(del15df.YEAR == year).toPandas()
    # The index argument is left at its default, exactly as in the original call.
    yearly_rows.to_csv(os.path.join(output_dir, filename), header=True)

FeaturesLabels2012.csv
FeaturesLabels2011.csv
FeaturesLabels2010.csv
FeaturesLabels2009.csv
FeaturesLabels2008.csv
FeaturesLabels2007.csv
FeaturesLabels2006.csv
FeaturesLabels2005.csv
FeaturesLabels2004.csv


# Data Cleaning Checks

In [None]:
# Count NULL values per column of df. By this point df has already been
# trimmed to its first 44 columns and filtered to ORIGIN == "IAH" earlier in
# the notebook, so this is not the untouched raw load.
null_count_exprs = [F.count(when(F.isnull(c), c)).alias(c) for c in df.columns]
df.select(null_count_exprs).show()

In [24]:
# For the final delayed-flights frame, show per-column counts of NaN values
# first and of NULL values second.
for is_missing in (F.isnan, F.isnull):
    missing_counts = [
        F.count(when(is_missing(column), column)).alias(column)
        for column in del15df.columns
    ]
    del15df.select(missing_counts).show()

+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|YEAR|MONTH|DAY_OF_WEEK|UNIQUE_CARRIER|DEST|DEST_STATE_ABR|CRS_DEP_TIME|CRS_ARR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|
+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+
|   0|    0|          0|             0|   0|             0|           0|           0|       0|            0|            0|        0|             0|                  0|
+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+-------------------+

+----+-----+-----------+--------------+----+--------------+------------+------------+--------+-------------+-------------+---------+--------------+------------