In [None]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark cluster job.

    Calls ``sc.stop()`` first and ``sj.stop()`` second. A failure while
    stopping one of them never prevents the other from being attempted and
    never propagates to the caller; instead of being silently discarded, the
    failure is printed so a leaked context or job is visible.

    Parameters
    ----------
    sj : object exposing ``stop()``
        The Spark job to stop.
    sc : object exposing ``stop()``
        The Spark context to stop.

    Returns
    -------
    None
    """
    # Context first, then the job (same order as before).
    for label, resource in (('Spark Context', sc), ('Spark Job', sj)):
        try:
            print('Trapped Exit cleaning up ' + label)
            resource.stop()
        except BaseException as err:
            # Same breadth as the previous bare ``except``: nothing, not even
            # an interrupt, may stop the remaining cleanup from running.
            try:
                print('Could not stop ' + label + ': ' + repr(err))
            except BaseException:
                pass  # output stream unusable; nothing more can be reported

findspark.init()

#Parameters for the Spark cluster
nodes=15
tasks_per_node=8
memory_per_task=1024 #1 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting
# idle so that others may use the resources.
# NOTE(review): this line used to carry the comment "1 hour", which does not
# match the value "8:00". Confirm the walltime format expected by sparkhpc and
# the scheduler, and that this is the intended limit.
walltime="8:00"
os.environ['SBATCH_PARTITION']='lattice' #Set the appropriate ARC partition

#Request nodes*tasks_per_node cores in total, with cores_per_executor equal to
#tasks_per_node
sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

try:
    sj.wait_to_start()
    sc = sj.start_spark()
except Exception:
    #The exit handler is not registered yet at this point, so stop the job here
    #instead of leaving it behind, then let the original error surface.
    sj.stop()
    raise

#Register the exit handler
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)

# Load data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.sql import functions as F
# Get the existing Spark session, or create one, under the app name 'dfTest'.
session_builder = SparkSession.builder.appName('dfTest')
spark = session_builder.getOrCreate()

# Read every CSV matching the glob; the first row is the header and the column
# types are inferred from the data.
df = spark.read.csv(
    'Data/AirOnTimeCSV/airOT20*',
    header=True,
    inferSchema=True,
)

In [None]:
# Keep only the first 44 columns (per the original note, this drops an empty
# trailing column), then print the resulting schema.
kept_columns = df.columns[:44]
df = df.select(*kept_columns)
# Optional row-count check: print(df.count())
df.printSchema()

In [None]:
# Restrict the data to flights whose ORIGIN is Houston (IAH).
departs_from_iah = df["ORIGIN"] == "IAH"
df = df.where(departs_from_iah)
df.show(n=5)

In [None]:
# Print the number of rows currently in df.
df_row_count = df.count()
print(df_row_count)

# Only delayed flights
I only did this preprocessing because it is what we talked about, but I don't think we should use it for training the model. See the "All Arrived Flights" section below.

In [None]:
# Keep only flights with a positive ARR_DELAY_NEW value.
arrived_late = df["ARR_DELAY_NEW"] > 0
deldf = df.where(arrived_late)
deldf.show(n=5)
# Optional row-count check: print(deldf.count())

In [None]:
# Keep only the feature and label columns listed below.
delayed_columns = [
    "YEAR", "MONTH", "DAY_OF_WEEK", "UNIQUE_CARRIER",
    "DEST", "DEST_STATE_ABR", "CRS_DEP_TIME",
    "CRS_ELAPSED_TIME", "DISTANCE", "ARR_DELAY_NEW",
]
deldf = deldf.select(*delayed_columns)
deldf.show(n=10)

In [None]:
# Print the number of rows currently in deldf.
deldf_row_count = deldf.count()
print(deldf_row_count)

In [None]:
# Keep only rows with YEAR after 2007 (the last 5 years of data) so the dataset
# is small enough to be processed on one machine.
in_recent_years = deldf["YEAR"] > 2007
deldf = deldf.where(in_recent_years)
deldf.show(n=10)

In [None]:
# Print the number of rows left in deldf after the year filter.
deldf_row_count = deldf.count()
print(deldf_row_count)

In [None]:
# Save the delayed-flight features and label, one CSV file per year.
out_dir = "./Data/preprocR1-2/onlyDelayed/"
# pandas' to_csv does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(out_dir, exist_ok=True)

years = list(range(2012, 2007, -1)) # 2012 down to 2008, most recent first.

for year in years:
    filename = "FeaturesLabels" + str(year) + ".csv"
    print(filename)
    # Each year's rows are collected to the driver as a pandas DataFrame
    # before being written.
    deldf.where((deldf.YEAR == year)).toPandas().to_csv(out_dir+filename, header=True)

# All Arrived Flights
Using this data instead of just the delayed flights allows us to check early flights too.
It also makes more sense because the model is not biased into assuming that all flights are delayed.

In [None]:
# Keep only flights that have an ARR_DELAY value (i.e. it is not NULL).
has_arrival_delay = ~F.isnull(df["ARR_DELAY"])
arrdf = df.where(has_arrival_delay)
arrdf.show(n=5)
# Optional row-count check: print(arrdf.count())

In [None]:
# Keep only the feature and label columns listed below.
arrived_columns = [
    "YEAR", "MONTH", "DAY_OF_WEEK", "UNIQUE_CARRIER",
    "DEST", "DEST_STATE_ABR", "CRS_DEP_TIME",
    "CRS_ELAPSED_TIME", "DISTANCE", "ARR_DELAY",
]
arrdf = arrdf.select(*arrived_columns)
arrdf.show(n=10)

In [None]:
# Print the number of rows currently in arrdf.
arrdf_row_count = arrdf.count()
print(arrdf_row_count)

In [None]:
# Keep only rows with YEAR after 2009 (the last 3 years of data) so the dataset
# is small enough to be processed on one machine.
in_recent_years = arrdf["YEAR"] > 2009
arrdf = arrdf.where(in_recent_years)
arrdf.show(n=10)

In [None]:
# Print the number of rows left in arrdf after the year filter.
arrdf_row_count = arrdf.count()
print(arrdf_row_count)

In [None]:
# Save the arrived-flight features and label, one CSV file per year.
out_dir = "./Data/preprocR1-2/allArrivedFlights/"
# pandas' to_csv does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(out_dir, exist_ok=True)

years = list(range(2012, 2009, -1)) # 2012 down to 2010, most recent first.

for year in years:
    filename = "FeaturesLabels" + str(year) + ".csv"
    print(filename)
    # Each year's rows are collected to the driver as a pandas DataFrame
    # before being written.
    arrdf.where((arrdf.YEAR == year)).toPandas().to_csv(out_dir+filename, header=True)

# Delays Greater Than 15 Minutes, with Delay Types

In [None]:
# Keep only rows whose ARR_DEL15 value exceeds 0.5.
# NOTE(review): ARR_DEL15 is presumably a 0/1 flag for arrival delays of 15+
# minutes -- confirm against the dataset's data dictionary.
flagged_del15 = df["ARR_DEL15"] > 0.5
del15df = df.where(flagged_del15)
del15df.show(n=5)
# Optional row-count check: print(del15df.count())

In [None]:
# Keep only the feature, delay-type and label columns listed below.
del15_columns = [
    "YEAR", "MONTH", "DAY_OF_WEEK", "UNIQUE_CARRIER",
    "DEST", "DEST_STATE_ABR", "CRS_DEP_TIME",
    "CRS_ELAPSED_TIME", "DISTANCE", "CARRIER_DELAY",
    "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
    "ARR_DELAY",
]
del15df = del15df.select(*del15_columns)
del15df.show(n=10)

In [None]:
# Print the number of rows currently in del15df.
del15df_row_count = del15df.count()
print(del15df_row_count)

In [None]:
# Keep only rows with YEAR after 2003. Per the original note, the delay types
# only started being recorded in 6/2003.
after_2003 = del15df["YEAR"] > 2003
del15df = del15df.where(after_2003)
del15df.show(n=10)

In [None]:
# Print the number of rows left in del15df after the year filter.
del15df_row_count = del15df.count()
print(del15df_row_count)

In [None]:
# Save the delay15 features, delay types and label, one CSV file per year.
out_dir = "./Data/preprocR1-2/delay15/"
# pandas' to_csv does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(out_dir, exist_ok=True)

years = list(range(2012, 2003, -1)) # 2012 down to 2004, most recent first.

for year in years:
    filename = "FeaturesLabels" + str(year) + ".csv"
    print(filename)
    # Each year's rows are collected to the driver as a pandas DataFrame
    # before being written.
    del15df.where((del15df.YEAR == year)).toPandas().to_csv(out_dir+filename, header=True)

# Data Cleaning Checks

In [None]:
# Count the NULL entries in every column of df.
# The column name is passed to when() as a plain string, so it acts as a
# non-null marker value that count() can tally for each NULL row.
df_null_counts = [
    F.count(F.when(F.isnull(name), name)).alias(name)
    for name in df.columns
]
df.select(df_null_counts).show()

In [None]:
# Count NaN entries, then NULL entries, in every column of the final
# delayed-flights data (deldf). One table is shown per check.
for predicate in (F.isnan, F.isnull):
    deldf_counts = [
        F.count(F.when(predicate(name), name)).alias(name)
        for name in deldf.columns
    ]
    deldf.select(deldf_counts).show()

In [None]:
# Count NaN entries, then NULL entries, in every column of the final
# all-arrived-flights data (arrdf). One table is shown per check.
for predicate in (F.isnan, F.isnull):
    arrdf_counts = [
        F.count(F.when(predicate(name), name)).alias(name)
        for name in arrdf.columns
    ]
    arrdf.select(arrdf_counts).show()

In [None]:
# Count the NULL entries in every column of the final delay15 data (del15df).
# The matching NaN check is left disabled; per the original note there were no
# NaNs as of the last check:
#del15df.select([F.count(when(F.isnan(c), c)).alias(c) for c in del15df.columns]).show()
del15df_null_counts = [
    F.count(F.when(F.isnull(name), name)).alias(name)
    for name in del15df.columns
]
del15df.select(del15df_null_counts).show()

# Temporary Tests

In [None]:
from pyspark.sql import functions as F
# Toy rows mixing real values, None and an empty string, used to sanity-check
# the NULL-counting expression on a small, known input.
toy_rows = [
    ['Emily', 10, 1],
    ['Word', None, 5],
    [None, 2, 6],
    ['', None, 8],
]
data = sc.parallelize(toy_rows)

# Only column names are supplied here (no explicit StructType schema).
test = sqlCtx.createDataFrame(data, ['name', 'height', 'age'])
test.show()

# Per-column NULL counts for the toy frame.
test_null_counts = [
    F.count(F.when(F.isnull(name), name)).alias(name)
    for name in test.columns
]
test.select(test_null_counts).show()