In [1]:
#import libraries
import pandas as pd
import numpy as np

#import classes/functions
from sklearn import preprocessing

#constants
#Locations of the yearly flight files, relative to the notebook's working directory.
combinedFlights2018Parquet = "archive/Combined_Flights_2018.parquet"
combinedFlights2019Parquet = "archive/Combined_Flights_2019.parquet"

#Columns requested from each parquet file. The order is significant: it is the
#order handed to the reader, so keep it stable.
columns_to_use = [
    "Airline", "Origin", "Dest",
    "CRSDepTime", "Distance",
    "Year", "Quarter", "Month", "DayofMonth", "DayOfWeek",
    "DepTimeBlk", "ArrTimeBlk",
    "DistanceGroup",
    "ArrDelayMinutes",
]

In [2]:
#load data
#Read the same set of columns from both yearly files and stack them row-wise.
#ignore_index=True gives the combined frame a single continuous 0..n-1 index;
#without it the per-file index labels are carried over and can repeat across
#the two years, which makes label-based selection/alignment ambiguous later.
data = pd.concat(
    [
        pd.read_parquet(parquetPath, columns=columns_to_use, engine="fastparquet")
        for parquetPath in (combinedFlights2018Parquet, combinedFlights2019Parquet)
    ],
    axis=0,
    ignore_index=True,
)
print(data.size)    #total number of cells (rows * columns), not the row count
print(list(data))   #column names
print(data.head())

192936744
['Airline', 'Origin', 'Dest', 'CRSDepTime', 'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTimeBlk', 'ArrTimeBlk', 'DistanceGroup', 'ArrDelayMinutes']
             Airline Origin Dest  CRSDepTime  Distance  Year  Quarter  Month  \
0  Endeavor Air Inc.    ABY  ATL        1202     145.0  2018        1      1   
1  Endeavor Air Inc.    ABY  ATL        1202     145.0  2018        1      1   
2  Endeavor Air Inc.    ABY  ATL        1202     145.0  2018        1      1   
3  Endeavor Air Inc.    ABY  ATL        1202     145.0  2018        1      1   
4  Endeavor Air Inc.    ABY  ATL        1400     145.0  2018        1      1   

   DayofMonth  DayOfWeek DepTimeBlk ArrTimeBlk  DistanceGroup  ArrDelayMinutes  
0          23          2  1200-1259  1300-1359              1              0.0  
1          24          3  1200-1259  1300-1359              1              0.0  
2          25          4  1200-1259  1300-1359              1              0.0  
3         

In [3]:
#drop rows with missing values
#Rebind `data` to the filtered frame instead of mutating it with inplace=True;
#the result is the same frame contents, but the step reads as a plain
#input -> output transformation. Row labels of the surviving rows are kept.
data = data.dropna()

In [6]:
#Convert Blk times to integers (superseded approach, kept for reference only)
#Column positions used by iloc below: DepTimeBlk = 10 and ArrTimeBlk = 11
#This row-by-row conversion takes < 30 minutes on the 2018 dataset, which would be usable for our purposes.
#Note: since LabelEncoder (see EncodeLabel) is very, very fast, this method is no longer necessary.
#depTimeArray = np.zeros(np.array(data["DepTimeBlk"]).size);
#arrTimeArray = np.zeros(np.array(data["DepTimeBlk"]).size);
#for i in range(np.array(data["DepTimeBlk"]).size):
#    depTimeArray[i] = int(data.iloc[i,10][0:4]);
#    arrTimeArray[i] = int(data.iloc[i,11][0:4]);
#    #much less efficient code:
#    #data.iloc[i,10] = int(data.iloc[i,10][0:4]);
#    #data.iloc[i,11] = int(data.iloc[i,11][0:4]);
#data["DepTimeBlk"] = depTimeArray;
#data["ArrTimeBlk"] = arrTimeArray;

In [4]:
#EncodeLabel
def EncodeLabel(dataFrame, featureName, printTable = False):
    """Replace one feature (column) of ``dataFrame`` with integer codes, in place.

    A fresh ``LabelEncoder`` is fitted on the distinct values of the column and
    the column is then overwritten with the encoder's integer codes. Each code
    is the position of the original label in ``encoder.classes_``.

    Parameters:
        dataFrame: the DataFrame to change. It is modified in place.
        featureName: string naming the feature (column) of ``dataFrame`` to encode.
        printTable: when True, print a reference table mapping every original
            label to its integer code. Defaults to False (nothing is printed).

    Returns:
        None. The only effects are the changed column and the optional printout.

    Assumptions:
        All observations of the feature are strings, and all empty observations
        have already been removed from the frame (for example, by using dropna).
    """
    encoder = preprocessing.LabelEncoder()
    #Fit on the distinct labels only, then transform the full column.
    #Both steps must use the frame that was passed in, not a notebook global.
    encoder.fit(pd.unique(dataFrame[featureName]))
    dataFrame[featureName] = encoder.transform(dataFrame[featureName])
    if printTable:
        print(f'Conversion table for feature "{featureName}":')
        for code, label in enumerate(encoder.classes_):
            #f-string formatting also copes with labels that are not str
            print(f"{label} = {code}")

In [5]:
#Encode all features which are a string
#Each feature gets its conversion table printed; consecutive tables are
#separated by one blank line, with no blank line after the last table.
for tableIndex, stringFeature in enumerate(("Airline", "Origin", "Dest", "DepTimeBlk", "ArrTimeBlk")):
    if tableIndex > 0:
        print("")
    EncodeLabel(data, stringFeature, printTable = True)

Conversion table for feature "Airline":
Air Wisconsin Airlines Corp = 0
Alaska Airlines Inc. = 1
Allegiant Air = 2
American Airlines Inc. = 3
Cape Air = 4
Capital Cargo International = 5
Comair Inc. = 6
Commutair Aka Champlain Enterprises, Inc. = 7
Compass Airlines = 8
Delta Air Lines Inc. = 9
Empire Airlines Inc. = 10
Endeavor Air Inc. = 11
Envoy Air = 12
ExpressJet Airlines Inc. = 13
Frontier Airlines Inc. = 14
GoJet Airlines, LLC d/b/a United Express = 15
Hawaiian Airlines Inc. = 16
Horizon Air = 17
JetBlue Airways = 18
Mesa Airlines Inc. = 19
Peninsula Airways Inc. = 20
Republic Airlines = 21
SkyWest Airlines Inc. = 22
Southwest Airlines Co. = 23
Spirit Air Lines = 24
Trans States Airlines = 25
United Air Lines Inc. = 26
Virgin America = 27

Conversion table for feature "Origin":
ABE = 0
ABI = 1
ABQ = 2
ABR = 3
ABY = 4
ACK = 5
ACT = 6
ACV = 7
ACY = 8
ADK = 9
ADQ = 10
AEX = 11
AGS = 12
AKN = 13
ALB = 14
ALO = 15
ALW = 16
AMA = 17
ANC = 18
APN = 19
ART = 20
ASE = 21
ATL = 22
ATW = 23

Conversion table for feature "ArrTimeBlk":
0001-0559 = 0
0600-0659 = 1
0700-0759 = 2
0800-0859 = 3
0900-0959 = 4
1000-1059 = 5
1100-1159 = 6
1200-1259 = 7
1300-1359 = 8
1400-1459 = 9
1500-1559 = 10
1600-1659 = 11
1700-1759 = 12
1800-1859 = 13
1900-1959 = 14
2000-2059 = 15
2100-2159 = 16
2200-2259 = 17
2300-2359 = 18


In [16]:
#Testing for pandas unique and label encoder
#This cell only inspects how pd.unique and LabelEncoder behave on the Airline
#column. The encoded values are kept in a local Series and are NOT written back
#to `data`, so running the cell (any number of times, in any order relative to
#the EncodeLabel cells) leaves the shared frame untouched.
airlineColumn = data["Airline"]
uniqueAirlines = pd.unique(airlineColumn)  #computed once, reused below

print("Unique airlines:")
print(uniqueAirlines)
print("")
print("number of airlines:")
print(np.size(uniqueAirlines))
print("")

airlineEncoder = preprocessing.LabelEncoder()
airlineEncoder.fit(uniqueAirlines)
print("airlines as seen from the encoder:")
print(airlineEncoder.classes_)
print("")
print("number of airlines from the encoder:")
print(np.size(airlineEncoder.classes_))
print("")

print("Data pre-transform:")
print(airlineColumn)
print("")
#Same index and name as the source column so the printout lines up with it.
airlineEncoded = pd.Series(
    airlineEncoder.transform(airlineColumn),
    index=airlineColumn.index,
    name="Airline",
)
print("Data post-transform:")
print(airlineEncoded)

Unique airlines:
['Endeavor Air Inc.' 'JetBlue Airways' 'ExpressJet Airlines Inc.'
 'Allegiant Air' 'Hawaiian Airlines Inc.' 'Spirit Air Lines'
 'SkyWest Airlines Inc.' 'Frontier Airlines Inc.' 'Empire Airlines Inc.'
 'Southwest Airlines Co.' 'Mesa Airlines Inc.' 'Republic Airlines'
 'Delta Air Lines Inc.' 'United Air Lines Inc.' 'Alaska Airlines Inc.'
 'Horizon Air' 'Commutair Aka Champlain Enterprises, Inc.'
 'Trans States Airlines' 'GoJet Airlines, LLC d/b/a United Express'
 'Air Wisconsin Airlines Corp' 'Cape Air' 'Compass Airlines'
 'Virgin America' 'Peninsula Airways Inc.' 'Envoy Air'
 'American Airlines Inc.' 'Comair Inc.' 'Capital Cargo International']

number of airlines:
28

airlines as seen from the encoder:
['Air Wisconsin Airlines Corp' 'Alaska Airlines Inc.' 'Allegiant Air'
 'American Airlines Inc.' 'Cape Air' 'Capital Cargo International'
 'Comair Inc.' 'Commutair Aka Champlain Enterprises, Inc.'
 'Compass Airlines' 'Delta Air Lines Inc.' 'Empire Airlines Inc.'
 'Endeavo