In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Root folder of the project data, relative to this notebook.
# Note: the value ends with '/', and the cells below build file names as
# f'{path}/interim/...', so the resulting paths contain a doubled slash
# (POSIX systems treat that like a single '/').
path = '../data/'

# Import All Data

In [6]:
# Load the 2017 flight records from the interim data folder into `df`,
# the frame that every later cell in this notebook extends.
df = pd.read_csv(f'{path}/interim/2017New.csv')

In [7]:
# Keep only the columns listed below; per the notebook's intent, everything else
# is a feature that will not be available when predicting flights.
df = df.loc[:, [
    'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
    'Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
    'Origin', 'OriginState', 'DestState', 'Dest',
    'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime',
    'ArrDelay', 'Distance', 'DistanceGroup',
    'tempF', 'wind', 'ave_vis', 'precip_sum',
]]

# Distinct carrier codes found in the flight data (used later to filter the
# passenger-load table).
airlines = df['Reporting_Airline'].unique().tolist()

# Feature Engineering

## Hourly Bins

In [8]:
def hhmm_to_hour(hhmm):
    """Convert a Series of numeric HHMM clock values (e.g. 1530) to hour of day.

    Values below 100 and missing values map to hour 0. A value in the 2400s
    (hour 24) is folded back to hour 0.

    Parameters
    ----------
    hhmm : pandas.Series
        Numeric clock times encoded as HHMM.

    Returns
    -------
    pandas.Series
        Integer hours, aligned with the input index.
    """
    hours = hhmm.fillna(0).astype('int64') // 100
    return hours.replace(24, 0)


# Reduce the scheduled (CRS) arrival and departure times to hour-of-day bins.
# Both columns go through the same helper so that hour 24 is normalised to 0
# for departures as well as arrivals (previously only ARR_HOUR was normalised).
df['ARR_HOUR'] = hhmm_to_hour(df['CRSArrTime'])
df['DEP_HOUR'] = hhmm_to_hour(df['CRSDepTime'])

# The raw HHMM columns are no longer needed once the hour bins exist.
df = df.drop(columns=['CRSDepTime', 'CRSArrTime'])

## Direction of Flight

In [9]:
# Add a direction-of-flight feature.
# Load the airport coordinates and pair Latitude/Longitude into one tuple per airport.
airport_location = pd.read_csv(f'{path}/interim/airport_loc.csv')
airport_location['location'] = list(zip(airport_location.Latitude, airport_location.Longitude))

# Work on a separate frame without the raw coordinate columns, leaving
# `airport_location` itself intact.
airports_locs = airport_location.drop(columns=['Latitude', 'Longitude'])

# Build the locationID -> (Latitude, Longitude) lookup. Airports with a missing
# coordinate are left out, so flights touching them are dropped below instead of
# being labelled from NaN differences.
has_coords = airport_location[['Latitude', 'Longitude']].notna().all(axis=1)
airport_loc = airports_locs[has_coords].set_index('locationID')['location'].to_dict()

# Attach the coordinates of the origin and destination airports (None when unknown).
df['ORIGIN_LOC'] = df.Origin.map(airport_loc.get)
df['DEST_LOC'] = df.Dest.map(airport_loc.get)

# Drop flights with an unknown origin or destination BEFORE doing arithmetic:
# np.asarray(None) is a 0-d object array, so subtracting it would not produce a
# NaN that a later dropna on DIRECTION could remove.
df = df.dropna(subset=['ORIGIN_LOC', 'DEST_LOC'])

# Turn the tuples into arrays so they can be subtracted element-wise;
# DIRECTION holds (origin - destination) as [latitude diff, longitude diff].
df['ORIGIN_LOC'] = df['ORIGIN_LOC'].apply(lambda x: np.asarray(x))
df['DEST_LOC'] = df['DEST_LOC'].apply(lambda x: np.asarray(x))
df['DIRECTION'] = df['ORIGIN_LOC'] - df['DEST_LOC']
def direction(latlong):
    """Return the dominant compass direction for one flight.

    Parameters
    ----------
    latlong : sequence of two numbers
        (origin - destination) differences: element 0 is the latitude
        difference, element 1 the longitude difference.

    Returns
    -------
    str
        'North' or 'South' when the latitude difference is strictly larger in
        magnitude than the longitude difference, otherwise 'West' or 'East'.

    Notes
    -----
    Sign convention as stated in this notebook: latitude increases to the
    north and longitude increases to the west. A negative latitude difference
    (south minus north) therefore means the plane is heading north, and a
    negative longitude difference (east minus west) means it is heading west.
    A difference of exactly zero counts as 'South' / 'East'.
    """
    lat_diff, lon_diff = latlong[0], latlong[1]
    north_south = 'North' if lat_diff < 0 else 'South'
    east_west = 'West' if lon_diff < 0 else 'East'
    # Ties between the two magnitudes resolve to the east/west component.
    return north_south if abs(lat_diff) > abs(lon_diff) else east_west
# Label every flight with its dominant compass direction.
df['DOM_DIRECTION'] = df['DIRECTION'].map(direction)

# Remove the helper columns that existed only for this transformation.
df = df.drop(['ORIGIN_LOC', 'DEST_LOC', 'DIRECTION'], axis=1)

## Hourly Flight Frequency

In [10]:
# Count flights per (origin, month, day of month, day of week, departure hour).
# The result is indexed by those five keys, which makes it easy to look up.
fltfrq_df = (
    df.groupby(['Origin', 'Month', 'DayofMonth', 'DayOfWeek', 'DEP_HOUR'])['Month']
    .count()
    .to_frame()
)

# Flatten the counts into a {key tuple: count} dictionary.
fltfrq_df_dict = fltfrq_df['Month'].to_dict()

# Build each flight's key tuple in a scratch column, look its count up in
# fltfrq_df_dict, then discard the scratch column again.
df['temp'] = list(zip(df['Origin'], df['Month'], df['DayofMonth'],
                      df['DayOfWeek'], df['DEP_HOUR']))
df['flight_freq'] = df['temp'].map(fltfrq_df_dict.get)
df = df.drop('temp', axis=1)

## Passenger Loading Data

In [11]:
# Load the passenger-load records and keep only carriers present in the flight data.
# .copy() makes the filtered frame independent before a column is added to it.
load_factor_df = pd.read_csv(f'{path}/interim/Loading2017.csv')
load_factor_df = load_factor_df[load_factor_df.CARRIER.isin(airlines)].copy()

# Load factor per record = passenger-miles / seat-miles.
passenger_miles = load_factor_df.PASSENGERS * load_factor_df.DISTANCE
seat_miles = load_factor_df.SEATS * load_factor_df.DISTANCE

# A zero denominator with non-zero passengers yields +/-inf. Convert those to
# NaN so the group mean below (which skips NaN) is not pushed to inf by a
# single such record.
load_factor_df['LOAD_FACTOR'] = (passenger_miles / seat_miles).replace([np.inf, -np.inf], np.nan)

# Average load factor per (month, carrier, origin, destination).
look_up_factor = pd.DataFrame(
    load_factor_df.groupby(['MONTH', 'CARRIER', 'ORIGIN', 'DEST'])['LOAD_FACTOR'].mean()
)

In [12]:
# Flatten the averaged load factors into a
# {(MONTH, CARRIER, ORIGIN, DEST): load factor} dictionary.
load_factor_dict = look_up_factor['LOAD_FACTOR'].to_dict()

In [13]:
# Build each flight's (month, carrier, origin, destination) key in a scratch
# column, look up its average load factor, then discard the scratch column.
# Keys that are absent from load_factor_dict come back as missing values.
df['temp'] = list(zip(df['Month'], df['Reporting_Airline'], df['Origin'], df['Dest']))
df['LoadFactor'] = df['temp'].map(load_factor_dict.get)
df = df.drop('temp', axis=1)

## Plane Data from FAA

In [14]:
# Year used as the fallback manufacture year and as the reference for plane age.
REFERENCE_YEAR = 2017
plane_attrs = ['mf_name', 'mf_year', 'plane_model', 'eng_model']

# Load the plane registry extract and pack the four attributes of each aircraft
# into one tuple so they can be looked up by registration number.
plane_info_df = pd.read_csv(f'{path}/interim/plane_info.csv')
plane_info_df['zip'] = list(zip(plane_info_df.mf_name, plane_info_df.mf_year,
                                plane_info_df.plane_model, plane_info_df.eng_model))
plane_info_df = plane_info_df.drop(columns=plane_attrs + ['aw_date'])

# Set the reg_number as index and construct a {reg_number: attribute tuple} dictionary.
plane_info_df = plane_info_df.set_index('reg_number')
plane_dict = plane_info_df.to_dict()['zip']

# Look up each flight's aircraft; tail numbers missing from the registry give None.
df['new'] = df.Tail_Number.apply(lambda x: plane_dict.get(x))

# NOTE(review): this zero-fills EVERY missing value in the frame (ArrDelay, the
# weather columns, LoadFactor, ...), not only the failed lookups in `new`.
# Kept as-is so the exported data does not change shape; confirm it is intended.
df = df.fillna(0)

# Replace the 0 placeholder with a 4-tuple so every row expands to four columns.
df['new'] = df.new.apply(lambda x: (0, 0, 0, 0) if x == 0 else x)
df[plane_attrs] = pd.DataFrame(df['new'].tolist(), index=df.index)
df = df.drop(columns='new')

# Unparseable manufacture years become NaN. The 0 placeholder for aircraft that
# were not found in the registry is treated as missing too; otherwise those
# rows would get a plane_age of 2017 years. Both cases fall back to
# REFERENCE_YEAR, i.e. a plane age of 0.
df['mf_year'] = pd.to_numeric(df['mf_year'], errors='coerce')
df['mf_year'] = df['mf_year'].where(df['mf_year'] > 0).fillna(REFERENCE_YEAR)
df['plane_age'] = REFERENCE_YEAR - df['mf_year']

## Weather

https://mesonet.agron.iastate.edu/ASOS/

Weather data was pre-processed in a separate notebook and merged into the flight dataset. For each flight's origin airport we use: precipitation summed over the 5 hours prior to departure, 5-hour average visibility, hourly temperature, and hourly wind.

In [16]:
# Display the column names of the frame after feature engineering.
df.columns

Index(['Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
       'Origin', 'OriginState', 'DestState', 'Dest', 'CRSElapsedTime',
       'ArrDelay', 'Distance', 'DistanceGroup', 'tempF', 'wind', 'ave_vis',
       'precip_sum', 'ARR_HOUR', 'DEP_HOUR', 'DOM_DIRECTION', 'flight_freq',
       'LoadFactor', 'mf_name', 'mf_year', 'plane_model', 'eng_model',
       'plane_age'],
      dtype='object')

In [15]:
# Preview the first rows of the engineered frame as a sanity check.
df.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,Origin,OriginState,...,ARR_HOUR,DEP_HOUR,DOM_DIRECTION,flight_freq,LoadFactor,mf_name,mf_year,plane_model,eng_model,plane_age
0,2,5,6,6,2017-05-06 00:00:00,WN,N7824A,4652,SJC,CA,...,8,6,East,17,0.745438,BOEING,2001.0,737-7BK,CFM56 SERIES,16.0
1,2,5,6,6,2017-05-06 00:00:00,WN,N8522P,4971,SJC,CA,...,18,17,East,7,0.745438,BOEING,2017.0,737-800,CFM56-7B27E/F,0.0
2,2,5,6,6,2017-05-06 00:00:00,WN,N8617E,5113,SJC,CA,...,12,11,East,8,0.745438,BOEING,2013.0,737-8H4,CFM56-7B27E,4.0
3,2,5,6,6,2017-05-06 00:00:00,WN,N450WN,5150,SJC,CA,...,21,20,East,3,0.745438,BOEING,2004.0,737-7H4,CFM56 SERIES,13.0
4,2,5,6,6,2017-05-06 00:00:00,WN,N498WN,5711,SJC,CA,...,14,13,East,10,0.745438,BOEING,2005.0,737-7H4,CFM56 SERIES,12.0


## Export CSV with Engineered Features

In [None]:
# Write the engineered frame to the interim data folder; index=False keeps the
# row index out of the CSV.
df.to_csv(f'{path}/interim/2017FeatEng.csv', index = False, encoding = 'utf-8')