# Airline Arrivals #

* I formulate this as a regression problem: predict how many minutes beyond 30 a flight arrives late.

* Flights late 30 minutes or less are coded as 0 minutes late.

* The dataset covers flights from October through December 1987 (1,311,826 records).

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

import warnings

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [2]:
# Render every float in DataFrame output with three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

In [3]:
# Read the raw flight records from a path relative to the working directory,
# then preview the first five rows.
airline = pd.read_csv(filepath_or_buffer="../data/1987.csv")
airline.head(n=5)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1987,10,14,3,741.0,730,912.0,849,PS,1451,...,,,0,,0,,,,,
1,1987,10,15,4,729.0,730,903.0,849,PS,1451,...,,,0,,0,,,,,
2,1987,10,17,6,741.0,730,918.0,849,PS,1451,...,,,0,,0,,,,,
3,1987,10,18,7,729.0,730,847.0,849,PS,1451,...,,,0,,0,,,,,
4,1987,10,19,1,749.0,730,922.0,849,PS,1451,...,,,0,,0,,,,,


## Variable Descriptions ##


### Name	Description  ###

1	Year	1987-2008

2	Month	1-12

3	DayofMonth	1-31

4	DayOfWeek	1 (Monday) - 7 (Sunday)

5	DepTime	actual departure time (local, hhmm)

6	CRSDepTime	scheduled departure time (local, hhmm)

7	ArrTime	actual arrival time (local, hhmm)

8	CRSArrTime	scheduled arrival time (local, hhmm)

9	UniqueCarrier	unique carrier code

10	FlightNum	flight number

11	TailNum	plane tail number

12	ActualElapsedTime	in minutes

13	CRSElapsedTime	in minutes

14	AirTime	in minutes

15	ArrDelay	arrival delay, in minutes

16	DepDelay	departure delay, in minutes

17	Origin	origin IATA airport code

18	Dest	destination IATA airport code

19	Distance	in miles

20	TaxiIn	taxi in time, in minutes

21	TaxiOut	taxi out time in minutes

22	Cancelled	was the flight cancelled? (1 = yes, 0 = no)

23	CancellationCode	reason for cancellation (A = carrier, B = weather, C = NAS, D = security)

24	Diverted	1 = yes, 0 = no

25	CarrierDelay	in minutes

26	WeatherDelay	in minutes

27	NASDelay	in minutes

28	SecurityDelay	in minutes

29	LateAircraftDelay	in minutes

In [4]:
# Summary statistics for the raw frame. In the recorded output several columns
# (e.g. TailNum, TaxiIn, TaxiOut and the *Delay breakdown columns) have a count
# of 0, i.e. they are entirely empty in this file.
airline.describe()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,TailNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,1311826.0,1311826.0,1311826.0,1311826.0,1292141.0,1311826.0,1288326.0,1311826.0,1311826.0,0.0,...,0.0,0.0,1311826.0,0.0,1311826.0,0.0,0.0,0.0,0.0,0.0
mean,1987.0,10.994,15.718,3.949,1369.276,1361.131,1493.152,1491.052,670.143,,...,,,0.015,,0.003,,,,,
std,0.0,0.823,8.836,1.986,478.718,471.97,498.75,486.8,514.551,,...,,,0.122,,0.054,,,,,
min,1987.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,...,,,0.0,,0.0,,,,,
25%,1987.0,10.0,8.0,2.0,946.0,940.0,1117.0,1115.0,291.0,,...,,,0.0,,0.0,,,,,
50%,1987.0,11.0,16.0,4.0,1345.0,1338.0,1521.0,1519.0,560.0,,...,,,0.0,,0.0,,,,,
75%,1987.0,12.0,23.0,6.0,1750.0,1742.0,1918.0,1912.0,894.0,,...,,,0.0,,0.0,,,,,
max,1987.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2400.0,6282.0,,...,,,1.0,,1.0,,,,,


In [5]:
# Predictor columns: calendar, schedule, carrier/route, distance and
# departure-side information for each flight.
feature_columns = [
    'Month', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
    'UniqueCarrier', 'Origin', 'Dest',
    'Distance', 'DepDelay', 'Cancelled', 'Diverted',
]
X = airline[feature_columns]

# Target: minutes of arrival delay beyond 30; a delay of 30 or less becomes 0.
# A missing ArrDelay fails the `> 30` comparison, so it is coded as 0 as well.
arrival_delay = airline.ArrDelay
y = np.where(arrival_delay > 30, arrival_delay - 30, 0)

In [6]:
# Preview the selected feature columns before the categorical ones are encoded.
X.head()

Unnamed: 0,Month,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,Origin,Dest,Distance,DepDelay,Cancelled,Diverted
0,10,3,730,849,PS,SAN,SFO,447.0,11.0,0,0
1,10,4,730,849,PS,SAN,SFO,447.0,-1.0,0,0
2,10,6,730,849,PS,SAN,SFO,447.0,11.0,0,0
3,10,7,730,849,PS,SAN,SFO,447.0,-1.0,0,0
4,10,1,730,849,PS,SAN,SFO,447.0,19.0,0,0


In [7]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each selected column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column alone.

    Note: no encoder state is kept between calls. ``fit`` is a no-op and
    ``transform`` refits on whatever frame it receives, so the codes produced
    for two different frames are not guaranteed to agree.
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; kept so fit/transform call chains work."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Columns listed in ``self.columns`` are encoded with ``LabelEncoder()``.
        If no columns were specified, every column of ``X`` is encoded.
        ``X`` itself is left unmodified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent spelling and is also available in older releases.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)

# Replace the three string-valued columns with integer codes so the
# tree-based models can consume them.
categorical_columns = ['UniqueCarrier', 'Origin', 'Dest']
label_encoder = MultiColumnLabelEncoder(columns=categorical_columns)
X = label_encoder.fit_transform(X)

In [8]:
# Peek at the first 11 target values (minutes late beyond the 30-minute allowance).
y[:11]

array([0., 0., 0., 0., 3., 0., 0., 0., 0., 0., 0.])

In [9]:
# Fill missing feature values with the per-column mean.
# NOTE(review): the means are computed over all rows, before the train/test
# split is made, so test rows contribute to the imputed values.
column_means = X.mean()
X = X.fillna(column_means)

In [10]:
# Re-check the feature summary after encoding and imputation; every column
# should now report the same count.
# NOTE(review): DepDelay ranges from -1345 to 1439 minutes in the recorded
# output - such extremes look like data errors; confirm before relying on them.
X.describe()

Unnamed: 0,Month,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,Origin,Dest,Distance,DepDelay,Cancelled,Diverted
count,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0,1311826.0
mean,10.994,3.949,1361.131,1491.052,6.007,116.861,116.836,591.244,8.064,0.015,0.003
std,0.823,1.986,471.97,486.8,4.172,67.295,67.248,497.928,23.844,0.122,0.054
min,10.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-1345.0,0.0,0.0
25%,10.0,2.0,940.0,1115.0,3.0,57.0,57.0,247.0,0.0,0.0,0.0
50%,11.0,4.0,1338.0,1519.0,6.0,122.0,122.0,417.0,0.0,0.0,0.0
75%,12.0,6.0,1742.0,1912.0,10.0,173.0,173.0,787.0,8.0,0.0,0.0
max,12.0,7.0,2359.0,2400.0,13.0,236.0,236.0,4983.0,1439.0,1.0,1.0


In [11]:
# Split features and target into train and test partitions; the fixed
# random_state keeps the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Baseline random forest, scored with 5-fold cross-validation on the training split.
# n_estimators is set explicitly instead of relying on the library default, which
# differs between scikit-learn releases (10 before 0.22, 100 from 0.22 on).
# NOTE(review): 10 is assumed to be the value in effect for the recorded run - confirm.
# random_state fixes the bootstrap sampling so the scores are repeatable.
clr1 = ensemble.RandomForestRegressor(n_estimators=10, random_state=42)
cv1 = cross_val_score(clr1, X_train, y_train, cv=5)
print(cv1)
print("mean = {:.3}".format(cv1.mean()))

[0.88731795 0.86678607 0.89239609 0.86303431 0.86727508]
mean = 0.875


In [34]:
# Shallow random forest: 30 trees limited to depth 3, scored with 5-fold
# cross-validation on the training split.
# random_state fixes the bootstrap sampling so the scores are repeatable.
clr2 = ensemble.RandomForestRegressor(n_estimators=30, max_depth=3, random_state=42)
cv2 = cross_val_score(clr2, X_train, y_train, cv=5)
print(cv2)
print("mean = {:.3}".format(cv2.mean()))

[0.82964645 0.80443234 0.85167395 0.83678404 0.83996293]
mean = 0.832


In [15]:
# Random forest with 30 trees and no depth limit, scored with 5-fold
# cross-validation on the training split.
# random_state fixes the bootstrap sampling so the scores are repeatable.
clr3 = ensemble.RandomForestRegressor(n_estimators=30, random_state=42)
cv3 = cross_val_score(clr3, X_train, y_train, cv=5)
print(cv3)
print("mean = {:.3}".format(cv3.mean()))

[0.8934482  0.87201499 0.8992104  0.87741256 0.87358556]
mean = 0.883


In [13]:
# 100 boosting stages of depth-2 trees, fitted with absolute-error loss.
# 'absolute_error' is the current name of the loss formerly spelled 'lad'
# (renamed in scikit-learn 1.0; the old spelling was removed afterwards).
params = {'n_estimators': 100,
          'max_depth': 2,
          'loss': 'absolute_error',
          'random_state': 42}

# Score the model with 5-fold cross-validation on the training split.
gbr1 = ensemble.GradientBoostingRegressor(**params)
gb1 = cross_val_score(gbr1, X_train, y_train, cv=5)
print(gb1)
print("mean = {:.3}".format(gb1.mean()))


[0.70017519 0.67295257 0.72517479 0.72407359 0.72355387]
mean = 0.709


### Results ###

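* clr3 (random forest, 30 trees, no depth limit) wins with the highest mean cross-validated R² (0.883), ahead of clr1 (0.875), clr2 (0.832), and the gradient boosting model (0.709).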

* We will now run clr3 on the out-of-sample test dataset.


In [18]:
# Refit the selected model on the whole training split and predict the
# held-out flights in one chained call (fit returns the estimator itself).
predict = clr3.fit(X_train, y_train).predict(X_test)

# R2 of the fitted model on the held-out test split.
r2 = clr3.score(X_test, y_test)
print("Final Out of Sample Test R2 is {:.3f}".format(r2))

Final Out of Sample Test R2 is 0.877


### Final Test Results ###

* Random Forest regression model with 30 trees works well out of sample.

* R² on the held-out test data is 0.877, in line with the cross-validated mean of 0.883.