# Late Arrivals Airline Data 2008
## Setting up
### Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Importing 2008 Airline Data
air08 = pd.read_csv('2008.csv')
air08.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


In [3]:
# Checking Averages and Max and Min Values
air08.describe()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,...,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,7009728.0,7009728.0,7009728.0,7009728.0,6873482.0,7009728.0,6858079.0,7009728.0,7009728.0,6855029.0,...,7009728.0,6858079.0,6872670.0,7009728.0,7009728.0,1524735.0,1524735.0,1524735.0,1524735.0,1524735.0
mean,2008.0,6.37513,15.72801,3.924182,1333.83,1326.086,1481.258,1494.801,2224.2,127.3224,...,726.387,6.860852,16.45305,0.01960618,0.002463006,15.77206,3.039031,17.16462,0.07497434,20.77098
std,0.0,3.406737,8.797068,1.988259,478.0689,464.2509,505.2251,482.6728,1961.716,70.18731,...,562.1018,4.933649,11.3328,0.1386426,0.04956753,40.09912,19.50287,31.89495,1.83794,39.25964
min,2008.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,12.0,...,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2008.0,3.0,8.0,2.0,928.0,925.0,1107.0,1115.0,622.0,77.0,...,325.0,4.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2008.0,6.0,16.0,4.0,1325.0,1320.0,1512.0,1517.0,1571.0,110.0,...,581.0,6.0,14.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
75%,2008.0,9.0,23.0,6.0,1728.0,1715.0,1909.0,1907.0,3518.0,157.0,...,954.0,8.0,19.0,0.0,0.0,16.0,0.0,21.0,0.0,26.0
max,2008.0,12.0,31.0,7.0,2400.0,2359.0,2400.0,2400.0,9743.0,1379.0,...,4962.0,308.0,429.0,1.0,1.0,2436.0,1352.0,1357.0,392.0,1316.0


In [4]:
# Listing Columns
air08.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

## Setting Up Features

In [3]:
# Setting up Features Data Frame
features = pd.DataFrame()

In [4]:
# dropping Columns with to many missing values
features = air08.drop(['CancellationCode','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay'], axis=1)

In [5]:
# new data frame head
features.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,0
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,0
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,0
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,78.0,-6.0,-4.0,IND,BWI,515,3.0,7.0,0,0
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,0


In [6]:
# finding remaining nulls
np.where(features['AirTime'].isnull())

(array([    178,     373,     399, ..., 7009564, 7009565, 7009648], dtype=int64),)

In [7]:
# dropping values
features = features.dropna()

### Adding New Features

In [8]:
# Creating Late Feature, Schedule Time Vs Actual Landing Time
features['Late'] = features['CRSArrTime'] - features['ArrTime']

In [9]:
# Creating Minutes Late Variable
features['mins_late'] = (features.CRSArrTime + 30) - features.ArrTime

### Checking Out Features

In [10]:
# Average Mins Late For a Flight
features['Late'].mean()

12.449300542200874

In [11]:
# Average Departure Delay
(features['DepTime'] - features['CRSDepTime']).mean()

8.73802586249151

In [12]:
# Making a 30 mins late categorical variable to balance data
features['late30'] = np.where(features['Late'] >=  30, 1, 0)

In [None]:
# Plotting Landing Delay Vs Mins Late
sns.lmplot('mins_late', 'ArrDelay', data=features, hue='Late',fit_reg=False)

In [13]:
# Total Number of Flights Later Than 30 Mins
features['late30'].value_counts()

0    6045416
1     809608
Name: late30, dtype: int64

## Balancing Sample

In [13]:
# importing Resample
from sklearn.utils import resample

In [14]:
# Separate majority and minority classes
feat_majority = features[features.late30==0]
feat_minority = features[features.late30==1]
 
# Downsample majority class
feat_majority_downsampled = resample(feat_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=809608, # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
feat_downsampled = pd.concat([feat_majority_downsampled, feat_minority])
 
# Display new class counts
feat_downsampled.late30.value_counts()


1    809608
0    809608
Name: late30, dtype: int64

In [15]:
feat_downsampled.drop(feat_downsampled[['Year','FlightNum', 'TailNum',]],axis=1,inplace=True)

In [16]:
# Separate input features (X) and target variable (y)
y = feat_downsampled.mins_late
X= feat_downsampled.drop(feat_downsampled[['mins_late','UniqueCarrier','Origin','Dest']],axis=1)

### Importing Sklearn Libs

In [17]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest



In [18]:
# splitting X and y into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Linear Model

In [19]:
# calling linear model
Lr = LinearRegression()

In [20]:
# fitting model
Lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# testing model over 5 folds of data
cross_val_score(Lr.fit(X_train, y_train), X, y, cv=5)

array([ 1.,  1.,  1.,  1.,  1.])

## RFE and K-Best

Find the ranking of importants for our features

In [22]:
# See all feature ranks
nfeatures = 1

In [23]:
# setting up selector
rfe = RFE(Lr,nfeatures)

In [24]:
# fitting rfe
fit = rfe.fit(X,y)

In [32]:
# display of features importants
result_RFE = pd.DataFrame(list(fit.ranking_),index=X.columns)
result_RFE[0].sort_values()

ArrTime               1
CRSArrTime            2
Late                  3
TaxiIn                4
TaxiOut               5
AirTime               6
ActualElapsedTime     7
DayOfWeek             8
Month                 9
CRSElapsedTime       10
DayofMonth           11
ArrDelay             12
DepDelay             13
late30               14
DepTime              15
CRSDepTime           16
Cancelled            17
Distance             18
Diverted             19
Name: 0, dtype: int64

In [33]:
# setting up K-best to see P-Values
kbest = SelectKBest().fit(X,y)

  f = msb / msw
  f = msb / msw


In [35]:
# Putting results in a Dataframe
result_kb = pd.DataFrame(kbest.score_func(X,y)[1],index=X.columns)

  f = msb / msw
  f = msb / msw


In [49]:
# printing the P-Values for features
result_kb[0].sort_values()

Month                0.000000e+00
TaxiOut              0.000000e+00
TaxiIn               0.000000e+00
Distance             0.000000e+00
DepDelay             0.000000e+00
ArrDelay             0.000000e+00
AirTime              0.000000e+00
Late                 0.000000e+00
CRSElapsedTime       0.000000e+00
CRSArrTime           0.000000e+00
ArrTime              0.000000e+00
CRSDepTime           0.000000e+00
DepTime              0.000000e+00
ActualElapsedTime    0.000000e+00
late30               0.000000e+00
DayOfWeek            8.812974e-30
DayofMonth           5.149957e-01
Cancelled                     NaN
Diverted                      NaN
Name: 0, dtype: float64

In [42]:
# Removing last 4 features and re-establishing X and y
y = feat_downsampled.mins_late
X= feat_downsampled.drop(feat_downsampled[['DayofMonth', 'DayOfWeek','mins_late','Cancelled', 'Diverted', 'UniqueCarrier','Origin','Dest']],axis=1)

In [43]:
# splitting X and y into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Applying PCA

In [50]:
# standardizing data
ss = StandardScaler().fit_transform(X_train)

In [53]:
# using pca to select 1 components from our 15 remaining features
sklearn_pca = PCA(n_components=1)

# fitting our x training data to pca
Y_sklearn = sklearn_pca.fit_transform(ss)

# adding results to a new data frame
X_PCA = pd.DataFrame(Y_sklearn)

## New Regression Model

In [54]:
#Fitting Model
Lr.fit(X_PCA, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [55]:
# Checking Model Over 10 Folds
cross_val_score(Lr.fit(X_train, y_train), X, y, cv=10)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

### Linear Model Analysis

It's no surprise the linear model can predict the minutes late a flight will be with 100% accuracy.  Since simple regression uses y = mx + b to calculate the points. With this much information there are multiple ways it can caluculate the target variable

# Random Forest Model

In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.grid_search import GridSearchCV



In [57]:
# Calling model
RFR = RandomForestRegressor()

In [58]:
# creating parameters to test
param_grid = {'n_estimators':[10,50,100,150,200,300,400,500]}

In [59]:
# fitting grid with setting
grid = GridSearchCV(RFR,param_grid,verbose=3)

In [62]:
# fitting model with the best params
rfr =RandomForestRegressor(n_estimators=10)

In [63]:
# cross validating new model
cross_val_score(rfr.fit(X_PCA, y_train), X, y, cv=5)

array([ 0.99996907,  0.99996851,  0.9999995 ,  0.99999991,  0.99999994])

### Random Forest Analysis

The random forest model was also able to predict the target variable with nearly 100% accuracy.  This is no surprise since there are multiple decision trees calculating the target variable then sharing their prediction to choose the best outcome.

## K-Nearest Neighbor Model

In [64]:
# lib import
from sklearn.neighbors import KNeighborsRegressor

In [65]:
# setting up model
KNN = KNeighborsRegressor()

In [66]:
# creating list of parameters to test
param_grid = {'n_neighbors':[1,3,5,10,15,20,25,30,40,50,75,100],
              'weights':['uniform','distance']}

In [67]:
# setting grid search with model and params to test
grid = GridSearchCV(KNN,param_grid,verbose=3)

In [69]:
# best set of parameters
grid.best_params_

{'n_neighbors': 100, 'weights': 'uniform'}

In [75]:
# fitting model with best parameters
knn = KNeighborsRegressor(n_neighbors=100, weights='uniform')

In [76]:
# cross validation new model
cross_val_score(knn.fit(X_PCA, y_train), X, y, cv=5)

array([ 0.97970829,  0.980647  ,  0.99947748,  0.99972241,  0.99953048])

### KNN Analysis

The K-Nearest Neighbor model had the lowest accuracy of all with about 99%.  I am kind of surprising it wasn't at 100% also.  Since it finds the most similar data point to predict the target value off of. I would think it could get all the points correct using the closest arrival time and scheduled time.

# Conclusion

In this notebook, we broke down airline data regarding late arrivals.  Using a Linear regression model, random forest and nearest neighbor model.  Unfortunatley, do to the size of the dataset we were unable to an svm model or a gradient boosted model.  Both of those model use distance measurements from every point, so with  seven million data points and multiple categories it adds up fast causing crashes.  On the three models we were able to make we have very good results with 100% accuracy on 2 and 99% on the other. 