# Airline Arrivals

In [0]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
data_downloaded = drive.CreateFile({'id':'1EVMLcL5r1Vos8Jlsd2otiEHkcz33OPGI'})
data_downloaded.GetContentFile('data.csv')

In [0]:
#load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


%matplotlib inline

# set seed for reproducibility
np.random.seed(0)

In [0]:
# read in all our data
airline_arrival = pd.read_csv('data.csv')

In [0]:
pd.set_option('display.max_columns', None)

In [6]:
#Xem 5 dòng dữ liệu đầu tiên
airline_arrival.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,N612SW,88.0,90.0,78.0,-6.0,-4.0,IND,BWI,515,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


In [7]:
#Xem 5 dòng dữ liệu cuối cùng
airline_arrival.tail()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
7009723,2008,12,13,6,1002.0,959,1204.0,1150,DL,1636,N646DL,122.0,111.0,71.0,14.0,3.0,ATL,IAD,533,6.0,45.0,0,,0,,,,,
7009724,2008,12,13,6,834.0,835,1021.0,1023,DL,1637,N908DL,167.0,168.0,139.0,-2.0,-1.0,ATL,SAT,874,5.0,23.0,0,,0,,,,,
7009725,2008,12,13,6,655.0,700,856.0,856,DL,1638,N671DN,121.0,116.0,85.0,0.0,-5.0,PBI,ATL,545,24.0,12.0,0,,0,,,,,
7009726,2008,12,13,6,1251.0,1240,1446.0,1437,DL,1639,N646DL,115.0,117.0,89.0,9.0,11.0,IAD,ATL,533,13.0,13.0,0,,0,,,,,
7009727,2008,12,13,6,1110.0,1103,1413.0,1418,DL,1641,N908DL,123.0,135.0,104.0,-5.0,7.0,SAT,ATL,874,8.0,11.0,0,,0,,,,,


In [8]:
#Xem số dòng, số cột của dataframe
airline_arrival.shape

(7009728, 29)

In [0]:
# airline_arrival_pos = airline_arrival[airline_arrival['Cancelled'] == 1]
# airline_arrival_neutral = airline_arrival[airline_arrival['Cancelled'] == 0].sample(n=1009728, random_state=50)

In [0]:
# airline_arrival_pos.shape

(137434, 29)

In [0]:
# airline_arrival = airline_arrival_pos.append(airline_arrival_neutral);

In [9]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [0]:
airline_arrival['target'] = np.where(airline_arrival.eval("ArrDelay >= 30"), 1, 0)

In [11]:
airline_arrival['target'].value_counts()

0    6082313
1     927415
Name: target, dtype: int64

In [12]:
airline_arrival[airline_arrival['ArrDelay']>= 30].shape

(927415, 30)

In [13]:
airline_arrival.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,7009728.0,2008.0,0.0,2008.0,2008.0,2008.0,2008.0,2008.0
Month,7009728.0,6.37513,3.406737,1.0,3.0,6.0,9.0,12.0
DayofMonth,7009728.0,15.728015,8.797068,1.0,8.0,16.0,23.0,31.0
DayOfWeek,7009728.0,3.924182,1.988259,1.0,2.0,4.0,6.0,7.0
DepTime,6873482.0,1333.830046,478.068895,1.0,928.0,1325.0,1728.0,2400.0
CRSDepTime,7009728.0,1326.085663,464.250911,0.0,925.0,1320.0,1715.0,2359.0
ArrTime,6858079.0,1481.258227,505.225129,1.0,1107.0,1512.0,1909.0,2400.0
CRSArrTime,7009728.0,1494.801154,482.672822,0.0,1115.0,1517.0,1907.0,2400.0
FlightNum,7009728.0,2224.200105,1961.715999,1.0,622.0,1571.0,3518.0,9743.0
ActualElapsedTime,6855029.0,127.322424,70.187308,12.0,77.0,110.0,157.0,1379.0


## Data cleaning

In [14]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               136246
CRSDepTime                 0
ArrTime               151649
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                83365
ActualElapsedTime     154699
CRSElapsedTime           844
AirTime               154699
ArrDelay              154699
DepDelay              136246
Origin                     0
Dest                       0
Distance                   0
TaxiIn                151649
TaxiOut               137058
Cancelled                  0
CancellationCode     6872294
Diverted                   0
CarrierDelay         5484993
WeatherDelay         5484993
NASDelay             5484993
SecurityDelay        5484993
LateAircraftDelay    5484993
target                     0
dtype: int64

In [0]:
airline_arrival = airline_arrival[airline_arrival['ArrDelay'].notnull()]

In [16]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                    0
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                    5
ActualElapsedTime          0
CRSElapsedTime             0
AirTime                    0
ArrDelay                   0
DepDelay                   0
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     6855029
Diverted                   0
CarrierDelay         5330294
WeatherDelay         5330294
NASDelay             5330294
SecurityDelay        5330294
LateAircraftDelay    5330294
target                     0
dtype: int64

## Xử lý Missing Value

In [0]:
# airline_arrival['DepTime'].fillna((int)(airline_arrival['DepTime'].mean()), inplace=True)

In [0]:
# airline_arrival['ArrTime'].fillna((int)(airline_arrival['ArrTime'].mean()), inplace=True)

In [0]:
len(airline_arrival['TailNum'].unique())#.value_counts()

5367

In [17]:
airline_arrival['TailNum'].value_counts().head(1)

N476HA    4701
Name: TailNum, dtype: int64

In [0]:
airline_arrival['TailNum'].fillna('N476HA', inplace=True)

In [0]:
# airline_arrival['ActualElapsedTime'].fillna((int)(airline_arrival['ActualElapsedTime'].mean()), inplace=True)

In [0]:
# airline_arrival['CRSElapsedTime'].fillna((int)(airline_arrival['CRSElapsedTime'].mean()), inplace=True)

In [0]:
# airline_arrival['AirTime'].fillna((int)(airline_arrival['AirTime'].mean()), inplace=True)

In [0]:
# airline_arrival['ArrDelay'].fillna((int)(airline_arrival['ArrDelay'].mean()), inplace=True)

In [0]:
# airline_arrival['DepDelay'].fillna((int)(airline_arrival['DepDelay'].mean()), inplace=True)

In [0]:
# airline_arrival['TaxiIn'].fillna((int)(airline_arrival['TaxiIn'].mean()), inplace=True)

In [0]:
# airline_arrival['TaxiOut'].fillna((int)(airline_arrival['TaxiOut'].mean()), inplace=True)

In [19]:
airline_arrival.CancellationCode.value_counts()

Series([], Name: CancellationCode, dtype: int64)

In [0]:
airline_arrival[airline_arrival['CancellationCode'].isnull()].shape

(6855029, 30)

In [0]:
airline_arrival[airline_arrival['CancellationCode'].notnull()].shape

(0, 30)

In [0]:
airline_arrival.shape

(6855029, 30)

In [0]:
airline_arrival.drop(['CancellationCode'], axis = 1, inplace = True)

In [21]:
airline_arrival[airline_arrival['CarrierDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['CarrierDelay'].fillna((int)(airline_arrival['CarrierDelay'].mean()), inplace=True)

In [0]:
airline_arrival[airline_arrival['WeatherDelay'].notnull()].shape
airline_arrival['WeatherDelay'].fillna((int)(airline_arrival['WeatherDelay'].mean()), inplace=True)

In [24]:
airline_arrival[airline_arrival['NASDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['NASDelay'].fillna((int)(airline_arrival['NASDelay'].mean()), inplace=True)

In [26]:
airline_arrival[airline_arrival['SecurityDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['SecurityDelay'].fillna((int)(airline_arrival['SecurityDelay'].mean()), inplace=True)

In [28]:
airline_arrival[airline_arrival['LateAircraftDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['LateAircraftDelay'].fillna((int)(airline_arrival['LateAircraftDelay'].mean()), inplace=True)

In [30]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
Diverted             0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
target               0
dtype: int64

In [0]:
airline_arrival.dtypes

Year                   int64
Month                  int64
DayofMonth             int64
DayOfWeek              int64
DepTime              float64
CRSDepTime             int64
ArrTime              float64
CRSArrTime             int64
UniqueCarrier         object
FlightNum              int64
TailNum               object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin                object
Dest                  object
Distance               int64
TaxiIn               float64
TaxiOut              float64
Cancelled              int64
Diverted               int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
target                 int32
dtype: object

In [0]:
len(airline_arrival.UniqueCarrier.unique())#value_counts()

20

In [0]:
len(airline_arrival.TailNum.unique())#value_counts()

5366

In [0]:
len(airline_arrival.Origin.unique())#value_counts()

303

In [0]:
len(airline_arrival.Dest.unique())#value_counts()

302

In [0]:
#airline_arrival = pd.get_dummies(airline_arrival, columns=['UniqueCarrier', 'TailNum', 'Origin', 'Dest']);

In [0]:
le = LabelEncoder()

airline_arrival['UniqueCarrier_LabelEncoded'] = le.fit_transform(airline_arrival.UniqueCarrier)

In [0]:
airline_arrival['TailNum_LabelEncoded'] = le.fit_transform(airline_arrival.TailNum)

In [0]:
airline_arrival['Origin_LabelEncoded'] = le.fit_transform(airline_arrival.Origin)

In [0]:
airline_arrival['Dest_LabelEncoded'] = le.fit_transform(airline_arrival.Dest)

In [0]:
airline_arrival.drop(['UniqueCarrier'], axis = 1, inplace = True)
airline_arrival.drop(['TailNum'], axis = 1, inplace = True)
airline_arrival.drop(['Origin'], axis = 1, inplace = True)
airline_arrival.drop(['Dest'], axis = 1, inplace = True)

In [0]:
airline_arrival.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,target,UniqueCarrier_LabelEncoded,TailNum_LabelEncoded,Origin_LabelEncoded,Dest_LabelEncoded
0,2008,1,3,4,2003.0,1955,2211.0,2225,335,128.0,150.0,116.0,-14.0,8.0,810,4.0,8.0,0,0,15.0,3.0,17.0,0.0,20.0,0,17,3761,135,284
1,2008,1,3,4,754.0,735,1002.0,1000,3231,128.0,145.0,113.0,2.0,19.0,810,5.0,10.0,0,0,15.0,3.0,17.0,0.0,20.0,0,17,4121,135,284
2,2008,1,3,4,628.0,620,804.0,750,448,96.0,90.0,76.0,14.0,8.0,515,3.0,17.0,0,0,15.0,3.0,17.0,0.0,20.0,0,17,1953,140,48
3,2008,1,3,4,926.0,930,1054.0,1100,1746,88.0,90.0,78.0,-6.0,-4.0,515,3.0,7.0,0,0,15.0,3.0,17.0,0.0,20.0,0,17,3051,140,48
4,2008,1,3,4,1829.0,1755,1959.0,1925,3920,90.0,90.0,77.0,34.0,34.0,515,3.0,10.0,0,0,2.0,0.0,0.0,0.0,32.0,1,17,2134,140,48


In [0]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'FlightNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance',
       'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'target', 'UniqueCarrier_LabelEncoded', 'TailNum_LabelEncoded',
       'Origin_LabelEncoded', 'Dest_LabelEncoded'],
      dtype='object')

### Xử lý kiểu Numberic, đưa về cùng 1 scale (mean=0, std=1)

In [0]:
airline_arrival['FlightNum'].head()

0     335
1    3231
2     448
3    1746
4    3920
Name: FlightNum, dtype: int64

In [0]:
#DepTime, CRSDepTime, ArrTime, CRSArrTime: hhmm -> mm

In [0]:
airline_arrival['DepTime'] = ((airline_arrival['DepTime'] - (airline_arrival['DepTime'] % 100)) / 100) * 60 + (airline_arrival['DepTime'] % 100)

In [0]:
airline_arrival['CRSDepTime'] = ((airline_arrival['CRSDepTime'] - (airline_arrival['CRSDepTime'] % 100)) / 100) * 60 + (airline_arrival['CRSDepTime'] % 100)

In [0]:
airline_arrival['ArrTime'] = ((airline_arrival['ArrTime'] - (airline_arrival['ArrTime'] % 100)) / 100) * 60 + (airline_arrival['ArrTime'] % 100)

In [0]:
airline_arrival['CRSArrTime'] = ((airline_arrival['CRSArrTime'] - (airline_arrival['CRSArrTime'] % 100)) / 100) * 60 + (airline_arrival['CRSArrTime'] % 100)

In [0]:
# airline_arrival['Year'] = (airline_arrival['Year'] - airline_arrival['Year'].mean())/airline_arrival['Year'].std()
airline_arrival['Month'] = (airline_arrival['Month'] - airline_arrival['Month'].mean())/airline_arrival['Month'].std()
airline_arrival['DayofMonth'] = (airline_arrival['DayofMonth'] - airline_arrival['DayofMonth'].mean())/airline_arrival['DayofMonth'].std()
airline_arrival['DayOfWeek'] = (airline_arrival['DayOfWeek'] - airline_arrival['DayOfWeek'].mean())/airline_arrival['DayOfWeek'].std()
airline_arrival['DepTime'] = (airline_arrival['DepTime'] - airline_arrival['DepTime'].mean())/airline_arrival['DepTime'].std()
airline_arrival['CRSDepTime'] = (airline_arrival['CRSDepTime'] - airline_arrival['CRSDepTime'].mean())/airline_arrival['CRSDepTime'].std()
airline_arrival['ArrTime'] = (airline_arrival['ArrTime'] - airline_arrival['ArrTime'].mean())/airline_arrival['ArrTime'].std()
airline_arrival['CRSArrTime'] = (airline_arrival['CRSArrTime'] - airline_arrival['CRSArrTime'].mean())/airline_arrival['CRSArrTime'].std()
airline_arrival['FlightNum'] = (airline_arrival['FlightNum'] - airline_arrival['FlightNum'].mean())/airline_arrival['FlightNum'].std()
airline_arrival['ActualElapsedTime'] = (airline_arrival['ActualElapsedTime'] - airline_arrival['ActualElapsedTime'].mean())/airline_arrival['ActualElapsedTime'].std()
airline_arrival['CRSElapsedTime'] = (airline_arrival['CRSElapsedTime'] - airline_arrival['CRSElapsedTime'].mean())/airline_arrival['CRSElapsedTime'].std()
airline_arrival['AirTime'] = (airline_arrival['AirTime'] - airline_arrival['AirTime'].mean())/airline_arrival['AirTime'].std()
airline_arrival['ArrDelay'] = (airline_arrival['ArrDelay'] - airline_arrival['ArrDelay'].mean())/airline_arrival['ArrDelay'].std()
airline_arrival['DepDelay'] = (airline_arrival['DepDelay'] - airline_arrival['DepDelay'].mean())/airline_arrival['DepDelay'].std()
airline_arrival['Distance'] = (airline_arrival['Distance'] - airline_arrival['Distance'].mean())/airline_arrival['Distance'].std()
airline_arrival['TaxiIn'] = (airline_arrival['TaxiIn'] - airline_arrival['TaxiIn'].mean())/airline_arrival['TaxiIn'].std()
airline_arrival['TaxiOut'] = (airline_arrival['TaxiOut'] - airline_arrival['TaxiOut'].mean())/airline_arrival['TaxiOut'].std()
# airline_arrival['Diverted'] = (airline_arrival['Diverted'] - airline_arrival['Diverted'].mean())/airline_arrival['Diverted'].std()
airline_arrival['CarrierDelay'] = (airline_arrival['CarrierDelay'] - airline_arrival['CarrierDelay'].mean())/airline_arrival['CarrierDelay'].std()
airline_arrival['WeatherDelay'] = (airline_arrival['WeatherDelay'] - airline_arrival['WeatherDelay'].mean())/airline_arrival['WeatherDelay'].std()
airline_arrival['NASDelay'] = (airline_arrival['NASDelay'] - airline_arrival['NASDelay'].mean())/airline_arrival['NASDelay'].std()
airline_arrival['SecurityDelay'] = (airline_arrival['SecurityDelay'] - airline_arrival['SecurityDelay'].mean())/airline_arrival['SecurityDelay'].std()
airline_arrival['LateAircraftDelay'] = (airline_arrival['LateAircraftDelay'] - airline_arrival['LateAircraftDelay'].mean())/airline_arrival['LateAircraftDelay'].std()
# airline_arrival['UniqueCarrier_LabelEncoded'] = (airline_arrival['UniqueCarrier_LabelEncoded'] - airline_arrival['UniqueCarrier_LabelEncoded'].mean())/airline_arrival['UniqueCarrier_LabelEncoded'].std()
# airline_arrival['TailNum_LabelEncoded'] = (airline_arrival['TailNum_LabelEncoded'] - airline_arrival['TailNum_LabelEncoded'].mean())/airline_arrival['TailNum_LabelEncoded'].std()
# airline_arrival['Origin_LabelEncoded'] = (airline_arrival['Origin_LabelEncoded'] - airline_arrival['Origin_LabelEncoded'].mean())/airline_arrival['Origin_LabelEncoded'].std()
# airline_arrival['Dest_LabelEncoded'] = (airline_arrival['Dest_LabelEncoded'] - airline_arrival['Dest_LabelEncoded'].mean())/airline_arrival['Dest_LabelEncoded'].std()

In [0]:
# len(airline_arrival.Cancelled.unique())

2

In [0]:
# airline_arrival.Cancelled.value_counts()

0    1009728
1     137434
Name: Cancelled, dtype: int64

In [0]:
# fig = plt.figure(figsize=(8,6))
# airline_arrival.groupby('Cancelled').Year.count().plot.bar(ylim=0)
# plt.show()

In [0]:
airline_arrival = airline_arrival.sample(n=5000, random_state=50)

## Model

In [0]:
feature_columns = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'FlightNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance',
       'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'UniqueCarrier_LabelEncoded', 'TailNum_LabelEncoded',
       'Origin_LabelEncoded', 'Dest_LabelEncoded']
X = airline_arrival[feature_columns]
y = airline_arrival['target']

In [0]:
X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,UniqueCarrier_LabelEncoded,TailNum_LabelEncoded,Origin_LabelEncoded,Dest_LabelEncoded
6050882,1.356782,-1.447351,-1.471391,-1.580854,-1.595727,-1.423156,-1.379646,0.049321,0.608053,1.135037,0.770782,-1.225093,-0.310302,0.689321,-0.782798,-0.48129,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,18,611,224,136
6337246,1.356782,-0.197505,0.540216,-0.440286,-0.414204,-0.529423,-0.544573,0.015075,-1.044668,-0.993458,-0.964098,-0.471884,-0.39552,-0.967153,-0.174429,-0.658115,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,0,119,133,181
2321555,-0.702748,0.597851,-1.471391,0.916536,0.982143,0.793038,0.787402,-0.953012,-0.873697,-1.050985,-0.993754,-0.082293,-0.423926,-0.896136,-0.580008,0.756485,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,3,2663,258,111
6057490,1.356782,-0.879239,1.043118,-1.577366,-1.595727,-1.809011,-1.88345,2.52525,-1.073163,-0.964694,-0.94927,-0.471884,-0.281896,-1.01509,-0.782798,-0.658115,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,19,1556,109,210
2682393,-0.40853,1.620452,0.540216,-0.621661,-0.593222,-0.470061,-0.554925,-1.128842,1.662374,1.408289,1.452872,0.073543,-0.452332,1.262784,0.02836,1.64061,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,16,4392,210,220


In [43]:
y.value_counts()

0    4335
1     665
Name: target, dtype: int64

# LogisticRegression

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1099
          1       0.99      0.97      0.98       151

avg / total       1.00      1.00      1.00      1250



In [0]:
# Create logistic regression
logistic = LogisticRegression()

In [0]:
# Create Hyperparameter Search Space
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [0]:
# Create Grid Search
# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

In [0]:
# Fit grid search
best_model = clf.fit(X_train, y_train)

In [49]:
# View Hyperparameter Values Of Best Model
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l1
Best C: 1291.5496650148827


In [0]:
# Predict Using Best Model
# Predict target vector
y_pred = best_model.predict(X_test)

In [51]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1099
          1       0.99      1.00      1.00       151

avg / total       1.00      1.00      1.00      1250



# SVM

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.00      0.00      0.00       151

avg / total       0.77      0.88      0.82      1250



  'precision', 'predicted', average, warn_for)


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(), tuned_parameters, cv=5)
# Fit grid search
best_model = clf.fit(X_train, y_train)

print(clf.best_params_)

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


KeyboardInterrupt: ignored

# Random Forest Classifier

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    252736
          1       1.00      1.00      1.00     34055

avg / total       1.00      1.00      1.00    286791



In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
best_rfc_model = CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

y_pred = best_rfc_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Decision Tree

In [0]:

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1099
          1       1.00      1.00      1.00       151

avg / total       1.00      1.00      1.00      1250



In [0]:
from sklearn.model_selection import StratifiedKFold
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
decision_tree_classifier = tree.DecisionTreeClassifier(random_state=0)

parameter_grid = {'max_depth': [1, 2, 3, 4, 5],
                  'max_features': [1, 2, 3, 4]}

cross_validation = StratifiedKFold(n_splits=2, random_state=None, shuffle=False)

grid_search_decision_tree = GridSearchCV(decision_tree_classifier, param_grid = parameter_grid,
                          cv = cross_validation)

best_decision_tree_model = grid_search_decision_tree.fit(X_train, y_train)

print("Best Score: {}".format(grid_search_decision_tree.best_score_))
print("Best params: {}".format(grid_search_decision_tree.best_params_))

y_pred = best_decision_tree_model.predict(X_test)
print(classification_report(y_test, y_pred))

Best Score: 0.9818666666666667
Best params: {'max_depth': 5, 'max_features': 4}
             precision    recall  f1-score   support

          0       0.98      0.98      0.98      1099
          1       0.85      0.84      0.85       151

avg / total       0.96      0.96      0.96      1250



# GaussianNB

In [0]:
from sklearn.naive_bayes import BernoulliNB
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = BernoulliNB().fit(X_train, y_train)

y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.99      0.92      0.95      1099
          1       0.60      0.94      0.74       151

avg / total       0.94      0.92      0.93      1250



In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
BernoulliNB_classifier = BernoulliNB()

parameters = {'alpha':[1, 10, 100]}

grid_search_BernoulliNB = GridSearchCV(BernoulliNB_classifier, parameters)

best_BernoulliNB_model = grid_search_BernoulliNB.fit(X_train, y_train)

print("Best Score: {}".format(grid_search_BernoulliNB.best_score_))
print("Best params: {}".format(grid_search_BernoulliNB.best_params_))

y_pred = best_BernoulliNB_model.predict(X_test)
print(classification_report(y_test, y_pred))

Best Score: 0.9434666666666667
Best params: {'alpha': 100}
             precision    recall  f1-score   support

          0       0.98      0.96      0.97      1099
          1       0.72      0.85      0.78       151

avg / total       0.95      0.94      0.94      1250



In [0]:
# GradientBoostingClassifier

In [60]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
# learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.25, max_features=2, max_depth = 2, random_state = 0)
gb_clf.fit(X_train, y_train)
y_pred = gb_clf.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[1098    1]
 [   5  146]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1099
          1       0.99      0.97      0.98       151

avg / total       1.00      1.00      1.00      1250



In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
param_test1 = {'n_estimators':range(20,51,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

best_GradientBoostingClassifier_model = gsearch1.fit(X_train,y_train)
y_pred = best_GradientBoostingClassifier_model.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[1099    0]
 [   0  151]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1099
          1       1.00      1.00      1.00       151

avg / total       1.00      1.00      1.00      1250

