# Airline Arrivals

In [0]:
!pip install -U -q PyDrive
 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
data_downloaded = drive.CreateFile({'id':'1EVMLcL5r1Vos8Jlsd2otiEHkcz33OPGI'})
data_downloaded.GetContentFile('data.csv')

In [0]:
#load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif

from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, StratifiedKFold


%matplotlib inline

# set seed for reproducibility
np.random.seed(0)

In [0]:
# read in all our data
airline_arrival = pd.read_csv('data.csv')

In [0]:
pd.set_option('display.max_columns', None)

In [6]:
#Xem 5 dòng dữ liệu đầu tiên
airline_arrival.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,N612SW,88.0,90.0,78.0,-6.0,-4.0,IND,BWI,515,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


In [7]:
#Xem 5 dòng dữ liệu cuối cùng
airline_arrival.tail()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
7009723,2008,12,13,6,1002.0,959,1204.0,1150,DL,1636,N646DL,122.0,111.0,71.0,14.0,3.0,ATL,IAD,533,6.0,45.0,0,,0,,,,,
7009724,2008,12,13,6,834.0,835,1021.0,1023,DL,1637,N908DL,167.0,168.0,139.0,-2.0,-1.0,ATL,SAT,874,5.0,23.0,0,,0,,,,,
7009725,2008,12,13,6,655.0,700,856.0,856,DL,1638,N671DN,121.0,116.0,85.0,0.0,-5.0,PBI,ATL,545,24.0,12.0,0,,0,,,,,
7009726,2008,12,13,6,1251.0,1240,1446.0,1437,DL,1639,N646DL,115.0,117.0,89.0,9.0,11.0,IAD,ATL,533,13.0,13.0,0,,0,,,,,
7009727,2008,12,13,6,1110.0,1103,1413.0,1418,DL,1641,N908DL,123.0,135.0,104.0,-5.0,7.0,SAT,ATL,874,8.0,11.0,0,,0,,,,,


In [8]:
#Xem số dòng, số cột của dataframe
airline_arrival.shape

(7009728, 29)

In [9]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

### target value

In [0]:
airline_arrival['target'] = np.where(airline_arrival.eval("ArrDelay >= 30"), 1, 0)

In [11]:
airline_arrival['target'].value_counts()

0    6082313
1     927415
Name: target, dtype: int64

In [12]:
airline_arrival[airline_arrival['ArrDelay']>= 30].shape

(927415, 30)

In [13]:
airline_arrival.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,7009728.0,2008.0,0.0,2008.0,2008.0,2008.0,2008.0,2008.0
Month,7009728.0,6.37513,3.406737,1.0,3.0,6.0,9.0,12.0
DayofMonth,7009728.0,15.728015,8.797068,1.0,8.0,16.0,23.0,31.0
DayOfWeek,7009728.0,3.924182,1.988259,1.0,2.0,4.0,6.0,7.0
DepTime,6873482.0,1333.830046,478.068895,1.0,928.0,1325.0,1728.0,2400.0
CRSDepTime,7009728.0,1326.085663,464.250911,0.0,925.0,1320.0,1715.0,2359.0
ArrTime,6858079.0,1481.258227,505.225129,1.0,1107.0,1512.0,1909.0,2400.0
CRSArrTime,7009728.0,1494.801154,482.672822,0.0,1115.0,1517.0,1907.0,2400.0
FlightNum,7009728.0,2224.200105,1961.715999,1.0,622.0,1571.0,3518.0,9743.0
ActualElapsedTime,6855029.0,127.322424,70.187308,12.0,77.0,110.0,157.0,1379.0


## Data cleaning

In [14]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               136246
CRSDepTime                 0
ArrTime               151649
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                83365
ActualElapsedTime     154699
CRSElapsedTime           844
AirTime               154699
ArrDelay              154699
DepDelay              136246
Origin                     0
Dest                       0
Distance                   0
TaxiIn                151649
TaxiOut               137058
Cancelled                  0
CancellationCode     6872294
Diverted                   0
CarrierDelay         5484993
WeatherDelay         5484993
NASDelay             5484993
SecurityDelay        5484993
LateAircraftDelay    5484993
target                     0
dtype: int64

In [0]:
airline_arrival = airline_arrival[airline_arrival['ArrDelay'].notnull()]

In [16]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                    0
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                    5
ActualElapsedTime          0
CRSElapsedTime             0
AirTime                    0
ArrDelay                   0
DepDelay                   0
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     6855029
Diverted                   0
CarrierDelay         5330294
WeatherDelay         5330294
NASDelay             5330294
SecurityDelay        5330294
LateAircraftDelay    5330294
target                     0
dtype: int64

## Xử lý Missing Value

In [17]:
len(airline_arrival['TailNum'].unique())#.value_counts()

5367

In [18]:
airline_arrival['TailNum'].value_counts().head(5)

N476HA    4701
N477HA    4546
N484HA    4504
N475HA    4496
N480HA    4416
Name: TailNum, dtype: int64

In [0]:
airline_arrival['TailNum'].fillna('N476HA', inplace=True)

In [20]:
airline_arrival[airline_arrival['CancellationCode'].isnull()].shape

(6855029, 30)

In [21]:
airline_arrival[airline_arrival['CancellationCode'].notnull()].shape

(0, 30)

In [0]:
airline_arrival.drop(['CancellationCode'], axis = 1, inplace = True)

In [23]:
airline_arrival[airline_arrival['CarrierDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['CarrierDelay'].fillna((int)(airline_arrival['CarrierDelay'].mean()), inplace=True)

In [25]:
airline_arrival[airline_arrival['WeatherDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['WeatherDelay'].fillna((int)(airline_arrival['WeatherDelay'].mean()), inplace=True)

In [27]:
airline_arrival[airline_arrival['NASDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['NASDelay'].fillna((int)(airline_arrival['NASDelay'].mean()), inplace=True)

In [29]:
airline_arrival[airline_arrival['SecurityDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['SecurityDelay'].fillna((int)(airline_arrival['SecurityDelay'].mean()), inplace=True)

In [31]:
airline_arrival[airline_arrival['LateAircraftDelay'].notnull()].shape

(1524735, 29)

In [0]:
airline_arrival['LateAircraftDelay'].fillna((int)(airline_arrival['LateAircraftDelay'].mean()), inplace=True)

In [33]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
Diverted             0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
target               0
dtype: int64

In [34]:
len(airline_arrival.UniqueCarrier.unique())#value_counts()

20

In [35]:
len(airline_arrival.TailNum.unique())#value_counts()

5366

In [36]:
len(airline_arrival.Origin.unique())#value_counts()

303

In [37]:
len(airline_arrival.Dest.unique())#value_counts()

302

In [0]:
le = LabelEncoder()
airline_arrival['TailNum_LabelEncoded'] = le.fit_transform(airline_arrival.TailNum)

In [0]:
airline_arrival.drop(['TailNum'], axis = 1, inplace = True)

In [40]:
airline_arrival.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,target,TailNum_LabelEncoded
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,0,15.0,3.0,17.0,0.0,20.0,0,3761
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,0,15.0,3.0,17.0,0.0,20.0,0,4121
2,2008,1,3,4,628.0,620,804.0,750,WN,448,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,0,15.0,3.0,17.0,0.0,20.0,0,1953
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,88.0,90.0,78.0,-6.0,-4.0,IND,BWI,515,3.0,7.0,0,0,15.0,3.0,17.0,0.0,20.0,0,3051
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,0,2.0,0.0,0.0,0.0,32.0,1,2134


In [41]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
       'SecurityDelay', 'LateAircraftDelay', 'target', 'TailNum_LabelEncoded'],
      dtype='object')

In [0]:
### DepTime, CRSDepTime, ArrTime, CRSArrTime có format hhmm chuyển về mm

In [0]:
airline_arrival['DepTime'] = ((airline_arrival['DepTime'] - (airline_arrival['DepTime'] % 100)) / 100) * 60 + (airline_arrival['DepTime'] % 100)

In [0]:
airline_arrival['CRSDepTime'] = ((airline_arrival['CRSDepTime'] - (airline_arrival['CRSDepTime'] % 100)) / 100) * 60 + (airline_arrival['CRSDepTime'] % 100)

In [0]:
airline_arrival['ArrTime'] = ((airline_arrival['ArrTime'] - (airline_arrival['ArrTime'] % 100)) / 100) * 60 + (airline_arrival['ArrTime'] % 100)

In [0]:
airline_arrival['CRSArrTime'] = ((airline_arrival['CRSArrTime'] - (airline_arrival['CRSArrTime'] % 100)) / 100) * 60 + (airline_arrival['CRSArrTime'] % 100)

### Xử lý kiểu Numberic, đưa về cùng 1 scale (mean=0, std=1)

In [0]:
# airline_arrival['Year'] = (airline_arrival['Year'] - airline_arrival['Year'].mean())/airline_arrival['Year'].std()
airline_arrival['Month'] = (airline_arrival['Month'] - airline_arrival['Month'].mean())/airline_arrival['Month'].std()
airline_arrival['DayofMonth'] = (airline_arrival['DayofMonth'] - airline_arrival['DayofMonth'].mean())/airline_arrival['DayofMonth'].std()
airline_arrival['DayOfWeek'] = (airline_arrival['DayOfWeek'] - airline_arrival['DayOfWeek'].mean())/airline_arrival['DayOfWeek'].std()
airline_arrival['DepTime'] = (airline_arrival['DepTime'] - airline_arrival['DepTime'].mean())/airline_arrival['DepTime'].std()
airline_arrival['CRSDepTime'] = (airline_arrival['CRSDepTime'] - airline_arrival['CRSDepTime'].mean())/airline_arrival['CRSDepTime'].std()
airline_arrival['ArrTime'] = (airline_arrival['ArrTime'] - airline_arrival['ArrTime'].mean())/airline_arrival['ArrTime'].std()
airline_arrival['CRSArrTime'] = (airline_arrival['CRSArrTime'] - airline_arrival['CRSArrTime'].mean())/airline_arrival['CRSArrTime'].std()
airline_arrival['FlightNum'] = (airline_arrival['FlightNum'] - airline_arrival['FlightNum'].mean())/airline_arrival['FlightNum'].std()
airline_arrival['ActualElapsedTime'] = (airline_arrival['ActualElapsedTime'] - airline_arrival['ActualElapsedTime'].mean())/airline_arrival['ActualElapsedTime'].std()
airline_arrival['CRSElapsedTime'] = (airline_arrival['CRSElapsedTime'] - airline_arrival['CRSElapsedTime'].mean())/airline_arrival['CRSElapsedTime'].std()
airline_arrival['AirTime'] = (airline_arrival['AirTime'] - airline_arrival['AirTime'].mean())/airline_arrival['AirTime'].std()
airline_arrival['ArrDelay'] = (airline_arrival['ArrDelay'] - airline_arrival['ArrDelay'].mean())/airline_arrival['ArrDelay'].std()
airline_arrival['DepDelay'] = (airline_arrival['DepDelay'] - airline_arrival['DepDelay'].mean())/airline_arrival['DepDelay'].std()
airline_arrival['Distance'] = (airline_arrival['Distance'] - airline_arrival['Distance'].mean())/airline_arrival['Distance'].std()
airline_arrival['TaxiIn'] = (airline_arrival['TaxiIn'] - airline_arrival['TaxiIn'].mean())/airline_arrival['TaxiIn'].std()
airline_arrival['TaxiOut'] = (airline_arrival['TaxiOut'] - airline_arrival['TaxiOut'].mean())/airline_arrival['TaxiOut'].std()
# airline_arrival['Diverted'] = (airline_arrival['Diverted'] - airline_arrival['Diverted'].mean())/airline_arrival['Diverted'].std()
airline_arrival['CarrierDelay'] = (airline_arrival['CarrierDelay'] - airline_arrival['CarrierDelay'].mean())/airline_arrival['CarrierDelay'].std()
airline_arrival['WeatherDelay'] = (airline_arrival['WeatherDelay'] - airline_arrival['WeatherDelay'].mean())/airline_arrival['WeatherDelay'].std()
airline_arrival['NASDelay'] = (airline_arrival['NASDelay'] - airline_arrival['NASDelay'].mean())/airline_arrival['NASDelay'].std()
airline_arrival['SecurityDelay'] = (airline_arrival['SecurityDelay'] - airline_arrival['SecurityDelay'].mean())/airline_arrival['SecurityDelay'].std()
airline_arrival['LateAircraftDelay'] = (airline_arrival['LateAircraftDelay'] - airline_arrival['LateAircraftDelay'].mean())/airline_arrival['LateAircraftDelay'].std()
# airline_arrival['UniqueCarrier_LabelEncoded'] = (airline_arrival['UniqueCarrier_LabelEncoded'] - airline_arrival['UniqueCarrier_LabelEncoded'].mean())/airline_arrival['UniqueCarrier_LabelEncoded'].std()
# airline_arrival['TailNum_LabelEncoded'] = (airline_arrival['TailNum_LabelEncoded'] - airline_arrival['TailNum_LabelEncoded'].mean())/airline_arrival['TailNum_LabelEncoded'].std()
# airline_arrival['Origin_LabelEncoded'] = (airline_arrival['Origin_LabelEncoded'] - airline_arrival['Origin_LabelEncoded'].mean())/airline_arrival['Origin_LabelEncoded'].std()
# airline_arrival['Dest_LabelEncoded'] = (airline_arrival['Dest_LabelEncoded'] - airline_arrival['Dest_LabelEncoded'].mean())/airline_arrival['Dest_LabelEncoded'].std()

In [48]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
       'SecurityDelay', 'LateAircraftDelay', 'target', 'TailNum_LabelEncoded'],
      dtype='object')

# Chọn số sample để train

In [0]:
airline_arrival = airline_arrival.sample(n=5000, random_state=50)

In [0]:
feature_columns = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'Origin', 'Dest', 'Distance',
       'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'TailNum_LabelEncoded']
#'ArrDelay', 'DepDelay', 
X = airline_arrival[feature_columns]
y = airline_arrival['target']

In [0]:
X = pd.get_dummies(X, columns=['UniqueCarrier', 'Origin', 'Dest']);

In [52]:
X.shape

(5000, 505)

In [0]:
pca = PCA(n_components=400)
principalComponents = pca.fit_transform(X)

name_components = []
for i in range(400):
    name_components.append('principal component ' + str(i + 1))

X_pca = pd.DataFrame(data = principalComponents, columns = name_components)

In [54]:
selector = SelectKBest(f_classif, k=200)
selector.fit(X, y)

X_SelectKBest = selector.transform(X)
X_SelectKBest.shape

  f = msb / msw


(5000, 200)

In [55]:
X.shape

(5000, 505)

In [56]:
X_pca.shape

(5000, 400)

In [57]:
X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,TailNum_LabelEncoded,UniqueCarrier_9E,UniqueCarrier_AA,UniqueCarrier_AQ,UniqueCarrier_AS,UniqueCarrier_B6,UniqueCarrier_CO,UniqueCarrier_DL,UniqueCarrier_EV,UniqueCarrier_F9,UniqueCarrier_FL,UniqueCarrier_HA,UniqueCarrier_MQ,UniqueCarrier_NW,UniqueCarrier_OH,UniqueCarrier_OO,UniqueCarrier_UA,UniqueCarrier_US,UniqueCarrier_WN,UniqueCarrier_XE,UniqueCarrier_YV,Origin_ABE,Origin_ABI,Origin_ABQ,Origin_ACT,Origin_ACV,Origin_ADQ,Origin_AEX,Origin_AGS,Origin_ALB,Origin_ALO,Origin_AMA,Origin_ANC,Origin_ASE,Origin_ATL,Origin_ATW,Origin_AUS,Origin_AVL,Origin_AVP,Origin_BDL,Origin_BFL,Origin_BGM,Origin_BHM,Origin_BIL,Origin_BIS,Origin_BMI,Origin_BNA,Origin_BOI,Origin_BOS,Origin_BRO,Origin_BTM,Origin_BTR,Origin_BTV,Origin_BUF,Origin_BUR,Origin_BWI,Origin_BZN,Origin_CAE,Origin_CAK,Origin_CEC,Origin_CHA,Origin_CHS,Origin_CID,Origin_CLD,Origin_CLE,Origin_CLT,Origin_CMH,Origin_CMI,Origin_COS,Origin_CPR,Origin_CRP,Origin_CRW,Origin_CSG,Origin_CVG,Origin_DAB,Origin_DAL,Origin_DAY,Origin_DBQ,Origin_DCA,Origin_DEN,Origin_DFW,Origin_DRO,Origin_DSM,Origin_DTW,Origin_EGE,Origin_EKO,Origin_ELM,Origin_ELP,Origin_EUG,Origin_EVV,Origin_EWR,Origin_EYW,Origin_FAI,Origin_FAR,Origin_FAT,Origin_FCA,Origin_FLL,Origin_FLO,Origin_FNT,Origin_FSD,Origin_FWA,Origin_GCC,Origin_GEG,Origin_GFK,Origin_GGG,Origin_GJT,Origin_GNV,Origin_GPT,Origin_GRB,Origin_GRK,Origin_GRR,Origin_GSO,Origin_GSP,Origin_GST,Origin_GTF,Origin_HDN,Origin_HNL,Origin_HOU,Origin_HPN,Origin_HRL,Origin_HSV,Origin_IAD,Origin_IAH,Origin_ICT,Origin_IDA,Origin_ILM,Origin_IND,Origin_IPL,Origin_ISP,Origin_ITO,Origin_IYK,Origin_JAC,Origin_JAN,Origin_JAX,Origin_JFK,Origin_JNU,Origin_KOA,Origin_KTN,Origin_LAN,Origin_LAS,Origin_LAW,Origin_LAX,Origin_LBB,Origin_LCH,Origin_LEX,Origin_LFT,Origin_LGA,Origin_LGB,Origin_LIH,Origin_LIT,Origin_LRD,Origin_LWS,Origin_MAF,Origin_MBS,Origin_MCI,Origin_MCO,Origin_MDT,Origin_MDW,Origin_MEM,Origin_MFE,Origin_MFR,Origin_MGM,Origin_MHT,Origin_MIA,Origin_MKE,Origin_MLB,Origin_MLI,Origin_MLU,Origin_MOB,Origin_MOD,Origin_MRY,Origin_MSN,Origin_MSO,Origin_MSP,Origin_MSY,Origin_MTJ,Origin_MYR,Origin_OAJ,Origin_OAK,Origin_OGG,Origin_OKC,Origin_OMA,Origin_OME,Origin_ONT,Origin_ORD,Origin_ORF,Origin_OTH,Origin_OXR,Origin_PBI,Origin_PDX,Origin_PFN,Origin_PHL,Origin_PHX,Origin_PIA,Origin_PIH,Origin_PIT,Origin_PLN,Origin_PNS,Origin_PSC,Origin_PSG,Origin_PSP,Origin_PVD,Origin_PWM,Origin_RAP,Origin_RDD,Origin_RDM,Origin_RDU,Origin_RIC,Origin_RNO,Origin_ROA,Origin_ROC,Origin_ROW,Origin_RST,Origin_RSW,Origin_SAN,Origin_SAT,Origin_SAV,Origin_SBA,Origin_SBN,Origin_SBP,Origin_SDF,Origin_SEA,Origin_SFO,Origin_SGF,Origin_SGU,Origin_SHV,Origin_SIT,Origin_SJC,Origin_SJU,Origin_SLC,Origin_SMF,Origin_SNA,Origin_SPS,Origin_SRQ,Origin_STL,Origin_STT,Origin_SUN,Origin_SWF,Origin_SYR,Origin_TLH,Origin_TOL,Origin_TPA,Origin_TRI,Origin_TUL,Origin_TUS,Origin_TVC,Origin_TYR,Origin_TYS,Origin_VPS,Origin_XNA,Origin_YKM,Origin_YUM,Dest_ABE,Dest_ABI,Dest_ABQ,Dest_ABY,Dest_ACT,Dest_ACV,Dest_ADQ,Dest_AEX,Dest_AGS,Dest_ALB,Dest_AMA,Dest_ANC,Dest_ASE,Dest_ATL,Dest_ATW,Dest_AUS,Dest_AVL,Dest_AVP,Dest_AZO,Dest_BDL,Dest_BET,Dest_BFL,Dest_BGR,Dest_BHM,Dest_BIL,Dest_BIS,Dest_BMI,Dest_BNA,Dest_BOI,Dest_BOS,Dest_BQK,Dest_BQN,Dest_BRO,Dest_BTR,Dest_BTV,Dest_BUF,Dest_BUR,Dest_BWI,Dest_BZN,Dest_CAE,Dest_CAK,Dest_CHA,Dest_CHS,Dest_CID,Dest_CLD,Dest_CLE,Dest_CLT,Dest_CMH,Dest_CMI,Dest_COS,Dest_CPR,Dest_CRP,Dest_CRW,Dest_CVG,Dest_CWA,Dest_DAL,Dest_DAY,Dest_DCA,Dest_DEN,Dest_DFW,Dest_DHN,Dest_DLH,Dest_DRO,Dest_DSM,Dest_DTW,Dest_EGE,Dest_ELM,Dest_ELP,Dest_EUG,Dest_EVV,Dest_EWR,Dest_FAI,Dest_FAR,Dest_FAT,Dest_FAY,Dest_FCA,Dest_FLL,Dest_FLO,Dest_FNT,Dest_FSD,Dest_FSM,Dest_FWA,Dest_GEG,Dest_GFK,Dest_GJT,Dest_GNV,Dest_GPT,Dest_GRB,Dest_GRK,Dest_GRR,Dest_GSO,Dest_GSP,Dest_GTF,Dest_HDN,Dest_HHH,Dest_HNL,Dest_HOU,Dest_HPN,Dest_HRL,Dest_HSV,Dest_IAD,Dest_IAH,Dest_ICT,Dest_IDA,Dest_ILM,Dest_IND,Dest_IPL,Dest_ISP,Dest_ITO,Dest_JAC,Dest_JAN,Dest_JAX,Dest_JFK,Dest_JNU,Dest_KOA,Dest_KTN,Dest_LAN,Dest_LAS,Dest_LAW,Dest_LAX,Dest_LBB,Dest_LEX,Dest_LFT,Dest_LGA,Dest_LGB,Dest_LIH,Dest_LIT,Dest_LNK,Dest_LRD,Dest_LSE,Dest_MAF,Dest_MBS,Dest_MCI,Dest_MCN,Dest_MCO,Dest_MDT,Dest_MDW,Dest_MEM,Dest_MFE,Dest_MFR,Dest_MGM,Dest_MHT,Dest_MIA,Dest_MKE,Dest_MKG,Dest_MLI,Dest_MLU,Dest_MOB,Dest_MOD,Dest_MOT,Dest_MRY,Dest_MSN,Dest_MSO,Dest_MSP,Dest_MSY,Dest_MYR,Dest_OAJ,Dest_OAK,Dest_OGG,Dest_OKC,Dest_OMA,Dest_ONT,Dest_ORD,Dest_ORF,Dest_OTZ,Dest_OXR,Dest_PBI,Dest_PDX,Dest_PFN,Dest_PHF,Dest_PHL,Dest_PHX,Dest_PIA,Dest_PIH,Dest_PIT,Dest_PMD,Dest_PNS,Dest_PSC,Dest_PSP,Dest_PVD,Dest_PWM,Dest_RAP,Dest_RDM,Dest_RDU,Dest_RFD,Dest_RIC,Dest_RKS,Dest_RNO,Dest_ROA,Dest_ROC,Dest_RST,Dest_RSW,Dest_SAN,Dest_SAT,Dest_SAV,Dest_SBA,Dest_SBN,Dest_SBP,Dest_SCC,Dest_SDF,Dest_SEA,Dest_SFO,Dest_SGF,Dest_SGU,Dest_SHV,Dest_SIT,Dest_SJC,Dest_SJU,Dest_SLC,Dest_SMF,Dest_SMX,Dest_SNA,Dest_SPI,Dest_SRQ,Dest_STL,Dest_STT,Dest_SWF,Dest_SYR,Dest_TLH,Dest_TOL,Dest_TPA,Dest_TRI,Dest_TUL,Dest_TUS,Dest_TVC,Dest_TWF,Dest_TYR,Dest_TYS,Dest_VPS,Dest_WRG,Dest_XNA,Dest_YUM
6050882,1.356782,-1.447351,-1.471391,-1.580854,-1.595727,-1.423156,-1.379646,0.049321,0.608053,1.135037,0.770782,0.689321,-0.782798,-0.48129,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,611,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6337246,1.356782,-0.197505,0.540216,-0.440286,-0.414204,-0.529423,-0.544573,0.015075,-1.044668,-0.993458,-0.964098,-0.967153,-0.174429,-0.658115,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,119,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2321555,-0.702748,0.597851,-1.471391,0.916536,0.982143,0.793038,0.787402,-0.953012,-0.873697,-1.050985,-0.993754,-0.896136,-0.580008,0.756485,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,2663,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6057490,1.356782,-0.879239,1.043118,-1.577366,-1.595727,-1.809011,-1.88345,2.52525,-1.073163,-0.964694,-0.94927,-1.01509,-0.782798,-0.658115,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,1556,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2682393,-0.40853,1.620452,0.540216,-0.621661,-0.593222,-0.470061,-0.554925,-1.128842,1.662374,1.408289,1.452872,1.262784,0.02836,1.64061,0,0,-0.009079,-0.000944,-0.002434,-0.019226,-0.00926,4392,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
y.value_counts()

0    4335
1     665
Name: target, dtype: int64

## Model

In [0]:
def trainNormalModel(cur_model, X_data, y_data):
  X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state = 0)
  
  #We call our estimator instance clf, as it is a classifier.
  clf = cur_model.fit(X_train, y_train)
  
  y_pred = clf.predict(X_test)

  print(classification_report(y_test, y_pred))
  

In [0]:
def modelWithGridSearchCv(cur_model, hyperparameters, X_data, y_data):
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state = 0)
    gridSearchCV_clf = GridSearchCV(estimator=cur_model, param_grid=hyperparameters, cv= 5)
    best_model = gridSearchCV_clf.fit(X_train, y_train)
    print(gridSearchCV_clf.best_params_)

    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))

# 1 LogisticRegression

### 1.1 Normal 

In [61]:
trainNormalModel(LogisticRegression(random_state=0), X, y)

             precision    recall  f1-score   support

          0       0.92      0.99      0.96      1099
          1       0.88      0.38      0.53       151

avg / total       0.92      0.92      0.90      1250



In [62]:
trainNormalModel(LogisticRegression(random_state=0), X_pca, y)

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1099
          1       0.93      0.43      0.59       151

avg / total       0.93      0.93      0.92      1250



In [63]:
trainNormalModel(LogisticRegression(random_state=0), X_SelectKBest, y)

             precision    recall  f1-score   support

          0       0.93      0.99      0.96      1099
          1       0.87      0.45      0.59       151

avg / total       0.92      0.93      0.91      1250



In [0]:
### 1.2 GridsearchCv

In [0]:
# Create logistic regression
logistic = LogisticRegression()

In [0]:
# Create Hyperparameter Search Space
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [66]:
modelWithGridSearchCv(logistic, hyperparameters, X, y)

{'C': 1.0, 'penalty': 'l1'}
             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1099
          1       0.94      0.48      0.64       151

avg / total       0.93      0.93      0.92      1250



In [0]:
modelWithGridSearchCv(logistic, hyperparameters, X_pca, y)

In [0]:
modelWithGridSearchCv(logistic, hyperparameters, X_SelectKBest, y)

# 2 SVM

### 2.1 Normal

In [67]:
trainNormalModel(SVC(), X, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.00      0.00      0.00       151

avg / total       0.77      0.88      0.82      1250



  'precision', 'predicted', average, warn_for)


In [68]:
trainNormalModel(SVC(), X_pca, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.00      0.00      0.00       151

avg / total       0.77      0.88      0.82      1250



  'precision', 'predicted', average, warn_for)


In [69]:
trainNormalModel(SVC(), X_SelectKBest, y)

             precision    recall  f1-score   support

          0       0.93      1.00      0.96      1099
          1       0.99      0.44      0.61       151

avg / total       0.94      0.93      0.92      1250



### 2.2 GridsearchCv

In [0]:
svc = SVC()

In [0]:
hyperparameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}]

In [0]:
modelWithGridSearchCv(svc, hyperparameters, X, y)

In [0]:
modelWithGridSearchCv(svc, hyperparameters, X_pca, y)

In [0]:
modelWithGridSearchCv(svc, hyperparameters, X_SelectKBest, y)

# 3 Random Forest Classifier

### 3.1 Normal

In [0]:
random_forest = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0)

In [71]:
trainNormalModel(random_forest, X, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.00      0.00      0.00       151

avg / total       0.77      0.88      0.82      1250



  'precision', 'predicted', average, warn_for)


In [72]:
trainNormalModel(random_forest, X_pca, y)

             precision    recall  f1-score   support

          0       0.90      1.00      0.95      1099
          1       1.00      0.19      0.31       151

avg / total       0.91      0.90      0.87      1250



In [73]:
trainNormalModel(random_forest, X_SelectKBest, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       1.00      0.03      0.05       151

avg / total       0.90      0.88      0.83      1250



### 3.2 GridsearchCv

In [0]:
hyperparameters = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [0]:
modelWithGridSearchCv(random_forest, hyperparameters, X, y)

In [0]:
modelWithGridSearchCv(random_forest, hyperparameters, X_pca, y)

In [0]:
modelWithGridSearchCv(random_forest, hyperparameters, X_SelectKBest, y)

# 4 Decision Tree

### 4.1 Normal

In [74]:
X.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime',
       'CRSArrTime', 'FlightNum', 'ActualElapsedTime', 'CRSElapsedTime',
       ...
       'Dest_TUL', 'Dest_TUS', 'Dest_TVC', 'Dest_TWF', 'Dest_TYR', 'Dest_TYS',
       'Dest_VPS', 'Dest_WRG', 'Dest_XNA', 'Dest_YUM'],
      dtype='object', length=505)

In [0]:
decision_tree = DecisionTreeClassifier(random_state = 0)

In [76]:
trainNormalModel(decision_tree, X, y)

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1099
          1       0.91      0.90      0.90       151

avg / total       0.98      0.98      0.98      1250



In [77]:
trainNormalModel(decision_tree, X_pca, y)

             precision    recall  f1-score   support

          0       0.97      0.96      0.96      1099
          1       0.72      0.78      0.75       151

avg / total       0.94      0.94      0.94      1250



In [78]:
trainNormalModel(decision_tree, X_SelectKBest, y)

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      1099
          1       0.95      0.95      0.95       151

avg / total       0.99      0.99      0.99      1250



### 4.2 GridsearchCv

In [0]:
hyperparameters = { 
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': [1, 2, 3, 4]
}

In [80]:
modelWithGridSearchCv(decision_tree, hyperparameters, X, y)

{'max_depth': 4, 'max_features': 2}
             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.00      0.00      0.00       151

avg / total       0.77      0.88      0.82      1250



  'precision', 'predicted', average, warn_for)


In [81]:
modelWithGridSearchCv(decision_tree, hyperparameters, X_pca, y)

{'max_depth': 5, 'max_features': 3}
             precision    recall  f1-score   support

          0       0.94      0.96      0.95      1099
          1       0.65      0.59      0.62       151

avg / total       0.91      0.91      0.91      1250



In [82]:
modelWithGridSearchCv(decision_tree, hyperparameters, X_SelectKBest, y)

{'max_depth': 5, 'max_features': 4}
             precision    recall  f1-score   support

          0       0.88      0.99      0.93      1099
          1       0.40      0.05      0.09       151

avg / total       0.83      0.88      0.83      1250



# 5 BernoulliNB

### 5.1 Normal

In [0]:
bernoulliNB = BernoulliNB()

In [84]:
trainNormalModel(bernoulliNB, X, y)

             precision    recall  f1-score   support

          0       0.96      0.97      0.96      1099
          1       0.75      0.69      0.72       151

avg / total       0.93      0.94      0.93      1250



In [85]:
trainNormalModel(bernoulliNB, X_pca, y)

             precision    recall  f1-score   support

          0       0.94      0.91      0.93      1099
          1       0.47      0.58      0.52       151

avg / total       0.88      0.87      0.88      1250



In [86]:
trainNormalModel(bernoulliNB, X_SelectKBest, y)

             precision    recall  f1-score   support

          0       0.96      0.97      0.97      1099
          1       0.75      0.74      0.74       151

avg / total       0.94      0.94      0.94      1250



### 5.2 GridsearchCv

In [0]:
hyperparameters = { 
    'alpha':[1, 10, 100]
}

In [88]:
modelWithGridSearchCv(bernoulliNB, hyperparameters, X, y)

{'alpha': 1}
             precision    recall  f1-score   support

          0       0.96      0.97      0.96      1099
          1       0.75      0.69      0.72       151

avg / total       0.93      0.94      0.93      1250



In [89]:
modelWithGridSearchCv(bernoulliNB, hyperparameters, X_pca, y)

{'alpha': 1}
             precision    recall  f1-score   support

          0       0.94      0.91      0.93      1099
          1       0.47      0.58      0.52       151

avg / total       0.88      0.87      0.88      1250



In [90]:
modelWithGridSearchCv(bernoulliNB, hyperparameters, X_SelectKBest, y)

{'alpha': 1}
             precision    recall  f1-score   support

          0       0.96      0.97      0.97      1099
          1       0.75      0.74      0.74       151

avg / total       0.94      0.94      0.94      1250



In [0]:
# 6 GradientBoostingClassifier

### 6.1 Normal

In [0]:
gradient_boosting = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.25, max_features=2, max_depth = 2, random_state = 0)

In [92]:
trainNormalModel(gradient_boosting, X, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       1.00      0.04      0.08       151

avg / total       0.90      0.88      0.83      1250



In [93]:
trainNormalModel(gradient_boosting, X_pca, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.89      0.05      0.10       151

avg / total       0.89      0.88      0.84      1250



In [94]:
trainNormalModel(gradient_boosting, X_SelectKBest, y)

             precision    recall  f1-score   support

          0       0.88      1.00      0.94      1099
          1       0.00      0.00      0.00       151

avg / total       0.77      0.88      0.82      1250



  'precision', 'predicted', average, warn_for)


### 6.2 GridsearchCv

In [0]:
hyperparameters = {
    'n_estimators':range(20,51,10)
}

In [96]:
modelWithGridSearchCv(gradient_boosting, hyperparameters, X, y)

{'n_estimators': 50}
             precision    recall  f1-score   support

          0       0.89      1.00      0.94      1099
          1       0.93      0.09      0.16       151

avg / total       0.89      0.89      0.85      1250



In [97]:
modelWithGridSearchCv(gradient_boosting, hyperparameters, X_pca, y)

{'n_estimators': 50}
             precision    recall  f1-score   support

          0       0.91      1.00      0.95      1099
          1       0.90      0.31      0.46       151

avg / total       0.91      0.91      0.89      1250



In [98]:
modelWithGridSearchCv(gradient_boosting, hyperparameters, X_SelectKBest, y)

{'n_estimators': 50}
             precision    recall  f1-score   support

          0       0.94      0.99      0.96      1099
          1       0.87      0.56      0.68       151

avg / total       0.93      0.94      0.93      1250



In [0]:
#https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/