## **Multiclass Classification**: If the plane was delayed, predict what type of delay it is (will be).


In [2]:
import os
import psycopg2
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np 

In [3]:
# Getting the sample data 10% from the extracted flights_delay data
df=pd.read_csv('flights_with_delay.csv', skiprows=lambda i: i % 10 !=0)

In [4]:
df['fl_date']=pd.to_datetime(df['fl_date'])
df['fl_year']=df['fl_date'].dt.year 
df['fl_month']=df['fl_date'].dt.month 
df['fl_day_of_week']=df['fl_date'].dt.weekday
df['dep_hour']=(df['crs_dep_time']/100).astype(int)
df['arr_hour']=(df['crs_arr_time']/100).astype(int)

In [5]:
df.shape

(300948, 48)

In [6]:
cols=['mkt_unique_carrier','mkt_carrier_fl_num','origin_airport_id', 'origin_city_name',
       'dest_airport_id', 'dest_city_name', 'crs_dep_time', 'dep_delay','crs_arr_time',  'arr_delay', 'crs_elapsed_time',
       'distance', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'fl_month', 'fl_day_of_week',
       'dep_hour', 'arr_hour']

In [7]:
df=df[cols]

In [8]:
df1=df.copy()

In [9]:
df1.dropna(axis=0, inplace=True)

In [10]:
df1.shape

(300919, 21)

In [11]:
# statistics of  airport average monthly average_delay time 
dep_airport=df.groupby(['origin_airport_id', 'fl_month', 'fl_day_of_week'])['arr_delay'].mean().reset_index()

In [12]:
data=pd.merge(df1,dep_airport,how='outer', on=['origin_airport_id','fl_month','fl_day_of_week'])

In [13]:
data.rename(columns=({'arr_delay_y':'airport_arr_delay'}), inplace=True)

In [14]:
# average arr delay by airport, month, day of week
airport_dest=df1.groupby(['dest_airport_id', 'fl_month', 'fl_day_of_week'])['arr_delay'].mean().reset_index()
data=pd.merge(data, airport_dest, how='outer', on=['dest_airport_id','fl_month','fl_day_of_week'])


In [15]:
data.rename(columns={'arr_delay':'airport_arr_delay'}, inplace=True)

In [16]:
# explore carrier distance delay
carrier=df1.groupby(['mkt_unique_carrier'])['arr_delay'].mean().reset_index()

In [17]:
# merge carrier delay data into the data table
data=pd.merge(data, carrier,how='outer', on=['mkt_unique_carrier'])


In [18]:
data.rename(columns={'arr_delay':'carrier_arr_delay'}, inplace=True)

In [19]:
# departure delay based on the time of the day
hour=df1.groupby(['dep_hour'])['dep_delay'].mean().reset_index()

In [20]:
# merge data with the hour for the arrive_delay due to the hour
data=pd.merge(data,hour,how='outer', on=['dep_hour'])

In [21]:
data.rename(columns={'dep_delay':'hour_dep_delay'}, inplace=True)

In [22]:
# explore arrive hour delay time
arrhour=df1.groupby(['arr_hour'])['arr_delay'].mean().reset_index()

In [23]:
# merge arrivehour into the data
data=pd.merge(data,arrhour,how='outer', on='arr_hour')  

In [24]:
data.rename(columns={'arr_delay':'arr_hour_delay'}, inplace=True)

In [25]:
dw_ori=pd.read_csv('weather_delay_factor.csv')
dw_arr=pd.read_csv('weather_arr_delay.csv')

In [26]:
data=pd.merge(data,dw_ori,on=['origin_city_name', 'fl_month'], how='outer')
data.rename(columns={'weather_delay_y':'wea_delay_ori', 'weather_delay_x': 'weather_delay'}, inplace=True)


In [27]:
data.dropna(axis=0,inplace=True)

In [28]:
data.shape

(299535, 28)

In [29]:
data.head()

Unnamed: 0.1,mkt_unique_carrier,mkt_carrier_fl_num,origin_airport_id,origin_city_name,dest_airport_id,dest_city_name,crs_dep_time,dep_delay_x,crs_arr_time,arr_delay_x,...,fl_day_of_week,dep_hour,arr_hour,airport_arr_delay,airport_arr_delay.1,carrier_arr_delay,dep_delay_y,arr_hour_delay,Unnamed: 0,wea_delay_ori
0,AS,2564.0,14771.0,"San Francisco, CA",14908.0,"Santa Ana, CA",1530.0,-4.0,1705.0,17.0,...,0.0,15.0,17.0,64.598291,39.375,49.500177,62.371223,67.489052,3034.0,66.945455
1,AS,1753.0,14771.0,"San Francisco, CA",14747.0,"Seattle, WA",1550.0,23.0,1755.0,15.0,...,1.0,15.0,17.0,41.180328,36.384615,49.500177,62.371223,67.489052,3034.0,66.945455
2,UA,539.0,14771.0,"San Francisco, CA",13830.0,"Kahului, HI",1500.0,67.0,1737.0,48.0,...,3.0,15.0,17.0,52.010309,149.0,78.913769,62.371223,67.489052,3034.0,66.945455
3,UA,5949.0,14771.0,"San Francisco, CA",10157.0,"Arcata/Eureka, CA",1555.0,50.0,1717.0,54.0,...,4.0,15.0,17.0,74.761905,47.0,78.913769,62.371223,67.489052,3034.0,66.945455
4,UA,5555.0,14771.0,"San Francisco, CA",10157.0,"Arcata/Eureka, CA",1555.0,38.0,1714.0,29.0,...,5.0,15.0,17.0,58.843137,29.0,78.913769,62.371223,67.489052,3034.0,66.945455


In [30]:
data.to_csv('data.csv')

# modeling process

In [31]:
# use this data for modeling
DATA=data.copy()

In [32]:
cols=['mkt_unique_carrier', 'mkt_carrier_fl_num',
       'origin_airport_id', 'dest_airport_id', 'fl_month', 'fl_day_of_week', 'dep_hour', 'arr_hour',
       'airport_arr_delay','carrier_arr_delay', 
     'crs_elapsed_time', 'arr_hour_delay', 'wea_delay_ori','delay_type', 'carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay']

In [33]:
delays=['carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay']

In [37]:
DATA.head()

Unnamed: 0.1,mkt_unique_carrier,mkt_carrier_fl_num,origin_airport_id,origin_city_name,dest_airport_id,dest_city_name,crs_dep_time,dep_delay_x,crs_arr_time,arr_delay_x,...,fl_day_of_week,dep_hour,arr_hour,airport_arr_delay,airport_arr_delay.1,carrier_arr_delay,dep_delay_y,arr_hour_delay,Unnamed: 0,wea_delay_ori
0,AS,2564.0,14771.0,"San Francisco, CA",14908.0,"Santa Ana, CA",1530.0,-4.0,1705.0,17.0,...,0.0,15.0,17.0,64.598291,39.375,49.500177,62.371223,67.489052,3034.0,66.945455
1,AS,1753.0,14771.0,"San Francisco, CA",14747.0,"Seattle, WA",1550.0,23.0,1755.0,15.0,...,1.0,15.0,17.0,41.180328,36.384615,49.500177,62.371223,67.489052,3034.0,66.945455
2,UA,539.0,14771.0,"San Francisco, CA",13830.0,"Kahului, HI",1500.0,67.0,1737.0,48.0,...,3.0,15.0,17.0,52.010309,149.0,78.913769,62.371223,67.489052,3034.0,66.945455
3,UA,5949.0,14771.0,"San Francisco, CA",10157.0,"Arcata/Eureka, CA",1555.0,50.0,1717.0,54.0,...,4.0,15.0,17.0,74.761905,47.0,78.913769,62.371223,67.489052,3034.0,66.945455
4,UA,5555.0,14771.0,"San Francisco, CA",10157.0,"Arcata/Eureka, CA",1555.0,38.0,1714.0,29.0,...,5.0,15.0,17.0,58.843137,29.0,78.913769,62.371223,67.489052,3034.0,66.945455


In [34]:
DATA['delay_type']=DATA[delays].idxmax(axis=1)

In [35]:
feature_cols=['mkt_unique_carrier', 'mkt_carrier_fl_num',
       'origin_airport_id', 'dest_airport_id', 'fl_month', 'fl_day_of_week', 'dep_hour', 'arr_hour',
       'airport_arr_delay','carrier_arr_delay', 
     'crs_elapsed_time', 'arr_hour_delay', 'wea_delay_ori','carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay', 'delay_type']

In [36]:
DATA=DATA[feature_cols]

In [41]:
DATA.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,fl_month,fl_day_of_week,dep_hour,arr_hour,airport_arr_delay,airport_arr_delay.1,carrier_arr_delay,crs_elapsed_time,arr_hour_delay,wea_delay_ori,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delay_type
0,AS,2564.0,14771.0,14908.0,4,0.0,15.0,17.0,64.598291,39.375,49.500177,95.0,67.489052,66.945455,0.0,0.0,17.0,0.0,0.0,nas_delay
1,AS,1753.0,14771.0,14747.0,4,1.0,15.0,17.0,41.180328,36.384615,49.500177,125.0,67.489052,66.945455,0.0,0.0,0.0,0.0,15.0,late_aircraft_delay
2,UA,539.0,14771.0,13830.0,4,3.0,15.0,17.0,52.010309,149.0,78.913769,337.0,67.489052,66.945455,8.0,0.0,0.0,0.0,40.0,late_aircraft_delay
3,UA,5949.0,14771.0,10157.0,4,4.0,15.0,17.0,74.761905,47.0,78.913769,82.0,67.489052,66.945455,0.0,0.0,4.0,0.0,50.0,late_aircraft_delay
4,UA,5555.0,14771.0,10157.0,4,5.0,15.0,17.0,58.843137,29.0,78.913769,79.0,67.489052,66.945455,29.0,0.0,0.0,0.0,0.0,carrier_delay


In [37]:
df=DATA.copy()

In [38]:
col_for_label_encoder=['mkt_unique_carrier','mkt_carrier_fl_num',
 'origin_airport_id',
 'dest_airport_id',
 'fl_month',
 'fl_day_of_week',
 'dep_hour',
 'arr_hour']


In [39]:
# label encode the categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from imblearn.over_sampling import SMOTE
encoder=LabelEncoder()


In [40]:
df[col_for_label_encoder]=df[col_for_label_encoder].apply(encoder.fit_transform)

In [133]:
df.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,fl_month,fl_day_of_week,dep_hour,arr_hour,airport_arr_delay,airport_arr_delay.1,carrier_arr_delay,crs_elapsed_time,arr_hour_delay,wea_delay_ori,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delay_type
0,1,2561,317,335,3,0,15,17,64.598291,39.375,49.500177,95.0,67.489052,66.945455,0.0,0.0,17.0,0.0,0.0,nas_delay
1,1,1750,317,320,3,1,15,17,41.180328,36.384615,49.500177,125.0,67.489052,66.945455,0.0,0.0,0.0,0.0,15.0,late_aircraft_delay
2,8,538,317,251,3,3,15,17,52.010309,149.0,78.913769,337.0,67.489052,66.945455,8.0,0.0,0.0,0.0,40.0,late_aircraft_delay
3,8,5945,317,7,3,4,15,17,74.761905,47.0,78.913769,82.0,67.489052,66.945455,0.0,0.0,4.0,0.0,50.0,late_aircraft_delay
4,8,5551,317,7,3,5,15,17,58.843137,29.0,78.913769,79.0,67.489052,66.945455,29.0,0.0,0.0,0.0,0.0,carrier_delay


In [41]:
y=df['delay_type']

In [47]:
y.describe()

count                  299535
unique                      5
top       late_aircraft_delay
freq                   117587
Name: delay_type, dtype: object

In [48]:
df.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,fl_month,fl_day_of_week,dep_hour,arr_hour,airport_arr_delay,airport_arr_delay.1,carrier_arr_delay,crs_elapsed_time,arr_hour_delay,wea_delay_ori,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delay_type
0,1,2561,317,335,3,0,15,17,64.598291,39.375,49.500177,95.0,67.489052,66.945455,0.0,0.0,17.0,0.0,0.0,nas_delay
1,1,1750,317,320,3,1,15,17,41.180328,36.384615,49.500177,125.0,67.489052,66.945455,0.0,0.0,0.0,0.0,15.0,late_aircraft_delay
2,8,538,317,251,3,3,15,17,52.010309,149.0,78.913769,337.0,67.489052,66.945455,8.0,0.0,0.0,0.0,40.0,late_aircraft_delay
3,8,5945,317,7,3,4,15,17,74.761905,47.0,78.913769,82.0,67.489052,66.945455,0.0,0.0,4.0,0.0,50.0,late_aircraft_delay
4,8,5551,317,7,3,5,15,17,58.843137,29.0,78.913769,79.0,67.489052,66.945455,29.0,0.0,0.0,0.0,0.0,carrier_delay


In [42]:
df=pd.get_dummies(df, columns=['delay_type'])

## assign X and y

In [50]:
df.drop(columns=delays, inplace=True)

In [51]:
X=df.iloc[:, :-5].values
y=df.iloc[:,-5:].values

### decision Tree --without scaler

In [53]:

X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.3)
smote=SMOTE()
X_resample,y_resmaple=smote.fit_sample(X_train, y_train)
regressor=DecisionTreeClassifier(random_state=0)

# without resmapling
regressor.fit(X_train,y_train)
y_tree=regressor.predict(X_test)
print('The decision tree classification_report is : ', metrics.classification_report(y_test,y_tree))

The decision tree classification_report is :                precision    recall  f1-score   support

           0       0.33      0.33      0.33     24026
           1       0.48      0.47      0.47     35301
           2       0.39      0.40      0.39     27362
           3       0.00      0.00      0.00       163
           4       0.07      0.07      0.07      3009

   micro avg       0.40      0.39      0.40     89861
   macro avg       0.26      0.25      0.25     89861
weighted avg       0.40      0.39      0.40     89861
 samples avg       0.39      0.39      0.39     89861



In [54]:
# with the rebalanced data: ---slightly increase the security delay and weather delay precision score and recall score
regressor.fit(X_resample,y_resmaple)
y_tree=regressor.predict(X_test)
print('The decision tree classification_report is : ', metrics.classification_report(y_test,y_tree))

The decision tree classification_report is :                precision    recall  f1-score   support

           0       0.33      0.33      0.33     24026
           1       0.48      0.45      0.46     35301
           2       0.39      0.39      0.39     27362
           3       0.02      0.03      0.02       163
           4       0.07      0.10      0.08      3009

   micro avg       0.39      0.39      0.39     89861
   macro avg       0.26      0.26      0.26     89861
weighted avg       0.40      0.39      0.39     89861
 samples avg       0.39      0.39      0.39     89861



## Random Forest Regression

In [148]:
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [58]:
# split and train data and without the over_sampling
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.35)
clf=RandomForestClassifier(n_estimators=10)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.43      0.14      0.21     28094
           1       0.54      0.35      0.43     41253
           2       0.52      0.24      0.33     31929
           3       0.00      0.00      0.00       168
           4       0.13      0.01      0.01      3394

   micro avg       0.51      0.25      0.34    104838
   macro avg       0.32      0.15      0.20    104838
weighted avg       0.49      0.25      0.33    104838
 samples avg       0.25      0.25      0.25    104838



In [57]:
# with the over-sampling
over_sample=SMOTE()
clf=RandomForestClassifier(n_estimators=10)
X_sample, y_sample=over_sample.fit_sample(X_train, y_train)
clf.fit(X_sample,y_sample)
y_p=clf.predict(X_test)
print(metrics.classification_report(y_test,y_p))


              precision    recall  f1-score   support

           0       0.41      0.14      0.21     24026
           1       0.54      0.31      0.39     35301
           2       0.51      0.23      0.31     27362
           3       0.20      0.01      0.02       163
           4       0.11      0.01      0.02      3009

   micro avg       0.50      0.23      0.31     89861
   macro avg       0.36      0.14      0.19     89861
weighted avg       0.48      0.23      0.31     89861
 samples avg       0.23      0.23      0.23     89861



## KNN

In [59]:
# KNN modeling without oversampling
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train,y_train)
y_knn=knn.predict(X_test)
print(metrics.classification_report(y_test, y_knn))


              precision    recall  f1-score   support

           0       0.38      0.11      0.17     28094
           1       0.50      0.25      0.34     41253
           2       0.47      0.20      0.28     31929
           3       0.00      0.00      0.00       168
           4       0.16      0.00      0.01      3394

   micro avg       0.46      0.19      0.27    104838
   macro avg       0.30      0.11      0.16    104838
weighted avg       0.44      0.19      0.26    104838
 samples avg       0.19      0.19      0.19    104838



In [154]:
## with the oversampling data
knn.fit(X_resample,y_resmaple)
y_knn=knn.predict(X_test)
print(metrics.classification_report(y_test, y_knn))



              precision    recall  f1-score   support

           0       0.38      0.11      0.17     63962
           1       0.51      0.28      0.36     93996
           2       0.46      0.17      0.25     73254
           3       0.00      0.00      0.00       419
           4       0.14      0.00      0.01      7997

   micro avg       0.47      0.19      0.27    239628
   macro avg       0.30      0.11      0.16    239628
weighted avg       0.45      0.19      0.26    239628
 samples avg       0.19      0.19      0.19    239628




## OneVsRestClassifier

In [60]:
# without over-sampling
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier 
from sklearn.preprocessing import MultiLabelBinarizer
clf = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4))
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
clf.fit(X_train, y_train)
y_=clf.predict(X_test)
print(metrics.classification_report(y_test,y_))

              precision    recall  f1-score   support

           0       0.57      0.06      0.11     28094
           1       0.59      0.41      0.49     41253
           2       0.62      0.23      0.33     31929
           3       0.00      0.00      0.00       168
           4       0.32      0.00      0.00      3394

   micro avg       0.60      0.25      0.35    104838
   macro avg       0.42      0.14      0.19    104838
weighted avg       0.58      0.25      0.32    104838
 samples avg       0.25      0.25      0.25    104838



In [157]:
# With the over sampling 
clf.fit(X_resample, y_resmaple)
y_=clf.predict(X_test)
print(metrics.classification_report(y_test,y_))


classficaion_report               precision    recall  f1-score   support

           0       0.52      0.07      0.12     63962
           1       0.57      0.41      0.48     93996
           2       0.61      0.21      0.31     73254
           3       0.00      0.00      0.00       419
           4       0.26      0.00      0.01      7997

   micro avg       0.58      0.24      0.34    239628
   macro avg       0.39      0.14      0.18    239628
weighted avg       0.56      0.24      0.32    239628
 samples avg       0.24      0.24      0.24    239628



## Use Target Variable as lableencoder for classification modeling

In [43]:
df3=DATA.copy()
df3.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,fl_month,fl_day_of_week,dep_hour,arr_hour,airport_arr_delay,airport_arr_delay.1,carrier_arr_delay,crs_elapsed_time,arr_hour_delay,wea_delay_ori,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delay_type
0,AS,2564.0,14771.0,14908.0,4,0.0,15.0,17.0,64.598291,39.375,49.500177,95.0,67.489052,66.945455,0.0,0.0,17.0,0.0,0.0,nas_delay
1,AS,1753.0,14771.0,14747.0,4,1.0,15.0,17.0,41.180328,36.384615,49.500177,125.0,67.489052,66.945455,0.0,0.0,0.0,0.0,15.0,late_aircraft_delay
2,UA,539.0,14771.0,13830.0,4,3.0,15.0,17.0,52.010309,149.0,78.913769,337.0,67.489052,66.945455,8.0,0.0,0.0,0.0,40.0,late_aircraft_delay
3,UA,5949.0,14771.0,10157.0,4,4.0,15.0,17.0,74.761905,47.0,78.913769,82.0,67.489052,66.945455,0.0,0.0,4.0,0.0,50.0,late_aircraft_delay
4,UA,5555.0,14771.0,10157.0,4,5.0,15.0,17.0,58.843137,29.0,78.913769,79.0,67.489052,66.945455,29.0,0.0,0.0,0.0,0.0,carrier_delay


In [44]:
df3['delay_type']=encoder.fit_transform(df3['delay_type'])
df3['delay_type'].value_counts()

1    117587
2     91204
0     80069
4     10156
3       519
Name: delay_type, dtype: int64

In [45]:
df3.columns

Index(['mkt_unique_carrier', 'mkt_carrier_fl_num', 'origin_airport_id',
       'dest_airport_id', 'fl_month', 'fl_day_of_week', 'dep_hour', 'arr_hour',
       'airport_arr_delay', 'airport_arr_delay', 'carrier_arr_delay',
       'crs_elapsed_time', 'arr_hour_delay', 'wea_delay_ori', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'delay_type'],
      dtype='object')

In [46]:
encoder_columns=['mkt_unique_carrier', 'mkt_carrier_fl_num', 'origin_airport_id',
       'dest_airport_id', 'fl_month', 'fl_day_of_week', 'dep_hour',
       'arr_hour']

In [47]:
df3[encoder_columns]=df3[encoder_columns].apply(encoder.fit_transform)

In [48]:
df3=df3[['mkt_unique_carrier', 'mkt_carrier_fl_num', 'origin_airport_id',
       'dest_airport_id', 'fl_month', 'fl_day_of_week', 'dep_hour',
       'arr_hour', 
       'wea_delay_ori',
        'delay_type']]

In [49]:
X=df3.iloc[:,:-1].values
y=df3['delay_type'].values

In [76]:
y

array([2, 1, 1, ..., 2, 2, 1])

In [51]:
smote=SMOTE()

In [52]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=0)
X_sam,y_sam=smote.fit_sample(X_train, y_train)


In [80]:
from sklearn.tree import DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier()
dtree_model.fit(X_train, y_train)
y_tree = dtree_model.predict(X_test) 
print(metrics.classification_report(y_test,y_tree))

## applying over_sample
dtree_model.fit(X_sam, y_sam)
y_tree2 = dtree_model.predict(X_test) 
print(metrics.classification_report(y_test,y_tree2))


              precision    recall  f1-score   support

           0       0.34      0.35      0.34     23922
           1       0.48      0.48      0.48     35467
           2       0.40      0.39      0.40     27349
           3       0.01      0.01      0.01       165
           4       0.07      0.08      0.07      2958

    accuracy                           0.40     89861
   macro avg       0.26      0.26      0.26     89861
weighted avg       0.41      0.40      0.40     89861

              precision    recall  f1-score   support

           0       0.33      0.35      0.34     23922
           1       0.48      0.46      0.47     35467
           2       0.40      0.39      0.40     27349
           3       0.01      0.01      0.01       165
           4       0.06      0.09      0.07      2958

    accuracy                           0.40     89861
   macro avg       0.26      0.26      0.26     89861
weighted avg       0.40      0.40      0.40     89861



## try SVM

In [81]:
scaler=StandardScaler()
X=scaler.fit_transform(X)
X

array([[-0.90077706, -0.11096779,  1.41444839, ...,  0.14121352,
         0.23529144,  0.09201441],
       [-0.90077706, -0.54457662,  1.41444839, ...,  0.14121352,
         0.23529144,  0.09201441],
       [ 0.92986928, -1.19258389,  1.41444839, ...,  0.14121352,
         0.23529144,  0.09201441],
       ...,
       [-0.90077706, -1.44601248,  0.08530788, ..., -1.16368826,
        -1.31385054, -1.30164137],
       [ 0.92986928, -1.41821018,  1.56781076, ..., -1.16368826,
        -1.31385054, -1.36101477],
       [ 0.92986928, -1.41821018,  1.56781076, ..., -1.16368826,
        -1.31385054, -1.36101477]])

In [None]:
from sklearn.svm import SVC 
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train) 
y_svm = svm_model_linear.predict(X_test) 
report =metrics.classification_report(y_test, y_svm)
print(report)



In [None]:
## applying over_sampling
from imblearn.over_sampling import SVMSMOTE
svmsmote=SVMSMOTE()
X_sam, y_sam=svmsmote.fit_sample(X_train, y_train)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_sam, y_sam) 
svm_predictions = svm_model_linear.predict(X_test) 
report =metrics.classification_report(y_test, svm_predictions) 
print(cm1)

## KNN

In [59]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors = 7).fit(X_train, y_train) 
 
# creating a confusion matrix 
knn_predictions = knn.predict(X_test)  
cm = metrics.classification_report(y_test, knn_predictions) 
print(cm)


              precision    recall  f1-score   support

           0       0.33      0.34      0.34     23922
           1       0.46      0.54      0.49     35467
           2       0.41      0.35      0.37     27349
           3       0.25      0.01      0.01       165
           4       0.10      0.01      0.01      2958

    accuracy                           0.41     89861
   macro avg       0.31      0.25      0.25     89861
weighted avg       0.40      0.41      0.40     89861



###  try Naive Bayes Classifier


In [58]:
from sklearn.naive_bayes import GaussianNB 
gnb = GaussianNB().fit(X_train, y_train) 
gnb_predictions = gnb.predict(X_test) 
cm = metrics.classification_report(y_test, gnb_predictions) 
print(cm)

# applying over_sampling
gnb = GaussianNB().fit(X_sam, y_sam) 
gnb_predictions = gnb.predict(X_test) 
cm1 = metrics.classification_report(y_test, gnb_predictions) 
print(cm1)

              precision    recall  f1-score   support

           0       0.37      0.17      0.23     23922
           1       0.46      0.73      0.57     35467
           2       0.38      0.32      0.35     27349
           3       0.00      0.00      0.00       165
           4       0.00      0.00      0.00      2958

    accuracy                           0.43     89861
   macro avg       0.24      0.24      0.23     89861
weighted avg       0.40      0.43      0.39     89861

              precision    recall  f1-score   support

           0       0.36      0.09      0.15     23922
           1       0.49      0.45      0.47     35467
           2       0.38      0.11      0.17     27349
           3       0.00      0.44      0.01       165
           4       0.04      0.26      0.07      2958

    accuracy                           0.25     89861
   macro avg       0.25      0.27      0.17     89861
weighted avg       0.40      0.25      0.28     89861



##  XGBoost

In [53]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier 
clf = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4))
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred))




              precision    recall  f1-score   support

           0       0.46      0.23      0.31     23922
           1       0.50      0.78      0.61     35467
           2       0.52      0.42      0.47     27349
           3       0.50      0.01      0.01       165
           4       0.42      0.00      0.00      2958

    accuracy                           0.50     89861
   macro avg       0.48      0.29      0.28     89861
weighted avg       0.49      0.50      0.46     89861



In [56]:
print(metrics.confusion_matrix(y_test,y_pred))

[[ 5569 13381  4968     0     4]
 [ 2853 27682  4929     1     2]
 [ 3207 12600 11541     0     1]
 [   54    73    37     1     0]
 [  502  1652   799     0     5]]


In [55]:
# Applying the over_sampling
clf.fit(X_sam, y_sam)
y_psam=clf.predict(X_test)
print(metrics.classification_report(y_test, y_psam))


              precision    recall  f1-score   support

           0       0.45      0.23      0.31     23922
           1       0.50      0.76      0.60     35467
           2       0.51      0.43      0.46     27349
           3       0.02      0.02      0.02       165
           4       0.21      0.01      0.02      2958

    accuracy                           0.49     89861
   macro avg       0.34      0.29      0.28     89861
weighted avg       0.48      0.49      0.46     89861



## in summary: XGBoost combined with onevsrest classifer give the best prediction results. For 5 differenty types of delay, the average precision score is over 50%. Given the extreamly low percentage of the security delay and weather delay, it has achieved relatively high prediction results. In multi_class classification, the over-sampling did not increase the score significently. 