# C2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

In [4]:
train = pd.read_csv('flight_delays_train.csv')
test = pd.read_csv('flight_delays_test.csv')
all_data = pd.concat([train, test], ignore_index=True)

In [5]:
# change target name to make it easier
train = train.rename(columns={'dep_delayed_15min':'delayed'})
all_data = all_data.rename(columns={'dep_delayed_15min':'delayed'})

In [6]:
# change target to numerical N-->0 & Y-->1
train.loc[(train.delayed == 'N'), 'delayed'] = 0
train.loc[(train.delayed == 'Y'), 'delayed'] = 1
all_data.loc[(all_data.delayed == 'N'), 'delayed'] = 0
all_data.loc[(all_data.delayed == 'Y'), 'delayed'] = 1

In [7]:
train.Month = train.Month.str.slice(start=2).astype(int)
all_data.Month = all_data.Month.str.slice(start=2).astype(int)

train.DayofMonth = train.DayofMonth.str.slice(start=2).astype(int)
all_data.DayofMonth = all_data.DayofMonth.str.slice(start=2).astype(int)

train.DayOfWeek = train.DayOfWeek.str.slice(start=2).astype(int)
all_data.DayOfWeek = all_data.DayOfWeek.str.slice(start=2).astype(int)

## Feature Engineering

In [8]:
all_data['Route'] = all_data['Origin'] + all_data['Dest']
all_data['UniqueCarrier_Origin'] = all_data['UniqueCarrier'] + "_" + all_data['Origin']
all_data['UniqueCarrier_Dest'] = all_data['UniqueCarrier'] + "_" + all_data['Dest']
all_data['is_weekend'] = (all_data['DayOfWeek'] == 6) | (all_data['DayOfWeek'] == 7)

# Hour and minute
all_data['hour'] = all_data['DepTime'] // 100
all_data.loc[all_data['hour'] == 24, 'hour'] = 0
all_data.loc[all_data['hour'] == 25, 'hour'] = 1
all_data['minute'] = all_data['DepTime'] % 100

# give more importance to hour variable
all_data['hour_sq'] = all_data['hour'] ** 2
all_data['hour_sq2'] = all_data['hour'] ** 4

## Binning
### Season

In [9]:
all_data['summer'] = (all_data['Month'].isin([6, 7, 8]))
all_data['autumn'] = (all_data['Month'].isin([9, 10, 11]))
all_data['winter'] = (all_data['Month'].isin([12, 1, 2]))
all_data['spring'] = (all_data['Month'].isin([3, 4, 5]))

### Departure Time

In [10]:
all_data['DayTime'] = 0
all_data.loc[all_data.DepTime <= 600 , 'DepTime_bin'] = 'Night'
all_data.loc[(all_data.DepTime > 600) & (all_data.DepTime <= 1200), 'DepTime_bin'] = 'Morning'
all_data.loc[(all_data.DepTime > 1200) & (all_data.DepTime <= 1800), 'DepTime_bin'] = 'Afternoon'
all_data.loc[(all_data.DepTime > 1800) & (all_data.DepTime <= 2600), 'DepTime_bin'] = 'Evening'

all_data['DepTime_bin'] = 0
all_data.loc[all_data.DepTime <= 600 , 'DepTime_bin'] = 'vem'
all_data.loc[(all_data.DepTime > 600) & (all_data.DepTime <= 900), 'DepTime_bin'] = 'm'
all_data.loc[(all_data.DepTime > 900) & (all_data.DepTime <= 1200), 'DepTime_bin'] = 'mm'
all_data.loc[(all_data.DepTime > 1200) & (all_data.DepTime <= 1500), 'DepTime_bin'] = 'maf'
all_data.loc[(all_data.DepTime > 1500) & (all_data.DepTime <= 1800), 'DepTime_bin'] = 'af'
all_data.loc[(all_data.DepTime > 1800) & (all_data.DepTime <= 2100), 'DepTime_bin'] = 'n'
all_data.loc[(all_data.DepTime > 2100) & (all_data.DepTime <= 2400), 'DepTime_bin'] = 'nn'
all_data.loc[all_data.DepTime > 2400, 'DepTime_bin'] = 'lm'
all_data = all_data.drop(['DepTime'], axis=1)

### Distance

In [11]:
all_data['Dist_bin'] = 0
all_data.loc[all_data.Distance <= 500 , 'Dist_bin'] = 'vshort'
all_data.loc[(all_data.Distance > 500) & (all_data.Distance <= 1000), 'Dist_bin'] = 'short'
all_data.loc[(all_data.Distance > 1000) & (all_data.Distance <= 1500), 'Dist_bin'] = 'mid'
all_data.loc[(all_data.Distance > 1500) & (all_data.Distance <= 2000), 'Dist_bin'] = 'midlong'
all_data.loc[(all_data.Distance > 2000) & (all_data.Distance <= 2500), 'Dist_bin'] = 'long'
all_data.loc[all_data.Distance > 2500, 'Dist_bin'] = 'vlong'
all_data = all_data.drop(['Distance'], axis=1)

## Predictive Modeling

In [12]:
new_train = all_data.iloc[:100000]
new_test = all_data.iloc[100000:]

feature_columns = list(new_train.columns)
feature_columns.remove('delayed')

X = new_train[feature_columns]
y = new_train.delayed

#split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size= 0.2, random_state=1)

In [13]:
all_data.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,delayed,Route,UniqueCarrier_Origin,UniqueCarrier_Dest,...,minute,hour_sq,hour_sq2,summer,autumn,winter,spring,DayTime,DepTime_bin,Dist_bin
0,8,21,7,AA,ATL,DFW,0,ATLDFW,AA_ATL,AA_DFW,...,34,361,130321,True,False,False,False,0,n,short
1,4,20,3,US,PIT,MCO,0,PITMCO,US_PIT,US_MCO,...,48,225,50625,False,False,False,True,0,af,short
2,9,2,5,XE,RDU,CLE,0,RDUCLE,XE_RDU,XE_CLE,...,22,196,38416,False,True,False,False,0,maf,vshort
3,11,25,6,OO,DEN,MEM,0,DENMEM,OO_DEN,OO_MEM,...,15,100,10000,False,True,False,False,0,mm,short
4,10,7,6,WN,MDW,OMA,1,MDWOMA,WN_MDW,WN_OMA,...,28,324,104976,False,True,False,False,0,n,vshort


In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80000 entries, 78689 to 98539
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 80000 non-null  int32 
 1   DayofMonth            80000 non-null  int32 
 2   DayOfWeek             80000 non-null  int32 
 3   UniqueCarrier         80000 non-null  object
 4   Origin                80000 non-null  object
 5   Dest                  80000 non-null  object
 6   Route                 80000 non-null  object
 7   UniqueCarrier_Origin  80000 non-null  object
 8   UniqueCarrier_Dest    80000 non-null  object
 9   is_weekend            80000 non-null  bool  
 10  hour                  80000 non-null  int64 
 11  minute                80000 non-null  int64 
 12  hour_sq               80000 non-null  int64 
 13  hour_sq2              80000 non-null  int64 
 14  summer                80000 non-null  bool  
 15  autumn                80000 non-null 

### Catboost

In [21]:
model_ctb = CatBoostClassifier(iterations=3000, loss_function='Logloss',
                               l2_leaf_reg=0.8, od_type='Iter',
                               random_seed=17, silent=True)
#model_ctb = GridSearchCV(model_ctb, {'learning_rate':[0.5, 0.1], 'n_estimators':[500, 1000]})
model_ctb.fit(X_train, y_train.astype(int), cat_features=[3, 4, 5, 6, 7, 8, 9, 14, 15, 16, 17, 19, 20])
predictions = model_ctb.predict_proba(X_val)[:, 1]
accuracy = roc_auc_score(y_val.astype(int), predictions)
print('Accuracy Catboost: ', accuracy)

Accuracy Catboost:  0.7775538008505711


## Results

In [22]:
model_ctb.fit(X, y.astype(int), cat_features=[3, 4, 5, 6, 7, 8, 9, 14, 15, 16, 17, 19, 20])
predictions = model_ctb.predict_proba(new_test[feature_columns])[:, 1]

In [23]:
submission = pd.DataFrame({'id':range(100000),'dep_delayed_15min':predictions})
submission.to_csv('delay.csv', index=False)