In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

df = pd.read_csv('/kaggle/input/2023-us-civil-flights-delay-meteo-and-aircraft/US_flights_2023.csv')

In [2]:
df.head().T

Unnamed: 0,0,1,2,3,4
FlightDate,2023-01-02,2023-01-03,2023-01-04,2023-01-05,2023-01-06
Day_Of_Week,1,2,3,4,5
Airline,Endeavor Air,Endeavor Air,Endeavor Air,Endeavor Air,Endeavor Air
Tail_Number,N605LR,N605LR,N331PQ,N906XJ,N337PQ
Dep_Airport,BDL,BDL,BDL,BDL,BDL
Dep_CityName,"Hartford, CT","Hartford, CT","Hartford, CT","Hartford, CT","Hartford, CT"
DepTime_label,Morning,Morning,Morning,Morning,Morning
Dep_Delay,-3,-5,-5,-6,-1
Dep_Delay_Tag,0,0,0,0,0
Dep_Delay_Type,Low <5min,Low <5min,Low <5min,Low <5min,Low <5min


In [3]:
df.columns = df.columns.str.lower()
df['dep_otp15'] = np.where(df['dep_delay'] <= 15, 1, 0)

In [4]:
num = [x for x in df.columns if df.dtypes[x] in ('int32', 'int64', 'float32', 'float64')]
cat = [x for x in df.columns if df.dtypes[x] == 'object']
target = 'dep_otp15'
num.remove(target)

In [5]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2)

In [6]:
len(df)

6743404

In [7]:
len(train)

5394723

In [8]:
len(test)

1348681

# Data Science

In [9]:
df.groupby('airline')[target].agg(['count', 'mean', 'sum'])

Unnamed: 0_level_0,count,mean,sum
airline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alaska Airlines Inc.,242643,0.830364,201482
Allegiant Air,114425,0.770461,88160
American Airlines Inc.,928058,0.785835,729300
American Eagle Airlines Inc.,224695,0.849832,190953
Delta Air Lines Inc,972931,0.834972,812370
Endeavor Air,196905,0.870547,171415
Frontier Airlines Inc.,173459,0.695404,120624
Hawaiian Airlines Inc.,79842,0.806781,64415
JetBlue Airways,267915,0.693881,185901
PSA Airlines,191072,0.858268,163991


In [10]:
df.groupby('airline')[target].mean().sort_values(ascending=False)

airline
Republic Airways                0.895633
Endeavor Air                    0.870547
Skywest Airlines Inc.           0.861087
PSA Airlines                    0.858268
American Eagle Airlines Inc.    0.849832
Delta Air Lines Inc             0.834972
Alaska Airlines Inc.            0.830364
United Air Lines Inc.           0.806853
Hawaiian Airlines Inc.          0.806781
American Airlines Inc.          0.785835
Allegiant Air                   0.770461
Southwest Airlines Co.          0.764415
Spirit Air Lines                0.713411
Frontier Airlines Inc.          0.695404
JetBlue Airways                 0.693881
Name: dep_otp15, dtype: float64

In [11]:
df[num].corrwith(df[target]).sort_values(ascending=False)

aicraft_age          -0.007045
day_of_week          -0.027177
delay_security       -0.035377
flight_duration      -0.041338
delay_weather        -0.101854
delay_nas            -0.184385
delay_carrier        -0.272477
delay_lastaircraft   -0.370870
arr_delay            -0.522125
dep_delay            -0.534136
dep_delay_tag        -0.635566
dtype: float64

# Machine

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression

num_pip = Pipeline([
    ('impute', SimpleImputer())
    , ('scaler', StandardScaler())
])

cat_pip = Pipeline([
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

pipe = ColumnTransformer([
    ('num', num_pip, ['arr_delay', 'delay_lastaircraft', 'delay_carrier', 'delay_nas', 'delay_weather'])
    , ('cat', cat_pip, ['airline'])
])

pipe.fit(train)
train_pre = pipe.transform(train)
test_pre = pipe.transform(test)

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression

log = LogisticRegression()
log.fit(train_pre, train[target])
preds = log.predict(train_pre)
print(cross_val_score(log, train_pre, train[target], scoring='accuracy').mean())

0.9463687014240053


In [14]:
print(classification_report(test[target], log.predict(test_pre)))

              precision    recall  f1-score   support

           0       0.94      0.78      0.85    265826
           1       0.95      0.99      0.97   1082855

    accuracy                           0.95   1348681
   macro avg       0.94      0.88      0.91   1348681
weighted avg       0.95      0.95      0.94   1348681

