In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# Show up to 100 rows and 200 columns before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 200)

In [2]:
# Both tables come from the same workbook, so the path is spelled out once.
case_data_path = '../input/case-data/Case data.xlsx'

# Default (first) sheet -> flight records; named sheet -> delay-code lookup.
df = pd.read_excel(case_data_path)
delay_codes = pd.read_excel(case_data_path, sheet_name='Delay Group code')

# Replace the lookup sheet's four headers with short, code-friendly names.
delay_codes.columns = ['code', 'reason', 'workgroup', 'controllable']

In [3]:
# First record shown transposed, so every column appears as its own row.
df.iloc[:1].T

Unnamed: 0,0
schd_dep_date,2021-10-01 00:00:00
flight_number,OP0654
reg,A6-BMF
fleet_family,B787
sub_fleet,B787-10
sched_dep_iata,CAI
dep_country,Egypt
dep_region,Africa
sched_arr_iata,AUH
arr_region,Asia


In [4]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

object            33
int64             24
float64           16
datetime64[ns]    12
dtype: int64

## Preprocessing

In [5]:
# Remove columns that contain no data at all, then two named columns.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because 'ttl_infants' / 'std_z_year' are already gone.
df = (
    df.dropna(axis=1, how='all')
      .drop(columns=['ttl_infants', 'std_z_year'], errors='ignore')
)

## Fixing Target

In [6]:
## Departure and arrival delay in minutes, recomputed from the timestamps as
## actual minus scheduled (a negative value means earlier than scheduled).
for fixed_col, actual_col, sched_col in [
    ('ttl_dep_dly_fix', 'atd_z_date_time', 'std_z_date_time'),
    ('ttl_arr_dly_fix', 'ata_z_date_time', 'sta_z_date_time'),
]:
    elapsed = df[actual_col] - df[sched_col]
    df[fixed_col] = elapsed.transform(lambda gap: gap.total_seconds() / 60)

## Target: 1 = on time (departed no more than 15 minutes after schedule), 0 = late.
## NOTE(review): a missing delay compares False against 15, so rows without an
## actual departure time are labelled 1 here; they are only excluded later by
## filtering on `departed`.
departed_late = df['ttl_dep_dly_fix'] > 15
df['otp'] = np.where(departed_late, 0, 1)

In [7]:
## Some flights never departed: flag 1 only when an actual departure time exists.
df['departed'] = np.where(df['atd_z_date_time'].isnull(), 0, 1)

## Passenger flights: flag 1 when flight_type is 'J', otherwise 0.
df['passenger'] = (df['flight_type'] == 'J').astype('int64')

In [8]:
## Routes: "<departure IATA>-<arrival IATA>"
df['route'] = df['sched_dep_iata'] + '-' + df['sched_arr_iata']

## Bucket distances into five equal-width bands. pd.cut with an integer splits
## the observed range evenly, so these are not quantiles.
df['flight_distance'] = pd.cut(
    df['distance'], 5,
    labels=["ultrashort", "short", "medium", "long", "ultralong"])

## Unique id: flight number, scheduled departure date and route.
df['unique_id'] = (df['flight_number'] + '-' +
    df['schd_dep_date'].dt.strftime('%Y-%m-%d') + '-' + df['route'])

## Fixing empty seats
## Fill only missing or zero ttl_seats with the largest value seen for the same
## sub_fleet. The earlier version overwrote every row with the sub-fleet
## maximum, which contradicted the "fill missing" intent.
subfleet_seats = df['ttl_seats'].groupby(df['sub_fleet']).transform('max')
df['ttl_seats'] = df['ttl_seats'].where(df['ttl_seats'] > 0, subfleet_seats)
## Assign the result back: fillna(inplace=True) on a selected column is chained
## assignment, which pandas deprecates and which does nothing under Copy-on-Write.
df['ttl_seats'] = df['ttl_seats'].fillna(0)

In [9]:
target = 'otp'

# Numeric feature columns (any signed/unsigned integer or float width),
# excluding the target. Checking the dtype kind avoids comparing against the
# strings 'int'/'float', which resolve to the platform's default widths and
# can therefore miss columns (and make removing the target fail).
num = [x for x in df.columns if df.dtypes[x].kind in 'iuf' and x != target]
# Object-dtype columns and nanosecond datetime columns.
cat = [x for x in df.columns if df.dtypes[x] == 'object']
tim = [x for x in df.columns if df.dtypes[x] == 'datetime64[ns]']

# Delay information comes as four parallel (code, duration, description) triples.
err_code = ['c1_1', 'c2_1', 'c3_1', 'c4_1']
err_dly = ['dly1', 'dly2', 'dly3', 'dly4']
err_desc = ['c1_desc', 'c2_desc', 'c3_desc', 'c4_desc']
delay_info = err_code + err_dly + err_desc

In [10]:
# Working set: passenger flights that actually departed, re-indexed from 0.
is_flown_passenger = (df['passenger'] == 1) & (df['departed'] == 1)
ps = df[is_flown_passenger].reset_index(drop=True).copy()

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of `ps` as a test set; the fixed seed makes the split repeatable.
train, test = train_test_split(ps, random_state=0, test_size=0.2)

# Explore

## Summary Notes
- Currently, OTP for passenger flights is at 71%.
- Out of all 215 flights that experienced a delay, 31 would have been on time without the delays that are within our control.
- By focusing on these, we can improve OTP from 71% to about 75%.

Notes:
- Include only passenger.
- Include only flights that departed.
- There are some passenger_flights that flew even though ttl_seats=0. I have filled this value referencing the total expected seats of the sub_fleet.
- ttl_dep_dly is wrong. The delay is recalculated manually as actual departure minus scheduled departure; the same is done for arrival.

In [12]:
# Pairwise correlations over numeric/boolean columns only. The columns are
# selected explicitly because DataFrame.corr() stopped silently dropping
# non-numeric columns in pandas 2.0 and raises on the object/datetime columns
# in `ps`; select_dtypes works on old and new pandas alike.
# (timedelta is excluded because 'number' would otherwise match it.)
numeric_ps = ps.select_dtypes(include=['number', 'bool'], exclude=['timedelta'])
corr = numeric_ps.corr()
# Correlation of every numeric column with the target, strongest positive first.
corr[target].sort_values(ascending=False)

otp                    1.000000
ttl_fd_crew            0.061870
distance               0.044979
f_seats                0.043119
f_pax_rev              0.027675
block_difference       0.027079
dep_fuel               0.013922
mails                 -0.002803
arrival_fuel          -0.006845
ib_connex_pax_total   -0.011973
ttl_ops_crew          -0.016958
ttl_cc_crew           -0.040046
ttl_pax_pad           -0.051233
cargo                 -0.078458
arr_fuel              -0.090981
j_seats               -0.098023
ttl_seats             -0.114550
fuel_remaining        -0.115689
y_seats               -0.122710
lcl_term_pax_ttl      -0.136984
ob_connex_pax_total   -0.155201
j_pax_rev             -0.155850
bags_kgs              -0.214073
lcl_joining_pax_ttl   -0.231287
y_pax_rev             -0.231902
ttl_pax_rev           -0.238825
payload               -0.240344
ttl_arr_dly_fix       -0.592944
ttl_dep_dly_fix       -0.711498
departed                    NaN
passenger                   NaN
Name: ot

## Delay Codes

In [13]:
def stackDelay(frame, label, desc, value):
    """Stack parallel delay columns of `frame` into one long table.

    Parameters
    ----------
    frame : DataFrame
        Must contain a 'unique_id' column plus every column named in
        `label`, `desc` and `value`.
    label, desc, value : list of str
        Parallel lists of column names: the i-th delay code column, its
        description column and its duration column.

    Returns
    -------
    DataFrame
        Columns ['unique_id', 'code', 'desc', 'value'], one row per
        non-null delay code. 'value' is the duration in minutes
        (hour * 60 + minute of the original entry; NaN when the duration
        is missing) and missing descriptions are replaced by 'empty'.
    """
    pieces = []
    # Iterate the column-name lists passed in, not a global frame, so the
    # function works for whichever frame the caller supplies.
    for code_col, desc_col, dly_col in zip(label, desc, value):
        piece = frame[['unique_id', code_col, desc_col, dly_col]].copy()
        piece.columns = ['unique_id', 'code', 'desc', 'value']
        pieces.append(piece)
    submit = pd.concat(pieces).dropna(subset=['code'])
    # Durations expose .hour/.minute; convert them to minutes and keep
    # missing durations as NaN instead of raising AttributeError.
    submit['value'] = [
        x.hour * 60 + x.minute if pd.notna(x) else float('nan')
        for x in submit['value']
    ]
    submit['desc'] = submit['desc'].fillna('empty')
    return submit

In [14]:
## Long table of delay entries (one row per recorded code) for the flights in `ps`.
stacked_delays = stackDelay(ps, err_code, err_desc, err_dly)

## Attach each code's definition from the lookup list.
code_lookup = delay_codes.set_index('code')
stacked_delays = stacked_delays.join(code_lookup, on='code')

## Delay minutes per flight, one column per `controllable` value, 0 where none.
flight_dly = stacked_delays.pivot_table(
    index='unique_id',
    columns='controllable',
    values='value',
    aggfunc='sum',
    fill_value=0,
)

## Total coded delay per flight: non-controllable plus controllable minutes.
flight_dly['sum_dep_dly'] = flight_dly["N"].add(flight_dly["Y"])

In [15]:
## Combine with the passenger frame. Any delay columns left over from an
## earlier run of this cell are dropped first, so re-running it does not
## fail on overlapping column names.
delay_cols = list(flight_dly.columns)
ps = ps.drop(columns=delay_cols, errors='ignore').join(flight_dly, on='unique_id')
## Flights without any coded delay have no match in flight_dly: null to zero.
ps[['N', 'Y', 'sum_dep_dly']] = ps[['N', 'Y', 'sum_dep_dly']].fillna(0)
## Best achievable OTP: on time unless the non-controllable ("N") delay
## minutes alone exceed the 15-minute threshold.
ps['otp_max'] = np.where((ps['N'] > 15), 0, 1)
## Label each flight:
##   'potential' - was late, but would be on time without controllable delay
##   'on-time'   - was on time
##   'xdelayed'  - everything else
## (the former on-time test repeated `otp == 1` twice; the duplicate is removed)
ps['otp_label'] = np.select(
    [(ps['otp_max'] == 1) & (ps['otp'] == 0), ps['otp'] == 1],
    ['potential', 'on-time'],
    default='xdelayed')

In [16]:
# Flight counts for each (otp, otp_max) combination.
ps.groupby(['otp', 'otp_max'])['otp'].count()

otp  otp_max
0    0          184
     1           31
1    1          529
Name: otp, dtype: int64

In [17]:
# Number of rows in ps.
len(ps)

744

In [18]:
# Mean of the 0/1 otp_max flag, i.e. the share of flights with otp_max == 1.
ps['otp_max'].mean()

0.7526881720430108

In [19]:
# Mean of the 0/1 otp flag, i.e. the share of flights with otp == 1.
ps['otp'].mean()

0.7110215053763441

## Understanding Delay Codes

In [None]:
# Long table of delay entries, enriched with the code definitions.
delc = stackDelay(ps, err_code, err_desc, err_dly)

delc = delc.join(delay_codes.set_index('code'), on='code')

# Split "<category> - ... - <details>" reasons into their first and last
# parts. The .str accessor propagates NaN, so a code that is missing from the
# lookup no longer raises AttributeError the way x.split() on a float did.
reason_parts = delc['reason'].str.split('-')
delc['r_cat'] = reason_parts.str[0].str.strip()
delc['r_dets'] = reason_parts.str[-1].str.strip()

# Normalise the abbreviated spelling of this category.
delc.loc[delc['r_cat'] == 'Airport/Govt Auth', 'r_cat'] = 'Airport/Govt Authorities'

In [None]:
# Split each delay code into its first two characters and its third character.
delc['code_prm'] = delc['code'].str[0:2]
delc['code_sub'] = delc['code'].str[2:3]

In [None]:
# Preview the first rows of the enriched delay table.
delc.head()

In [None]:
# Number of controllable ("Y") delay entries for every
# (category, workgroup, details, description) combination.
controllable_delays = delc[delc['controllable'] == "Y"]
controllable_delays.groupby(['r_cat', 'workgroup', 'r_dets', 'desc'])[['value']].count()

In [None]:
# Controllable delay counts, sorted by category then count (both descending),
# narrowed to the 'Pax & Bag' category.
sorted_counts = (
    delc[delc['controllable'] == "Y"]
    .groupby(['r_cat', 'workgroup', 'r_dets', 'desc'])[['value']]
    .count()
    .sort_values(by=['r_cat', 'value'], ascending=False)
)
sorted_counts.loc['Pax & Bag']

In [None]:
# Controllable delay counts, sorted by category then count (both descending),
# narrowed to the 'Aircraft & Ramp' category.
sorted_counts = (
    delc[delc['controllable'] == "Y"]
    .groupby(['r_cat', 'workgroup', 'r_dets', 'desc'])[['value']]
    .count()
    .sort_values(by=['r_cat', 'value'], ascending=False)
)
sorted_counts.loc['Aircraft & Ramp']

In [None]:
# Free-text descriptions of controllable delays whose category is "Other".
is_controllable_other = (delc['controllable'] == "Y") & (delc['r_cat'] == "Other")
delc.loc[is_controllable_other, 'desc']

# Submission

In [None]:
# Daily totals per scheduled departure date: flights flown, flights on time,
# and flights that could be on time (otp_max).
daily_totals = ps.groupby('schd_dep_date')[['passenger', 'otp', 'otp_max']].sum()
flights = daily_totals['passenger']
otp = daily_totals['otp']
otpmax = daily_totals['otp_max']

# Running totals, reused for the cumulative rates below.
cum_flights = flights.cumsum()
cum_otp = otp.cumsum()
cum_otpmax = otpmax.cumsum()

daily_summary = pd.DataFrame({
    'ontime': otp,
    # Negative on purpose: minus the number of flights that stay delayed,
    # i.e. -(flights - otp - potential), which simplifies to otpmax - flights.
    'delayed': otpmax - flights,
    'potential': otpmax - otp,
    'ttflights': flights,
    'otp': cum_otp,
    'otp_max': cum_otpmax,
    'cumflights': cum_flights,
    'p_otp': cum_otp / cum_flights,
    'p_otp_max': cum_otpmax / cum_flights,
})
daily_summary.to_csv('cumsum.csv')

In [None]:
# Export the ps frame, without its row index, to submission.csv.
ps.to_csv('submission.csv', index=False)

In [None]:
# Export the delc delay table, without its row index, to delays.csv.
delc.to_csv('delays.csv', index=False)

In [29]:
# Spot check: recomputed delays for every flight of registration A6-BMB.
spot_check_cols = ['route', 'std_z_date', 'atd_z_date', 'flight_type',
                   'ttl_dep_dly_fix', 'ttl_arr_dly_fix']
df.loc[df['reg'] == "A6-BMB", spot_check_cols]

Unnamed: 0,route,std_z_date,atd_z_date,flight_type,ttl_dep_dly_fix,ttl_arr_dly_fix
9,AUH-LHR,2021-10-01,2021-10-01,J,34.0,24.0
15,AMS-AUH,2021-10-01,2021-10-01,J,34.0,28.0
52,LHR-AUH,2021-10-02,2021-10-02,J,35.0,23.0
71,FRA-AUH,2021-10-03,2021-10-03,J,25.0,2.0
86,AUH-FRA,2021-10-03,2021-10-03,J,18.0,11.0
171,LHR-AUH,2021-10-05,2021-10-05,J,28.0,15.0
194,AUH-CDG,2021-10-06,2021-10-06,J,0.0,5.0
209,ICN-AUH,2021-10-07,2021-10-07,J,56.0,65.0
324,FRA-AUH,2021-10-09,2021-10-09,J,14.0,-4.0
343,AUH-FRA,2021-10-09,2021-10-09,J,7.0,3.0
