In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# Raise pandas' display limits: up to 100 rows and 200 columns are rendered.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

In [2]:
# Both frames come from the same workbook: `df` is read from the default
# (first) sheet, `delay_codes` from the sheet named 'Delay Group code'.
CASE_DATA_PATH = '../input/case-data/Case data.xlsx'
df = pd.read_excel(CASE_DATA_PATH)
delay_codes = pd.read_excel(CASE_DATA_PATH, sheet_name='Delay Group code')

In [3]:
df.head(1).T

Unnamed: 0,0
schd_dep_date,2021-10-01 00:00:00
flight_number,OP0654
reg,A6-BMF
fleet_family,B787
sub_fleet,B787-10
sched_dep_iata,CAI
dep_country,Egypt
dep_region,Africa
sched_arr_iata,AUH
arr_region,Asia


In [4]:
df.dtypes.value_counts()

object            33
int64             24
float64           16
datetime64[ns]    12
dtype: int64

## Preprocessing

In [5]:
# Discard columns that contain no data at all, then remove
# 'ttl_infants' and 'std_z_year'.
df = (
    df
    .dropna(axis=1, how='all')
    .drop(columns=['ttl_infants', 'std_z_year'])
)

## Fixing Target

In [6]:
## Recompute the departure and arrival delay in minutes (actual minus scheduled).
def _minutes_between(actual, scheduled):
    """Return `actual - scheduled` in minutes, element by element (NaN where either side is missing)."""
    return (actual - scheduled).transform(lambda delta: delta.total_seconds() / 60)

df['ttl_dep_dly_fix'] = _minutes_between(df['atd_z_date_time'], df['std_z_date_time'])
df['ttl_arr_dly_fix'] = _minutes_between(df['ata_z_date_time'], df['sta_z_date_time'])

## Target: 1 (on-time) if the flight departed no more than 15 minutes after schedule, else 0.
## A flight without an actual departure time has a NaN delay. Comparisons with NaN are
## always False, so testing `delay > 15` would label those flights as on-time; testing
## `delay <= 15` labels them 0 instead.
df['otp'] = np.where(df['ttl_dep_dly_fix'] <= 15, 1, 0)

In [7]:
## Flag flights that have an actual departure timestamp (some never departed)
df['departed'] = np.where(df['atd_z_date_time'].notna(), 1, 0)

## Flag passenger flights, i.e. rows whose flight_type equals 'J'
df['passenger'] = df['flight_type'].eq('J').astype('int64')

In [8]:
## Routes: "<scheduled departure IATA>-<scheduled arrival IATA>"
df['route'] = df['sched_dep_iata'] + '-' + df['sched_arr_iata']

## Bucket distance into 5 equal-width bins. The bins are computed from passenger
## flights that departed only; all other rows are left as NaN by index alignment.
df['flight_distance'] = pd.cut(
    df.query('passenger==1 & departed==1')['distance'],
    5, labels=["ultrashort", "short", "medium", "long", "ultralong"])

## unique id: flight number, scheduled departure date and route
df['unique_id'] = (df['flight_number'] + '-' + df['schd_dep_date'].dt.strftime('%Y-%m-%d') + '-' + df['route'])

## Fixing empty seats: only rows whose ttl_seats is missing or 0 are filled, using
## the largest ttl_seats seen for the same sub_fleet. Rows that already carry a
## seat count keep it (overwriting every row with the sub-fleet maximum would
## discard recorded values). Anything still missing afterwards becomes 0.
## The result is assigned back instead of calling fillna(inplace=True) on the
## selected column, which pandas deprecates.
seats = df['ttl_seats']
fleet_max_seats = seats.groupby(df['sub_fleet']).transform('max')
df['ttl_seats'] = seats.where(seats > 0, fleet_max_seats).fillna(0)

In [9]:
target = 'otp'

# Sort the column names into numeric / object / timestamp lists by dtype.
num, cat, tim = [], [], []
for col_name in df.columns:
    col_dtype = df.dtypes[col_name]
    if col_dtype in ('float', 'int'):
        num.append(col_name)
    if col_dtype == 'object':
        cat.append(col_name)
    if col_dtype == 'datetime64[ns]':
        tim.append(col_name)

# The target itself must not be listed among the numeric columns.
num.remove(target)

In [10]:
# Working set: passenger flights that actually departed, re-indexed from 0.
is_departed_passenger = (df['passenger'] == 1) & (df['departed'] == 1)
ps = df.loc[is_departed_passenger].reset_index(drop=True).copy()

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of `ps` as a test set; random_state=0 fixes the shuffle.
# NOTE(review): `train` and `test` are not referenced again in the cells visible here.
train, test = train_test_split(ps, test_size=.2, random_state=0)

# Explore

## Summary Notes
- Currently, OTP for passenger flights is at 71%.
- Out of all 215 flights that experienced a delay, 31 are within our control.
- By focusing on these, we can improve OTP from 71% to 75%.

Notes:
- Data excludes flights that do not have departure data.
- Data only includes Passenger Flights.
- There are some passenger_flights that flew even though ttl_seats=0. I have filled this value referencing the total expected seats of the sub_fleet.
- ttl_dep_dly is wrong. Calculating manually as actual departure minus scheduled departure. Same for arrival.

In [12]:
# Correlation of every numeric column with the target.
# numeric_only=True is passed explicitly: relying on the implicit default emits a
# FutureWarning on pandas 1.5 and raises on pandas >= 2.0 when object columns are present.
corr = ps.corr(numeric_only=True)
corr[target].sort_values(ascending=False)

  corr = ps.corr()


otp                    1.000000
ttl_fd_crew            0.061870
distance               0.044979
f_seats                0.043119
f_pax_rev              0.027675
block_difference       0.027079
dep_fuel               0.013922
mails                 -0.002803
arrival_fuel          -0.006845
ib_connex_pax_total   -0.011973
ttl_ops_crew          -0.016958
ttl_cc_crew           -0.040046
ttl_pax_pad           -0.051233
cargo                 -0.078458
arr_fuel              -0.090981
j_seats               -0.098023
ttl_seats             -0.114550
fuel_remaining        -0.115689
y_seats               -0.122710
lcl_term_pax_ttl      -0.136984
ob_connex_pax_total   -0.155201
j_pax_rev             -0.155850
bags_kgs              -0.214073
lcl_joining_pax_ttl   -0.231287
y_pax_rev             -0.231902
ttl_pax_rev           -0.238825
payload               -0.240344
ttl_arr_dly_fix       -0.592944
ttl_dep_dly_fix       -0.711498
departed                    NaN
passenger                   NaN
Name: ot

## Delay Codes

In [13]:
def stackDelay(frame, label, value):
    """Stack paired delay-code / delay-duration columns into one long table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'unique_id' column plus every column named in
        `label` and `value`.
    label : sequence of str
        Names of the columns holding the delay code of each slot.
    value : sequence of str
        Names of the columns holding the delay duration of the matching
        slot. Every non-missing entry must expose `.hour` and `.minute`
        (e.g. `datetime.time`); seconds are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ['unique_id', 'Code', 'value'] with a fresh integer index.
        'value' is the delay in whole minutes, summed per
        (unique_id, Code) within each slot. Slots are appended one after
        another in the order given; rows inside a slot are sorted by
        unique_id, then Code. Rows with a missing id, code or duration
        are left out.

    Raises
    ------
    ValueError
        If `label` and `value` do not have the same length.
    """
    label, value = list(label), list(value)
    if len(label) != len(value):
        raise ValueError(
            "label and value must have the same length, got %d and %d"
            % (len(label), len(value))
        )

    out_columns = ['unique_id', 'Code', 'value']
    pieces = []
    for code_col, delay_col in zip(label, value):
        piece = pd.DataFrame({
            'unique_id': frame['unique_id'],
            'Code': frame[code_col],
            'value': frame[delay_col],
        }).dropna()
        if piece.empty:
            # Nothing recorded in this slot.
            continue
        # Convert to minutes BEFORE aggregating: time-of-day objects cannot be
        # added to each other, whereas integer minutes can be summed safely
        # when the same flight/code pair occurs more than once.
        piece = piece.assign(
            value=[t.hour * 60 + t.minute for t in piece['value']]
        )
        pieces.append(
            piece.groupby(['unique_id', 'Code'], as_index=False)['value'].sum()
        )

    if not pieces:
        return pd.DataFrame(columns=out_columns)
    return pd.concat(pieces, ignore_index=True)

# Column names of the four delay slots: code, duration and description.
err_code, err_dly, err_desc = (
    ['c1_1', 'c2_1', 'c3_1', 'c4_1'],
    ['dly1', 'dly2', 'dly3', 'dly4'],
    ['c1_desc', 'c2_desc', 'c3_desc', 'c4_desc'],
)
delay_info = [*err_code, *err_dly, *err_desc]

# Long table with one row per flight and delay code, durations in minutes.
flight_dly = stackDelay(ps, err_code, err_dly)

In [14]:
# Attach the delay-code attributes, then total the delay minutes per flight
# under each 'Controllable' value; 'tally_dep_dly' adds the 'N' and 'Y' totals.
flight_dly = (
    flight_dly
    .join(delay_codes.set_index('Code'), on='Code')
    .pivot_table(
        index='unique_id',
        columns='Controllable',
        values='value',
        aggfunc='sum',
        fill_value=0,
    )
    .assign(tally_dep_dly=lambda totals: totals["N"] + totals["Y"])
)

In [15]:
# Bring the per-flight delay totals onto the flight table; flights without
# any matched delay code get 0 minutes in all three columns.
delay_total_cols = ['N', 'Y', 'tally_dep_dly']
ps = ps.join(flight_dly, on='unique_id')
ps[delay_total_cols] = ps[delay_total_cols].fillna(0)

# otp_max: on-time (1) when the 'N' delay minutes alone do not exceed 15.
ps['otp_max'] = np.where(ps['N'] <= 15, 1, 0)

In [16]:
# Cross-tabulate the actual target (otp) against otp_max, which is computed
# from the 'N' delay minutes only.
ps.groupby(['otp', 'otp_max'])['otp'].count()

otp  otp_max
0    0          184
     1           31
1    1          529
Name: otp, dtype: int64

In [17]:
# Share of flights flagged on-time by otp_max (only the 'N' delay minutes counted).
ps['otp_max'].mean()

0.7526881720430108

In [18]:
# Share of flights flagged on-time by the actual target.
ps['otp'].mean()

0.7110215053763441

## Understanding Delay Codes

In [22]:
# Rebuild the long flight/code/minutes table from `ps`; `flight_dly` was
# replaced by its per-flight pivot in an earlier cell.
delc = stackDelay(ps, err_code, err_dly)

In [31]:
# Look up each code's Reason, OTP Working Group and Controllable flag.
code_lookup = delay_codes.set_index('Code')
delc = delc.join(code_lookup, on='Code')

In [34]:
delc.head()

Unnamed: 0,unique_id,Code,value,Reason,OTP Working Group,Controllable
0,OP0005-2021-10-07-AUH-MUC,03A,2,Other - Flight departure within 3 minutes of STD,Others,N
1,OP0005-2021-10-10-AUH-MUC,87B,19,Airport/Govt Authorities - Airport IT system m...,AptAuth,N
2,OP0005-2021-10-27-AUH-MUC,89B,4,Airport/Govt Authorities - Awaiting start-up a...,AptAuth,N
3,OP0005-2021-10-30-AUH-MUC,89B,5,Airport/Govt Authorities - Awaiting start-up a...,AptAuth,N
4,OP0006-2021-10-02-MUC-AUH,03A,3,Other - Flight departure within 3 minutes of STD,Others,N


In [41]:
# Number of recorded delays per code and reason, most frequent first,
# restricted to the Controllable == 'N' group.
reason_counts = delc.groupby(['Controllable','Code', 'Reason'])['value'].count()
reason_counts.sort_values(ascending=False)['N']

Code  Reason                                                                              
93R   Reactionary - Late aircraft rotation due Other                                          101
89B   Airport/Govt Authorities - Awaiting start-up and/or pushback clearance                   94
03A   Other - Flight departure within 3 minutes of STD                                         75
16I   Pax & Bag - Guest boarding                                                               34
16H   Pax & Bag - Guest settling on board                                                      32
86B   Airport/Govt Authorities - CIH authorities                                               29
91R   Reactionary - Awaiting guest and/or baggage from another late inbound Etihad flight      23
40A   Technical - Defects identified during transit (other than cabin/IFE items)               17
16J   Pax & Bag - Special needs handling                                                       14
85B   Airport/Govt Authorit

In [50]:
# Count, total minutes and mean minutes per working group and reason,
# restricted to the Controllable == 'Y' group.
group_stats = (
    delc.groupby(['Controllable','OTP Working Group', 'Reason'])['value']
    .agg(['count', 'sum', 'mean'])
)
group_stats.sort_values(
    by=['Controllable', 'OTP Working Group', 'count'],
    ascending=False,
).loc['Y']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,sum,mean
OTP Working Group,Reason,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Others,Other - No delay code received in NEO,7,136,19.428571
Others,Unassigned,1,14,14.0
Below Wing,Aircraft & Ramp - Errors or late/lack of loading staff,16,244,15.25
Below Wing,Pax & Bag - Late or incorrectly sorted transfer baggage,15,185,12.333333
Below Wing,Aircraft & Ramp - Late or lack of loading equipment,8,198,24.75
Below Wing,Cargo & Mail - Late/Lack of manpower and/or ground service equipment by GHA,6,85,14.166667
Below Wing,Pax & Bag - Late or incorrectly sorted local baggage,5,56,11.2
Below Wing,Aircraft & Ramp - Fuelling/Defuelling,2,34,17.0
Below Wing,Aircraft & Ramp - Late or lack of servicing equipment/staff,1,8,8.0
Below Wing,Aircraft & Ramp - Late or lack of technical ramp equipment/staff,1,8,8.0


In [24]:
delay_codes

Unnamed: 0,Code,Reason,OTP Working Group,Controllable
0,03A,Other - Flight departure within 3 minutes of STD,Others,N
1,04A,Other - No delay code received in NEO,Others,Y
2,04B,Other - TAS/Dispatcher poor coordination,NOC,Y
3,04C,Other - Technical inefficient/lack of communic...,Technical,Y
4,05B,Other - Standby delay code while delay reason ...,Others,N
...,...,...,...,...
161,96O,Reactionary - Abnormal operations,NOC,N
162,96R,Reactionary - Aircraft changes for reasons oth...,NOC,N
163,96T,Reactionary - Non-Mandatory Technical Defect R...,NOC,N
164,99A,Miscellaneous - No suitable code,Others,N


# Submission

In [19]:
# Write the flight table (including the delay totals and otp_max added above)
# to submission.csv in the working directory, without the index column.
ps.to_csv('submission.csv', index=False)