In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
## Raise the display limits so long and wide frames are not truncated in cell output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

In [2]:
## Both frames come from the same workbook, so open it once and read each sheet from
## the shared handle instead of parsing the file twice.
CASE_DATA_PATH = '../input/case-data/Case data.xlsx'
with pd.ExcelFile(CASE_DATA_PATH) as case_workbook:
    df = pd.read_excel(case_workbook)  ## default: first sheet of the workbook
    delay_codes = pd.read_excel(case_workbook, sheet_name='Delay Group code')

In [3]:
## Show the first record transposed, so each column appears as its own row.
df.iloc[:1].T

Unnamed: 0,0
schd_dep_date,2021-10-01 00:00:00
flight_number,OP0654
reg,A6-BMF
fleet_family,B787
sub_fleet,B787-10
sched_dep_iata,CAI
dep_country,Egypt
dep_region,Africa
sched_arr_iata,AUH
arr_region,Asia


In [4]:
## Count how many columns there are of each dtype.
df.dtypes.value_counts()

object            33
int64             24
float64           16
datetime64[ns]    12
dtype: int64

## Preprocessing

In [6]:
## Drop columns that are entirely empty, then the two columns removed by name.
## `df` is rebound (no inplace=True) and labels that are already gone are ignored,
## so re-running this cell does not raise a KeyError.
df = (
    df
    .dropna(axis=1, how='all')
    .drop(columns=['ttl_infants', 'std_z_year'], errors='ignore')
)

## Fixing Target

In [None]:
def _clock_to_minutes(clock):
    """Return whole minutes since midnight for a time-of-day value, or NaN if missing.

    Only the ``hour`` and ``minute`` attributes are read; seconds are ignored.
    """
    ## pd.isna covers None, NaN and NaT alike; an identity test against np.nan does not.
    if pd.isna(clock):
        return np.nan
    return clock.hour * 60 + clock.minute


## fixing the departure and arrival delay in minutes
## (an alternative, deriving the delays from the actual minus scheduled timestamps,
## was left disabled in this notebook)
df['ttl_dep_dly_fix'] = [_clock_to_minutes(x) for x in df['ttl_dep_dly']]
df['ttl_arr_dly_fix'] = [_clock_to_minutes(x) for x in df['ttl_arr_dly']]

## target. on-time if it actual departed within 15 minutes of scheduled departure.
## A missing delay compares as False, so such flights are labelled 0.
df['otp'] = np.where((df['ttl_dep_dly_fix'] <= 15) & df['std_z_date_time'].notna(), 1, 0)

In [7]:
## Flag flights that have an actual departure timestamp (some never departed).
df['departed'] = df['atd_z_date_time'].notna().astype(int)

## Flag passenger flights: flight_type 'J'.
df['passenger'] = (df['flight_type'] == 'J').astype('int64')

In [9]:
## Routes
df['route'] = df['sched_dep_iata'] + '-' + df['sched_arr_iata']

## Bucketing distances into five equal-width bins. Only departed passenger flights are
## binned; every other row is left as NaN through index alignment.
df['flight_distance'] = pd.cut(
    df.query('passenger==1 & departed==1')['distance'],
    5, labels=["ultrashort", "short", "medium", "long", "ultralong"])

## unique id
df['unique_id'] = (df['flight_number'] + '-' + df['schd_dep_date'].dt.strftime('%Y-%m-%d') + '-' + df['route'])

## Fixing Empty Seats
## Only flights whose ttl_seats is missing or zero take the largest seat count seen in
## their sub_fleet; recorded seat counts are kept as they are.
seat_capacity = df.groupby('sub_fleet')['ttl_seats'].transform('max')
seats_unknown = df['ttl_seats'].isna() | df['ttl_seats'].eq(0)
## Anything still missing (e.g. no sub_fleet to look up) falls back to 0.
df['ttl_seats'] = df['ttl_seats'].mask(seats_unknown, seat_capacity).fillna(0)

In [19]:
## Split the columns into numeric / object / datetime groups by dtype.
## The pandas dtype predicates are used instead of comparing against the strings
## 'int' / 'float' / 'datetime64[ns]', whose meaning depends on the platform integer
## width and on the datetime resolution.
target = 'otp'
num = [
    col for col, dtype in df.dtypes.items()
    ## the target is excluded here, so no remove() call can fail when it is absent
    if col != target
    and (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype))
]
cat = [col for col, dtype in df.dtypes.items() if pd.api.types.is_object_dtype(dtype)]
tim = [col for col, dtype in df.dtypes.items() if pd.api.types.is_datetime64_any_dtype(dtype)]

In [20]:
## Working set: passenger flights that actually departed, re-indexed from 0.
ps = df[df['passenger'].eq(1) & df['departed'].eq(1)].reset_index(drop=True).copy()

In [21]:
from sklearn.model_selection import train_test_split
## Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
train, test = train_test_split(ps, test_size=.2, random_state=0)

# Explore

## Summary Notes
- Currently, OTP for passenger flights is at 71.4%.
- Out of all 213 flights that experienced delay, 29 are within our control.
- By focusing on these, we can improve OTP to 75.3% (from 71.4%, a gain of about 3.9 percentage points).

Notes:
- Data excludes flights that do not have departure data.
- Data only includes Passenger Flights.
- There are some passenger_flights that flew even though ttl_seats=0.
    - I have filled this value referencing the total expected seats of the sub_fleet.
- There are different kinds of delay codes given to several flights.
    - I am just focusing only on the delay code in prim_c.
- ttl_dep_dly is wrong in some flights.

In [22]:
## Correlation of every numeric column with the target, strongest positive first.
## Non-numeric columns are removed explicitly: DataFrame.corr() stopped skipping them
## silently in pandas 2.0 and raises on object columns instead.
corr = ps.select_dtypes(include='number').corr()
corr[target].sort_values(ascending=False)

otp                    1.000000
ttl_fd_crew            0.058738
f_seats                0.050210
distance               0.043043
f_pax_rev              0.039457
block_difference       0.023641
dep_fuel               0.016858
mails                 -0.004438
arrival_fuel          -0.007536
ib_connex_pax_total   -0.015416
ttl_ops_crew          -0.016274
ttl_cc_crew           -0.038227
ttl_pax_pad           -0.048552
cargo                 -0.071297
arr_fuel              -0.095304
j_seats               -0.095905
ttl_seats             -0.113277
fuel_remaining        -0.115672
y_seats               -0.124736
lcl_term_pax_ttl      -0.137739
j_pax_rev             -0.155202
ob_connex_pax_total   -0.155487
bags_kgs              -0.215420
lcl_joining_pax_ttl   -0.229803
y_pax_rev             -0.233126
payload               -0.236534
ttl_pax_rev           -0.239723
ttl_arr_dly_fix       -0.567718
ttl_dep_dly_fix       -0.701342
departed                    NaN
passenger                   NaN
Name: ot

## Delay Codes

In [26]:
def _minutes_of_day(clock):
    """Convert a time-of-day value (with ``hour``/``minute``) to whole minutes."""
    return clock.hour * 60 + clock.minute


def stackDelay(frame, label, value):
    """Reshape wide delay-code columns into one row per flight and delay code.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``unique_id`` column plus every column named in
        ``label`` and ``value``.
    label : iterable of str
        Names of the columns holding the delay codes.
    value : iterable of str
        Names of the columns holding the matching delay durations as
        time-of-day values; paired positionally with ``label``.

    Returns
    -------
    pandas.DataFrame
        Columns ``unique_id``, ``Code`` and ``value``, where ``value`` is the
        delay in whole minutes summed per (unique_id, Code) within each
        label/value pair. Rows are ordered pair by pair, then by unique_id
        and Code. Empty (with the same columns) when there is nothing to stack.
    """
    pieces = []
    for code_col, delay_col in zip(label, value):
        ## Rows without a code or without a duration carry no delay information.
        slot = frame[['unique_id', code_col, delay_col]].dropna()
        slot.columns = ['unique_id', 'Code', 'value']
        ## Convert to minutes BEFORE aggregating: time objects cannot be added together,
        ## so summing them fails as soon as a flight/code pair occurs more than once.
        slot = slot.assign(value=slot['value'].map(_minutes_of_day))
        ## A plain groupby replaces the pivot -> stack round trip, which relied on
        ## stack() dropping the NaN cells the pivot itself introduced.
        pieces.append(slot.groupby(['unique_id', 'Code'], as_index=False)['value'].sum())

    if not pieces:
        ## pd.concat([]) raises; return an empty frame with the expected columns instead.
        return pd.DataFrame(columns=['unique_id', 'Code', 'value'])
    return pd.concat(pieces, ignore_index=True)

## Column names of the four delay slots: code, duration and description per slot.
err_code = [f'c{slot}_1' for slot in range(1, 5)]
err_dly = [f'dly{slot}' for slot in range(1, 5)]
err_desc = [f'c{slot}_desc' for slot in range(1, 5)]
delay_info = [*err_code, *err_dly, *err_desc]

## Long-format table of delay minutes per flight and delay code.
flight_dly = stackDelay(frame=ps, label=err_code, value=err_dly)

In [28]:
## Attach the delay-code lookup to each stacked delay, then total the delay minutes per
## flight for each 'Controllable' value (columns "N" and "Y") and add their sum.
curious = (
    flight_dly
    .join(delay_codes.set_index('Code'), on='Code')
    .pivot_table(index='unique_id', columns='Controllable', values='value',
                 aggfunc='sum', fill_value=0)
    .assign(tally_dep_dly=lambda per_flight: per_flight["N"] + per_flight["Y"])
    .fillna({'N': 0, 'Y': 0, 'tally_dep_dly': 0})
)

In [29]:
## Attach the per-flight delay totals. Columns left over from an earlier run of this
## cell are dropped first; otherwise re-running it raises on the overlapping names.
ps = ps.drop(columns=curious.columns, errors='ignore').join(curious, on='unique_id')

In [30]:
## Delayed flights only: summed delay-code minutes next to the recorded departure delay.
checkps = ps.loc[ps['ttl_dep_dly_fix'] > 0, ['tally_dep_dly', 'ttl_dep_dly_fix']]

In [31]:
## Flights whose delay-code minutes do not add up to the recorded departure delay.
checkps[checkps['tally_dep_dly'].ne(checkps['ttl_dep_dly_fix'])]

Unnamed: 0,tally_dep_dly,ttl_dep_dly_fix
244,64.0,23
330,14.0,8
447,18.0,10
555,34.0,27


In [32]:
## Inspect one flight in full: positional row 244 of ps.
ps.iloc[244]

schd_dep_date                                        2021-10-09 00:00:00
flight_number                                                     OP0206
reg                                                               A6-BMF
fleet_family                                                        B787
sub_fleet                                                        B787-10
sched_dep_iata                                                       AUH
dep_country                                         United Arab Emirates
dep_region                                                          Asia
sched_arr_iata                                                       BOM
arr_region                                                          Asia
std_z                                                           16:40:00
std_z_date                                           2021-10-09 00:00:00
std_z_date_time                                      2021-10-09 16:40:00
sta_z_date_time                                    

In [115]:
## Total delay minutes across all delay codes.
curious.loc[:, 'tally_dep_dly'].sum()

9773

In [116]:
## For flights with any coded delay: coded minutes versus recorded departure delay.
ps.loc[ps['tally_dep_dly'] > 0, ['tally_dep_dly', 'ttl_dep_dly_fix']].sum()

tally_dep_dly      9773.0
ttl_dep_dly_fix    9683.0
dtype: float64

In [117]:
## Late flights whose "N" delay minutes alone stay within the 15-minute on-time limit.
## The limit is inclusive (<= 15) to match how otp and otp_max are defined; rows and
## columns are selected in a single .loc call rather than by chained indexing.
ps.loc[(ps['N'] <= 15) & (ps['otp'] == 0), ['N', 'ttl_dep_dly_fix', 'otp']]

Unnamed: 0,N,ttl_dep_dly_fix,otp
0,8.0,27,0
5,0.0,31,0
17,0.0,20,0
26,14.0,29,0
112,0.0,33,0
120,13.0,26,0
180,0.0,35,0
207,0.0,46,0
267,10.0,27,0
353,0.0,20,0


In [118]:
## Best-case on-time flag: 1 when the "N" delay minutes are at most 15, else 0.
ps['otp_max'] = (ps['N'] <= 15).astype(int)

In [119]:
## Per current on-time status: how many flights are on time now vs. in the best case.
ps.groupby('otp')[['otp', 'otp_max']].agg('sum')

Unnamed: 0_level_0,otp,otp_max
otp,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,30
1,531,530


In [120]:
## Flights recorded as on time even though their "N" delay minutes exceed the limit.
ps[(ps['otp'] == 1) & (ps['otp_max'] == 0)]

Unnamed: 0,schd_dep_date,flight_number,reg,fleet_family,sub_fleet,sched_dep_iata,dep_country,dep_region,sched_arr_iata,arr_region,std_z,std_z_date,std_z_date_time,sta_z_date_time,etd_z_date_time,eta_z_date_time,atd_z,atd_z_date,atd_z_date_time,ata_z_date_time,sched_block,actual_block,block_difference,std_z_weekend,std_z_monthyear,sta_z_weekend,distance,flight_type,flight_category,prim_dly,prim_c,prim_c_desc,dly1,c1_1,c1_desc,dly2,c2_1,c2_desc,dly3,c3_1,c3_desc,dly4,c4_1,c4_desc,remarks,ttl_dep_dly,ttl_arr_dly,f_pax_rev,f_seats,j_pax_rev,j_seats,y_pax_rev,y_seats,ttl_pax_pad,ttl_fd_crew,ttl_cc_crew,ttl_ops_crew,ttl_pax_rev,ttl_seats,ib_connex_pax_total,ob_connex_pax_total,lcl_joining_pax_ttl,lcl_term_pax_ttl,arr_fuel,dep_fuel,cargo,mails,payload,bags_kgs,fuel_remaining,arrival_fuel,ttl_dep_dly_fix,ttl_arr_dly_fix,otp,departed,passenger,route,flight_distance,unique_id,N,Y,tally_dep_dly,otp_max
678,2021-10-29,OP0289,A6-BNC,B787,B787-9,AMD,India,Asia,AUH,Asia,22:40:00,2021-10-29,2021-10-29 22:40:00,2021-10-30 01:55:00,2021-10-29 23:00:00,2021-10-30 01:56:00,23:08:00,2021-10-29,2021-10-29 23:08:00,2021-10-30 01:59:00,03:15:00,02:51:00,-24.0,2021-10-30,2021-10-01,2021-10-30,990,J,,00:28:00,06B,Awaiting stand/slot at destination station,00:28:00,06B,Awaiting stand/slot at destination station,,,,,,,,,,,00:00:00,00:04:00,0,0,8,28,210,262,0.0,2,9,11,218,290,0,96,218,122,6100,19300,18210,0,42316,5057,7200.0,6100.0,0,4.0,1,1,1,AMD-AUH,ultrashort,OP0289-2021-10-29-AMD-AUH,28.0,0.0,28.0,0


In [121]:
## Share of flights flagged otp_max == 1 (the best-case on-time rate).
ps['otp_max'].mean()

0.7526881720430108

In [122]:
## Share of flights flagged otp == 1 (the current on-time rate).
ps['otp'].mean()

0.7137096774193549

# Submission

In [123]:
## Write the enriched passenger-flight table to disk, without the index column.
ps.to_csv('submission.csv', index=False)