Upon data study, this notebook will create new columns from flight delay claim dataset, that could be used as features in the prediction model.

In [1]:
# Used libraries
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# Load the dataset
data_df = pd.read_csv('../datasets/flight_delays_data.csv')

# Check data size
data_df.shape

(899114, 10)

In [3]:
# Show some sample data
data_df.head()

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0


From the study, we are going to create different types of statistics that help with the prediction:

# Delay hours statistics

From the study, we are going to create statistics of delay hours from different perspectives:
- Departure
- Arrival
- Airline

Below are not tested now first ...
- Departure + Arrival
- Departure + Airline (i.e. ~= Airline in this dataset)
- Arrival + Airline
- Departure + Arrival + Airline

For each perspective, the average delay_time of per (hour/day/week) is computed. After that, deviation of consecutive last average delay_time is computed.

In [4]:
# Get flight datetime-related columns
data_df['flight_date_dt'] = data_df['flight_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
data_df['flight_dt'] = data_df.apply(lambda x: x['flight_date_dt'] + timedelta(hours=x['std_hour']), axis=1)
data_df['flight_year'] = data_df['flight_dt'].apply(lambda x: x.year)
data_df['flight_month'] = data_df['flight_dt'].apply(lambda x: x.month)
data_df['flight_day'] = data_df['flight_dt'].apply(lambda x: x.day)
data_df['flight_hour_bin'] = data_df['std_hour'].apply(lambda x: x // 4)

In [5]:
data_df.sample(10)

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,flight_date_dt,flight_dt,flight_year,flight_month,flight_day,flight_hour_bin
685868,2131702,HX528,28,HKG,HAN,HX,15,Cancelled,2016-07-12,800,2016-07-12,2016-07-12 15:00:00,2016,7,12,3
149414,463340,QR5844,50,HKG,NRT,QR,1,0.6,2015-12-14,0,2015-12-14,2015-12-14 01:00:00,2015,12,14,0
726033,2256281,CX510,21,HKG,TPE,CX,14,0.3,2016-05-23,0,2016-05-23,2016-05-23 14:00:00,2016,5,23,3
797193,2476241,AY5855,29,HKG,SIN,AY,20,0.0,2014-07-21,0,2014-07-21,2014-07-21 20:00:00,2014,7,21,5
419118,1303652,QR5818,15,HKG,MEL,QR,19,0.3,2016-04-13,0,2016-04-13,2016-04-13 19:00:00,2016,4,13,4
342059,1064351,S74851,11,HKG,BKK,S7,16,0.0,2014-03-16,0,2014-03-16,2014-03-16 16:00:00,2014,3,16,4
592988,1844478,FJ392,13,HKG,NAN,FJ,16,1.2,2014-03-27,0,2014-03-27,2014-03-27 16:00:00,2014,3,27,4
523265,1628467,CX659,46,HKG,SIN,CX,1,0.0,2013-11-15,0,2013-11-15,2013-11-15 01:00:00,2013,11,15,0
137723,427006,HX998,24,HKG,BKK,HX,12,Cancelled,2015-06-13,800,2015-06-13,2015-06-13 12:00:00,2015,6,13,3
319010,993251,CX5295,29,HKG,HAN,CX,17,0.3,2014-07-19,0,2014-07-19,2014-07-19 17:00:00,2014,7,19,4


In [6]:
# Before processing delay_time statistics, remove cancalled entries first
non_cancel_df = data_df[~(data_df['delay_time'] == "Cancelled")].copy()

# Translate delay_time to float
non_cancel_df['delay_time'] = non_cancel_df['delay_time'].apply(lambda x: float(x))

# Note: groupby preserves the order of rows within each group
non_cancel_df = non_cancel_df.sort_values(['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],ascending=False)

In [7]:
# Departure + hour
dep_hr_delay_df = non_cancel_df.groupby(['Departure', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).mean()['delay_time'].reset_index()

In [8]:
# Arrival + hour
arr_hr_delay_df = non_cancel_df.groupby(['Arrival', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).mean()['delay_time'].reset_index()

In [9]:
# Airline + hour
air_hr_delay_df = non_cancel_df.groupby(['Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).mean()['delay_time'].reset_index()

In [10]:
# Departure + day
dep_day_delay_df = non_cancel_df.groupby(['Departure', 'flight_year', 'flight_month', 'flight_day']).mean()['delay_time'].reset_index()

In [11]:
# Arrival + day
arr_day_delay_df = non_cancel_df.groupby(['Arrival', 'flight_year', 'flight_month', 'flight_day']).mean()['delay_time'].reset_index()

In [12]:
# Airline + day
air_day_delay_df = non_cancel_df.groupby(['Airline', 'flight_year', 'flight_month', 'flight_day']).mean()['delay_time'].reset_index()

In [13]:
# Departure + week
dep_wk_delay_df = non_cancel_df.groupby(['Departure', 'flight_year', 'Week']).mean()['delay_time'].reset_index()

In [14]:
# Arrival + week
arr_wk_delay_df = non_cancel_df.groupby(['Arrival', 'flight_year', 'Week']).mean()['delay_time'].reset_index()

In [15]:
# Airline + week
air_wk_delay_df = non_cancel_df.groupby(['Airline', 'flight_year', 'Week']).mean()['delay_time'].reset_index()

In [16]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training

# Hour-based ts
def get_time_series_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    std_hour = row['std_hour']
    return flight_year * (10 ** 6) + flight_month * (10 ** 4) + flight_day * (10 ** 2) + std_hour

dep_hr_delay_df['flight_ts'] = dep_hr_delay_df.apply(get_time_series_val, axis=1)
arr_hr_delay_df['flight_ts'] = arr_hr_delay_df.apply(get_time_series_val, axis=1)
air_hr_delay_df['flight_ts'] = air_hr_delay_df.apply(get_time_series_val, axis=1)

In [17]:
# Day-based ts
def get_time_series_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    return flight_year * (10 ** 4) + flight_month * (10 ** 2) + flight_day

dep_day_delay_df['flight_day_ts'] = dep_day_delay_df.apply(get_time_series_val, axis=1)
arr_day_delay_df['flight_day_ts'] = arr_day_delay_df.apply(get_time_series_val, axis=1)
air_day_delay_df['flight_day_ts'] = air_day_delay_df.apply(get_time_series_val, axis=1)

In [18]:
# Week-based ts
def get_time_series_val(row):
    flight_year = row['flight_year']
    flight_wk = row['Week']
    return flight_year * (10 ** 2) + flight_wk

dep_wk_delay_df['flight_wk_ts'] = dep_wk_delay_df.apply(get_time_series_val, axis=1)
arr_wk_delay_df['flight_wk_ts'] = arr_wk_delay_df.apply(get_time_series_val, axis=1)
air_wk_delay_df['flight_wk_ts'] = air_wk_delay_df.apply(get_time_series_val, axis=1)

In [19]:
# Rename columns for join operation
dep_hr_delay_df.rename(columns={'delay_time': 'dep_hr_delay_time'}, inplace=True)
arr_hr_delay_df.rename(columns={'delay_time': 'arr_hr_delay_time'}, inplace=True)
air_hr_delay_df.rename(columns={'delay_time': 'air_hr_delay_time'}, inplace=True)

In [20]:
dep_day_delay_df.rename(columns={'delay_time': 'dep_day_delay_time'}, inplace=True)
arr_day_delay_df.rename(columns={'delay_time': 'arr_day_delay_time'}, inplace=True)
air_day_delay_df.rename(columns={'delay_time': 'air_day_delay_time'}, inplace=True)

In [21]:
dep_wk_delay_df.rename(columns={'delay_time': 'dep_wk_delay_time'}, inplace=True)
arr_wk_delay_df.rename(columns={'delay_time': 'arr_wk_delay_time'}, inplace=True)
air_wk_delay_df.rename(columns={'delay_time': 'air_wk_delay_time'}, inplace=True)

# Delay status statistics

In [45]:
# Before processing delay_time statistics, remove cancalled entries first
non_cancel_df = data_df[~(data_df['delay_time'] == "Cancelled")].copy()

# Translate delay_time to float
non_cancel_df['is_delayed'] = non_cancel_df['delay_time'].apply(lambda x: float(x) >= 3.0)

# Note: groupby preserves the order of rows within each group
non_cancel_df = non_cancel_df.sort_values(['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],ascending=False)

In [46]:
# Departure + hour
dep_hr_delay_count_df = non_cancel_df.groupby(['Departure', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).count()['is_delayed'].reset_index()

In [49]:
# Arrival + hour
arr_hr_delay_count_df = non_cancel_df.groupby(['Arrival', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).count()['is_delayed'].reset_index()

In [50]:
# Airline + hour
air_hr_delay_count_df = non_cancel_df.groupby(['Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).count()['is_delayed'].reset_index()

In [51]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training
def get_time_series_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    std_hour = row['std_hour']
    return flight_year * (10 ** 6) + flight_month * (10 ** 4) + flight_day * (10 ** 2) + std_hour

dep_hr_delay_count_df['flight_ts'] = dep_hr_delay_count_df.apply(get_time_series_val, axis=1)
arr_hr_delay_count_df['flight_ts'] = arr_hr_delay_count_df.apply(get_time_series_val, axis=1)
air_hr_delay_count_df['flight_ts'] = air_hr_delay_count_df.apply(get_time_series_val, axis=1)

In [52]:
dep_hr_delay_count_df.rename(columns={'is_delayed': 'dep_hr_delay_count'}, inplace=True)
arr_hr_delay_count_df.rename(columns={'is_delayed': 'arr_hr_delay_count'}, inplace=True)
air_hr_delay_count_df.rename(columns={'is_delayed': 'air_hr_delay_count'}, inplace=True)

# Cancelled status statistics

In [22]:
# Before processing delay_time statistics, remove cancalled entries first
cancel_df = data_df[(data_df['delay_time'] == "Cancelled")].copy()

# Translate delay_time to float
cancel_df['is_cancel'] = cancel_df['delay_time'].apply(lambda x: x == "Cancelled")

# Note: groupby preserves the order of rows within each group
cancel_df = cancel_df.sort_values(['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],ascending=False)

In [23]:
cancel_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,flight_date_dt,flight_dt,flight_year,flight_month,flight_day,flight_hour_bin,is_cancel
806612,2505450,HX2206,13,HKG,ZYI,HX,17,Cancelled,2016-03-28,800,2016-03-28,2016-03-28 17:00:00,2016,3,28,4,True
566525,1762897,HX2206,9,HKG,ZYI,HX,17,Cancelled,2016-02-29,800,2016-02-29,2016-02-29 17:00:00,2016,2,29,4,True
149160,462548,HX2206,9,HKG,ZYI,HX,17,Cancelled,2016-02-26,800,2016-02-26,2016-02-26 17:00:00,2016,2,26,4,True
752647,2338383,HX2206,8,HKG,ZYI,HX,17,Cancelled,2016-02-19,800,2016-02-19,2016-02-19 17:00:00,2016,2,19,4,True
608406,1892235,HX2206,7,HKG,ZYI,HX,17,Cancelled,2016-02-12,800,2016-02-12,2016-02-12 17:00:00,2016,2,12,4,True
885495,2750955,HX2206,6,HKG,ZYI,HX,17,Cancelled,2016-02-08,800,2016-02-08,2016-02-08 17:00:00,2016,2,8,4,True
740698,2301325,HX2206,6,HKG,ZYI,HX,17,Cancelled,2016-02-05,800,2016-02-05,2016-02-05 17:00:00,2016,2,5,4,True
157370,487932,HX2206,5,HKG,ZYI,HX,17,Cancelled,2016-01-29,800,2016-01-29,2016-01-29 17:00:00,2016,1,29,4,True
631437,1963355,HX2206,4,HKG,ZYI,HX,17,Cancelled,2016-01-25,800,2016-01-25,2016-01-25 17:00:00,2016,1,25,4,True
526252,1637839,HX2206,4,HKG,ZYI,HX,17,Cancelled,2016-01-22,800,2016-01-22,2016-01-22 17:00:00,2016,1,22,4,True


In [24]:
# Departure + hour
dep_hr_cancel_df = cancel_df.groupby(['Departure', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).count()['is_cancel'].reset_index()

In [25]:
# Arrival + hour
arr_hr_cancel_df = cancel_df.groupby(['Arrival', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).count()['is_cancel'].reset_index()

In [26]:
# Airline + hour
air_hr_cancel_df = cancel_df.groupby(['Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).count()['is_cancel'].reset_index()

In [27]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training
def get_time_series_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    std_hour = row['std_hour']
    return flight_year * (10 ** 6) + flight_month * (10 ** 4) + flight_day * (10 ** 2) + std_hour

dep_hr_cancel_df['flight_ts'] = dep_hr_cancel_df.apply(get_time_series_val, axis=1)
arr_hr_cancel_df['flight_ts'] = arr_hr_cancel_df.apply(get_time_series_val, axis=1)
air_hr_cancel_df['flight_ts'] = air_hr_cancel_df.apply(get_time_series_val, axis=1)

In [28]:
dep_hr_cancel_df.rename(columns={'is_cancel': 'dep_hr_cancel_count'}, inplace=True)
arr_hr_cancel_df.rename(columns={'is_cancel': 'arr_hr_cancel_count'}, inplace=True)
air_hr_cancel_df.rename(columns={'is_cancel': 'air_hr_cancel_count'}, inplace=True)

In [29]:
dep_hr_cancel_df

Unnamed: 0,Departure,flight_year,flight_month,flight_day,std_hour,dep_hr_cancel_count,flight_ts
0,HKG,2013,9,1,0,1,2013090100
1,HKG,2013,9,1,9,1,2013090109
2,HKG,2013,9,1,10,4,2013090110
3,HKG,2013,9,1,11,7,2013090111
4,HKG,2013,9,1,14,1,2013090114
5,HKG,2013,9,1,19,1,2013090119
6,HKG,2013,9,1,21,3,2013090121
7,HKG,2013,9,1,22,1,2013090122
8,HKG,2013,9,2,9,4,2013090209
9,HKG,2013,9,2,10,2,2013090210


Merging the statistics to original dataset ...

In [30]:
def get_last_time_series_val(row):
    flight_dt = row['flight_dt'] - timedelta(hours=1)
    return int(flight_dt.strftime("%Y%m%d%H"))

In [31]:
feature_df = data_df.copy()
feature_df['flight_ts'] = feature_df.apply(get_last_time_series_val, axis=1)

In [32]:
# Do the same for day/week
def get_last_day_time_series_val(row):
    flight_dt = row['flight_dt'] - timedelta(days=1)
    return int(flight_dt.strftime("%Y%m%d"))

feature_df['flight_day_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [33]:
def get_number_of_weeks_in_year(year):
    last_week = datetime(year, 12, 28)
    return last_week.isocalendar()[1]

def get_last_wk_time_series_val(row):
    flight_year = row['flight_year']
    flight_week = row['Week'] - 1
    
    if flight_week < 1:
        # Shift to last year end
        flight_year -= 1
        flight_week = get_number_of_weeks_in_year(flight_year)
        
    return flight_year * (10 ** 2) + flight_week

feature_df['flight_wk_ts'] = feature_df.apply(get_last_wk_time_series_val, axis=1)

In [34]:
feature_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,flight_date_dt,flight_dt,flight_year,flight_month,flight_day,flight_hour_bin,flight_ts,flight_day_ts,flight_wk_ts
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,2016-07-01,2016-07-01 10:00:00,2016,7,1,2,2016070109,20160630,201626
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,2015-04-23,2015-04-23 11:00:00,2015,4,23,2,2015042310,20150422,201516
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,2014-04-08,2014-04-08 11:00:00,2014,4,8,2,2014040810,20140407,201413
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,2013-09-15,2013-09-15 03:00:00,2013,9,15,0,2013091502,20130914,201336
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,2015-10-05,2015-10-05 09:00:00,2015,10,5,2,2015100508,20151004,201539
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,2015-03-10,2015-03-10 23:00:00,2015,3,10,5,2015031022,20150309,201509
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,2015-12-19,2015-12-19 07:00:00,2015,12,19,1,2015121906,20151218,201550
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,2014-02-23,2014-02-23 12:00:00,2014,2,23,3,2014022311,20140222,201407
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,2014-10-21,2014-10-21 21:00:00,2014,10,21,5,2014102120,20141020,201441
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,2015-03-05,2015-03-05 00:00:00,2015,3,5,0,2015030423,20150304,201509


In [53]:
# Merge generated stat to original dataset
# The merge operation is based on last hr/day/wk from current row's datetime, thus it is assumed those statistics can be calculated for new predictions

# Delay time
merged_feature_df = feature_df.merge(dep_hr_delay_df[['Departure', 'flight_ts', 'dep_hr_delay_time']], how='left', left_on=['Departure', 'flight_ts'], right_on=['Departure', 'flight_ts'])
merged_feature_df = merged_feature_df.merge(arr_hr_delay_df[['Arrival', 'flight_ts', 'arr_hr_delay_time']], how='left', left_on=['Arrival', 'flight_ts'], right_on=['Arrival', 'flight_ts'])
merged_feature_df = merged_feature_df.merge(air_hr_delay_df[['Airline', 'flight_ts', 'air_hr_delay_time']], how='left', left_on=['Airline', 'flight_ts'], right_on=['Airline', 'flight_ts'])

In [54]:
merged_feature_df = merged_feature_df.merge(dep_day_delay_df[['Departure', 'flight_day_ts', 'dep_day_delay_time']], how='left', left_on=['Departure', 'flight_day_ts'], right_on=['Departure', 'flight_day_ts'])
merged_feature_df = merged_feature_df.merge(arr_day_delay_df[['Arrival', 'flight_day_ts', 'arr_day_delay_time']], how='left', left_on=['Arrival', 'flight_day_ts'], right_on=['Arrival', 'flight_day_ts'])
merged_feature_df = merged_feature_df.merge(air_day_delay_df[['Airline', 'flight_day_ts', 'air_day_delay_time']], how='left', left_on=['Airline', 'flight_day_ts'], right_on=['Airline', 'flight_day_ts'])

In [55]:
merged_feature_df = merged_feature_df.merge(dep_wk_delay_df[['Departure', 'flight_wk_ts', 'dep_wk_delay_time']], how='left', left_on=['Departure', 'flight_wk_ts'], right_on=['Departure', 'flight_wk_ts'])
merged_feature_df = merged_feature_df.merge(arr_wk_delay_df[['Arrival', 'flight_wk_ts', 'arr_wk_delay_time']], how='left', left_on=['Arrival', 'flight_wk_ts'], right_on=['Arrival', 'flight_wk_ts'])
merged_feature_df = merged_feature_df.merge(air_wk_delay_df[['Airline', 'flight_wk_ts', 'air_wk_delay_time']], how='left', left_on=['Airline', 'flight_wk_ts'], right_on=['Airline', 'flight_wk_ts'])

In [56]:
# Delay statistics
merged_feature_df = merged_feature_df.merge(dep_hr_delay_count_df[['Departure', 'flight_ts', 'dep_hr_delay_count']], how='left', left_on=['Departure', 'flight_ts'], right_on=['Departure', 'flight_ts'])
merged_feature_df = merged_feature_df.merge(arr_hr_delay_count_df[['Arrival', 'flight_ts', 'arr_hr_delay_count']], how='left', left_on=['Arrival', 'flight_ts'], right_on=['Arrival', 'flight_ts'])
merged_feature_df = merged_feature_df.merge(air_hr_delay_count_df[['Airline', 'flight_ts', 'air_hr_delay_count']], how='left', left_on=['Airline', 'flight_ts'], right_on=['Airline', 'flight_ts'])

In [57]:
# Cancel statistics
merged_feature_df = merged_feature_df.merge(dep_hr_cancel_df[['Departure', 'flight_ts', 'dep_hr_cancel_count']], how='left', left_on=['Departure', 'flight_ts'], right_on=['Departure', 'flight_ts'])
merged_feature_df = merged_feature_df.merge(arr_hr_cancel_df[['Arrival', 'flight_ts', 'arr_hr_cancel_count']], how='left', left_on=['Arrival', 'flight_ts'], right_on=['Arrival', 'flight_ts'])
merged_feature_df = merged_feature_df.merge(air_hr_cancel_df[['Airline', 'flight_ts', 'air_hr_cancel_count']], how='left', left_on=['Airline', 'flight_ts'], right_on=['Airline', 'flight_ts'])

In [58]:
merged_feature_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,...,air_day_delay_time,dep_wk_delay_time,arr_wk_delay_time,air_wk_delay_time,dep_hr_delay_count,arr_hr_delay_count,air_hr_delay_count,dep_hr_cancel_count,arr_hr_cancel_count,air_hr_cancel_count
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,...,0.721739,0.809908,0.446497,0.550909,64.0,,2.0,,,
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,...,0.269231,0.356342,-0.080000,0.483168,49.0,,2.0,,,
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,...,0.400000,0.648924,0.686364,0.517949,46.0,1.0,,,,
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,...,0.233333,0.172782,0.108333,0.404545,3.0,,,,,
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,...,2.155556,0.669462,0.821120,0.639538,72.0,4.0,14.0,2.0,,1.0
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,...,0.027273,0.308513,0.250000,0.117568,14.0,,,2.0,,
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,...,0.807018,0.678668,0.646000,0.704729,1.0,,,,,
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,...,-0.008333,0.265036,0.107647,0.096053,33.0,,,5.0,,
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,...,0.170312,0.179385,0.457143,0.144291,49.0,,3.0,,,
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,...,0.120103,0.308513,0.271429,0.239143,42.0,,5.0,,,


In [59]:
merged_feature_df['is_claim'].value_counts()

0      859701
800     39413
Name: is_claim, dtype: int64

In [60]:
merged_feature_df.dropna()['is_claim'].value_counts()

0      3480
800     572
Name: is_claim, dtype: int64

In [61]:
merged_feature_df.to_csv('../datasets/flight_delays_data_transformed.csv')

In [157]:
merged_feature_df.fillna(0)

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,...,flight_month,flight_day,flight_hour_bin,flight_ts,dep_hr_delay_time,arr_hr_delay_time,air_hr_delay_time,dep_hr_cancel_count,arr_hr_cancel_count,air_hr_cancel_count
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,...,7,1,2,2016070110,0.483673,0.200000,0.300000,0.0,0.0,0.0
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,...,4,23,2,2015042311,0.238889,0.500000,0.266667,2.0,0.0,0.0
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,...,4,8,2,2014040811,0.216216,0.000000,0.000000,0.0,0.0,0.0
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,...,9,15,0,2013091503,0.125000,0.100000,0.125000,0.0,0.0,0.0
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,...,10,5,2,2015100509,1.359322,0.500000,1.366667,0.0,0.0,0.0
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,...,3,10,5,2015031023,0.276471,0.200000,0.150000,3.0,0.0,0.0
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,...,12,19,1,2015121907,0.520000,0.400000,0.580000,0.0,0.0,0.0
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,...,2,23,3,2014022312,0.106977,0.100000,0.100000,3.0,0.0,0.0
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,...,10,21,5,2014102121,0.268571,0.000000,0.100000,0.0,0.0,0.0
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,...,3,5,0,2015030500,0.117647,0.300000,0.116667,0.0,0.0,0.0


In [44]:
# Departure + day
dep_day_delay_df = non_cancel_df.groupby(['Departure', 'flight_year', 'flight_month', 'flight_day']).mean()['delay_time'].reset_index()

In [None]:
# Arrival + hour
arr_day_delay_df = non_cancel_df.groupby(['Arrival', 'flight_year', 'flight_month', 'flight_day', 'std_hour']).mean()['delay_time'].reset_index()

In [40]:
test_df[(test_df['flight_month'] == 9) & (test_df['flight_day'] > 21)]

Unnamed: 0,Departure,flight_year,flight_month,flight_day,std_hour,flight_id,Week,delay_time,is_claim,flight_hour_bin
459,HKG,2013,9,22,0,1.245541e+06,38.0,0.188889,0.000000,0.0
460,HKG,2013,9,22,1,1.271148e+06,38.0,0.065000,0.000000,0.0
461,HKG,2013,9,22,2,1.228075e+06,38.0,0.033333,0.000000,0.0
462,HKG,2013,9,22,3,1.437141e+06,38.0,0.000000,0.000000,0.0
463,HKG,2013,9,22,4,1.479844e+06,38.0,-0.080000,0.000000,1.0
464,HKG,2013,9,22,5,7.759900e+05,38.0,0.100000,0.000000,1.0
465,HKG,2013,9,22,7,1.418436e+06,38.0,0.023077,0.000000,1.0
466,HKG,2013,9,22,8,1.451535e+06,38.0,0.132812,0.000000,2.0
467,HKG,2013,9,22,9,1.408648e+06,38.0,0.423256,0.000000,2.0
468,HKG,2013,9,22,10,1.380346e+06,38.0,0.360870,0.000000,2.0
