Building on the earlier data study, this notebook derives new columns from the flight delay claim dataset that can be used as features in the prediction model.

In [44]:
# Used libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import calendar

In [23]:
# Load the raw flight delay claim dataset (the path is relative to the notebook's working directory)
data_df = pd.read_csv('../datasets/flight_delays_data.csv')

# Check data size as (rows, columns)
data_df.shape

(899114, 10)

In [24]:
# Show some sample data
data_df.head()

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0


In [53]:
# For Airline = NaN, fill the value from flight_no
def fill_missing_airline(row):
    """Fill a missing ``Airline`` with the first two characters of ``flight_no``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row as pandas Series)
        Must provide the keys ``'Airline'`` and ``'flight_no'``.

    Returns
    -------
    The same ``row`` object, modified in place when a fill was needed.
    """
    airline = row['Airline']
    flight_no = row['flight_no']
    # pd.isna also recognises None / pd.NA, whereas np.isnan raises TypeError on None.
    # The isinstance check on flight_no avoids slicing a missing flight number.
    if not isinstance(airline, str) and pd.isna(airline) and isinstance(flight_no, str):
        row['Airline'] = flight_no[:2]
    return row
    
# Vectorized fill: take the first two characters of flight_no wherever Airline is missing.
# This avoids rebuilding the whole frame row by row with apply(axis=1).
data_df['Airline'] = data_df['Airline'].fillna(data_df['flight_no'].str[:2])

From the study, we are going to create different types of statistics that help with the prediction:

# Delay hours statistics

From the study, we are going to create statistics of delay hours from different perspectives:
- Departure
- Arrival
- Airline
- Arrival + Airline

The combinations below are not tested because the Departure airport is fixed:
- Departure + Arrival
- Departure + Airline
- Departure + Arrival + Airline

For each perspective, the average delay_time per time bin (hour/day/week) is computed. In addition, counts of delayed/cancelled flights are computed per time bin.

In [4]:
# Get flight datetime-related columns
# Vectorized pandas operations replace the row-wise apply calls, which are very slow on a frame of this size
data_df['flight_date_dt'] = pd.to_datetime(data_df['flight_date'], format='%Y-%m-%d')
# Scheduled timestamp = flight date + scheduled hour of the day
data_df['flight_dt'] = data_df['flight_date_dt'] + pd.to_timedelta(data_df['std_hour'], unit='h')
# Cast to int64 so the integer keys built from these parts keep the same dtype on every pandas version
data_df['flight_year'] = data_df['flight_dt'].dt.year.astype('int64')
data_df['flight_month'] = data_df['flight_dt'].dt.month.astype('int64')
data_df['flight_day'] = data_df['flight_dt'].dt.day.astype('int64')
# Day-of-month bins by integer division: bin 0 is shorter than the others (day 1 only / days 1-3)
data_df['flight_2_day_bin'] = data_df['flight_day'] // 2
data_df['flight_4_day_bin'] = data_df['flight_day'] // 4
# Hour-of-day bins: 0..11 for 2-hour bins, 0..5 for 4-hour bins
data_df['flight_2_hour_bin'] = data_df['std_hour'] // 2
data_df['flight_4_hour_bin'] = data_df['std_hour'] // 4

In [5]:
data_df.sample(10)

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,flight_date_dt,flight_dt,flight_year,flight_month,flight_day,flight_2_day_bin,flight_4_day_bin,flight_2_hour_bin,flight_4_hour_bin
818316,2541969,UO182,46,HKG,RMQ,UO,11,0.5,2013-11-14,0,2013-11-14,2013-11-14 11:00:00,2013,11,14,7,3,5,2
729839,2267877,3U8618,6,HKG,CTU,3U,17,0.3,2014-02-10,0,2014-02-10,2014-02-10 17:00:00,2014,2,10,5,2,8,4
56537,175166,CX1727,44,HKG,KUL,CX,14,0.0,2014-11-04,0,2014-11-04,2014-11-04 14:00:00,2014,11,4,2,1,7,3
842690,2617895,SQ857,2,HKG,SIN,SQ,9,0.5,2016-01-08,0,2016-01-08,2016-01-08 09:00:00,2016,1,8,4,2,4,2
578267,1799094,CX749,44,HKG,JNB,CX,23,0.0,2014-11-01,0,2014-11-01,2014-11-01 23:00:00,2014,11,1,0,0,11,5
521760,1623782,KE616,1,HKG,PUS,KE,2,-0.2,2015-01-04,0,2015-01-04,2015-01-04 02:00:00,2015,1,4,2,1,1,0
94951,294558,HX173,20,HKG,FOC,HX,22,0.4,2015-05-14,0,2015-05-14,2015-05-14 22:00:00,2015,5,14,7,3,11,5
156691,485852,HX1852,9,HKG,TPE,HX,11,-0.1,2015-03-03,0,2015-03-03,2015-03-03 11:00:00,2015,3,3,1,0,5,2
229852,714117,VN3563,9,HKG,SGN,VN,8,0.5,2016-02-27,0,2016-02-27,2016-02-27 08:00:00,2016,2,27,13,6,4,2
174970,542828,CX695,19,HKG,DEL,CX,17,0.5,2016-05-11,0,2016-05-11,2016-05-11 17:00:00,2016,5,11,5,2,8,4


In [6]:
# Before processing delay_time statistics, remove cancelled entries first
non_cancel_df = data_df.loc[data_df['delay_time'] != "Cancelled"].copy()

# Translate delay_time to float
non_cancel_df['delay_time'] = non_cancel_df['delay_time'].astype(float)

# Note: groupby preserves the order of rows within each group
non_cancel_df = non_cancel_df.sort_values(
    by=['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],
    ascending=False,
)

In [7]:
# Aggregate per different perspectives by different bins
# Each perspective is the list of entity columns the statistics are grouped by
perspectives = {'dep': ['Departure'],
                'arr': ['Arrival'],
                'air': ['Airline'],
                'arr_air': ['Arrival', 'Airline']}

# Each time bin is the list of time columns that identify one bin
time_bins = {'hr': ['flight_year', 'flight_month', 'flight_day', 'std_hour'],
             '2_hr': ['flight_year', 'flight_month', 'flight_day', 'flight_2_hour_bin'],
             '4_hr': ['flight_year', 'flight_month', 'flight_day', 'flight_4_hour_bin'],
             'day': ['flight_year', 'flight_month', 'flight_day'],
             '2_day': ['flight_year', 'flight_month', 'flight_2_day_bin'],
             '4_day': ['flight_year', 'flight_month', 'flight_4_day_bin'],
             'wk': ['flight_year', 'Week']}

# Creation of different stat dfs: one frame of average delay_time per (perspective, time bin) pair
perspective_time_dfs = {}
for p_key, p in perspectives.items():
    for t_key, t in time_bins.items():
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        # Select delay_time BEFORE aggregating: averaging the whole frame wastes work on every
        # other column and raises TypeError on pandas >= 2.0, where non-numeric columns are
        # no longer silently dropped.
        perspective_time_dfs[pt_key] = (
            non_cancel_df.groupby(perspective_time)['delay_time'].mean().reset_index()
        )

In [8]:
# Helper functions for getting timestamp values by different bins
def get_hr_ts_val(row):
    """Return the hourly key YYYYMMDDHH (as a number) built from the row's date parts and std_hour."""
    year, month, day = row['flight_year'], row['flight_month'], row['flight_day']
    # Horner form of year*10^6 + month*10^4 + day*10^2 + hour
    return ((year * 100 + month) * 100 + day) * 100 + int(row['std_hour'])

def get_2_hr_ts_val(row):
    """Return the 2-hour-bin key YYYYMMDDBB, where BB is the row's flight_2_hour_bin."""
    year, month, day = row['flight_year'], row['flight_month'], row['flight_day']
    # Horner form of year*10^6 + month*10^4 + day*10^2 + bin
    return ((year * 100 + month) * 100 + day) * 100 + row['flight_2_hour_bin']

def get_4_hr_ts_val(row):
    """Return the 4-hour-bin key YYYYMMDDBB, where BB is the row's flight_4_hour_bin."""
    year, month, day = row['flight_year'], row['flight_month'], row['flight_day']
    # Horner form of year*10^6 + month*10^4 + day*10^2 + bin
    return ((year * 100 + month) * 100 + day) * 100 + row['flight_4_hour_bin']

def get_day_ts_val(row):
    """Return the daily key YYYYMMDD built from the row's flight_year, flight_month and flight_day."""
    year_month = row['flight_year'] * 100 + row['flight_month']
    return year_month * 100 + row['flight_day']

def get_2_day_ts_val(row):
    """Return the 2-day-bin key YYYYMMBB, where BB is the row's flight_2_day_bin."""
    year_month = row['flight_year'] * 100 + row['flight_month']
    return year_month * 100 + row['flight_2_day_bin']

def get_4_day_ts_val(row):
    """Return the 4-day-bin key YYYYMMBB, where BB is the row's flight_4_day_bin."""
    year_month = row['flight_year'] * 100 + row['flight_month']
    return year_month * 100 + row['flight_4_day_bin']

def get_wk_ts_val(row):
    """Return the weekly key YYYYWW built from the row's flight_year and Week."""
    return row['flight_year'] * 100 + row['Week']

In [9]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training

# (suffix, key column, key builder), checked in order: the binned suffixes must come before
# '_day' / '_hr' because e.g. 'dep_2_hr' also ends with '_hr'
ts_key_rules = [
    ('_2_hr', 'flight_2_hour_ts', get_2_hr_ts_val),
    ('_4_hr', 'flight_4_hour_ts', get_4_hr_ts_val),
    ('_2_day', 'flight_2_day_ts', get_2_day_ts_val),
    ('_4_day', 'flight_4_day_ts', get_4_day_ts_val),
    ('_day', 'flight_day_ts', get_day_ts_val),
    ('_wk', 'flight_wk_ts', get_wk_ts_val),
    ('_hr', 'flight_ts', get_hr_ts_val),
]

for pt_key, stat_df in perspective_time_dfs.items():
    for suffix, ts_col, build_ts in ts_key_rules:
        if pt_key.endswith(suffix):
            stat_df[ts_col] = stat_df.apply(build_ts, axis=1)
            break

    # Rename column for join operation
    stat_df.rename(columns={'delay_time': pt_key + '_delay_time'}, inplace=True)

In [10]:
# Keep the average-delay frames under their own name; dict.copy() is shallow, so the DataFrames are shared, not duplicated
delay_time_dfs = perspective_time_dfs.copy()

In [11]:
# Do the same, but only over flights whose delay_time is at least 3.0
# Before processing delay_time statistics, remove cancelled entries first
non_cancel_df = data_df.loc[data_df['delay_time'] != "Cancelled"].copy()

# Translate delay_time to float; values below the 3.0 threshold are overwritten with the sentinel -100
non_cancel_df['delay_time'] = non_cancel_df['delay_time'].astype(float)
non_cancel_df['delay_time'] = non_cancel_df['delay_time'].where(non_cancel_df['delay_time'] >= 3.0, -100)
have_delay_df = non_cancel_df.loc[non_cancel_df['delay_time'] >= 3.0]

# Note: groupby preserves the order of rows within each group
have_delay_df = have_delay_df.sort_values(
    by=['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],
    ascending=False,
)

In [12]:
# Creation of different stat dfs: average delay_time of the delayed-only flights per (perspective, time bin) pair
perspective_time_dfs = {}
for p_key, p in perspectives.items():
    for t_key, t in time_bins.items():
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        # Select delay_time BEFORE aggregating: averaging the whole frame wastes work on every
        # other column and raises TypeError on pandas >= 2.0, where non-numeric columns are
        # no longer silently dropped.
        perspective_time_dfs[pt_key] = (
            have_delay_df.groupby(perspective_time)['delay_time'].mean().reset_index()
        )

In [13]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training

# (suffix, key column, key builder), checked in order: the binned suffixes must come before
# '_day' / '_hr' because e.g. 'dep_2_hr' also ends with '_hr'
ts_key_rules = [
    ('_2_hr', 'flight_2_hour_ts', get_2_hr_ts_val),
    ('_4_hr', 'flight_4_hour_ts', get_4_hr_ts_val),
    ('_2_day', 'flight_2_day_ts', get_2_day_ts_val),
    ('_4_day', 'flight_4_day_ts', get_4_day_ts_val),
    ('_day', 'flight_day_ts', get_day_ts_val),
    ('_wk', 'flight_wk_ts', get_wk_ts_val),
    ('_hr', 'flight_ts', get_hr_ts_val),
]

for pt_key, stat_df in perspective_time_dfs.items():
    for suffix, ts_col, build_ts in ts_key_rules:
        if pt_key.endswith(suffix):
            stat_df[ts_col] = stat_df.apply(build_ts, axis=1)
            break

    # Rename column for join operation
    stat_df.rename(columns={'delay_time': pt_key + '_delay_only_time'}, inplace=True)

In [14]:
# Keep the delayed-only average frames under their own name; dict.copy() is shallow, so the DataFrames are shared, not duplicated
delay_only_time_dfs = perspective_time_dfs.copy()

# Delay status statistics

In [15]:
# Before processing delay status statistics, remove cancelled entries first
non_cancel_df = data_df.loc[data_df['delay_time'] != "Cancelled"].copy()

# Flag the flights whose delay_time reaches the 3.0 threshold
non_cancel_df['is_delayed'] = non_cancel_df['delay_time'].astype(float) >= 3.0

# Note: groupby preserves the order of rows within each group
non_cancel_df = non_cancel_df.sort_values(
    by=['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],
    ascending=False,
)

In [16]:
# Creation of different stat dfs: number of delayed flights per (perspective, time bin) pair
perspective_time_dfs = {}
for p_key, p in perspectives.items():
    for t_key, t in time_bins.items():
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        # Select is_delayed BEFORE aggregating: summing the whole frame also sums (concatenates)
        # the string columns and raises TypeError on pandas >= 2.0 for columns that cannot be
        # summed, such as datetimes.
        perspective_time_dfs[pt_key] = (
            non_cancel_df.groupby(perspective_time)['is_delayed'].sum().reset_index()
        )

In [17]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training

# (suffix, key column, key builder), checked in order: the binned suffixes must come before
# '_day' / '_hr' because e.g. 'dep_2_hr' also ends with '_hr'
ts_key_rules = [
    ('_2_hr', 'flight_2_hour_ts', get_2_hr_ts_val),
    ('_4_hr', 'flight_4_hour_ts', get_4_hr_ts_val),
    ('_2_day', 'flight_2_day_ts', get_2_day_ts_val),
    ('_4_day', 'flight_4_day_ts', get_4_day_ts_val),
    ('_day', 'flight_day_ts', get_day_ts_val),
    ('_wk', 'flight_wk_ts', get_wk_ts_val),
    ('_hr', 'flight_ts', get_hr_ts_val),
]

for pt_key, stat_df in perspective_time_dfs.items():
    for suffix, ts_col, build_ts in ts_key_rules:
        if pt_key.endswith(suffix):
            stat_df[ts_col] = stat_df.apply(build_ts, axis=1)
            break

    # Rename column for join operation
    stat_df.rename(columns={'is_delayed': pt_key + '_delay_count'}, inplace=True)

In [18]:
# Keep the delay-count frames under their own name; dict.copy() is shallow, so the DataFrames are shared, not duplicated
delay_count_dfs = perspective_time_dfs.copy()

# Cancelled status statistics

In [19]:
# For the cancellation statistics, keep ONLY the cancelled entries
cancel_df = data_df.loc[data_df['delay_time'] == "Cancelled"].copy()

# Flag column to be summed per group; it is True for every row kept by the filter above
cancel_df['is_cancel'] = cancel_df['delay_time'] == "Cancelled"

# Note: groupby preserves the order of rows within each group
cancel_df = cancel_df.sort_values(
    by=['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],
    ascending=False,
)

In [20]:
# Creation of different stat dfs: number of cancelled flights per (perspective, time bin) pair
perspective_time_dfs = {}
for p_key, p in perspectives.items():
    for t_key, t in time_bins.items():
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        # Select is_cancel BEFORE aggregating: summing the whole frame also sums (concatenates)
        # the string columns and raises TypeError on pandas >= 2.0 for columns that cannot be
        # summed, such as datetimes.
        perspective_time_dfs[pt_key] = (
            cancel_df.groupby(perspective_time)['is_cancel'].sum().reset_index()
        )

In [21]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training

# (suffix, key column, key builder), checked in order: the binned suffixes must come before
# '_day' / '_hr' because e.g. 'dep_2_hr' also ends with '_hr'
ts_key_rules = [
    ('_2_hr', 'flight_2_hour_ts', get_2_hr_ts_val),
    ('_4_hr', 'flight_4_hour_ts', get_4_hr_ts_val),
    ('_2_day', 'flight_2_day_ts', get_2_day_ts_val),
    ('_4_day', 'flight_4_day_ts', get_4_day_ts_val),
    ('_day', 'flight_day_ts', get_day_ts_val),
    ('_wk', 'flight_wk_ts', get_wk_ts_val),
    ('_hr', 'flight_ts', get_hr_ts_val),
]

for pt_key, stat_df in perspective_time_dfs.items():
    for suffix, ts_col, build_ts in ts_key_rules:
        if pt_key.endswith(suffix):
            stat_df[ts_col] = stat_df.apply(build_ts, axis=1)
            break

    # Rename column for join operation
    stat_df.rename(columns={'is_cancel': pt_key + '_cancel_count'}, inplace=True)

In [22]:
# Keep the cancel-count frames under their own name; dict.copy() is shallow, so the DataFrames are shared, not duplicated
cancel_count_dfs = perspective_time_dfs.copy()

Merging the statistics into the original dataset ...

In [46]:
# Work on a copy so that the lagged key columns added to feature_df do not end up in data_df
feature_df = data_df.copy()

In [47]:
# Use lagged time series value for below columns creation
# Thus, the stat values should be available upon actual prediction
def get_last_time_series_val(row):
    """Return the hourly key YYYYMMDDHH (int) of the hour preceding the row's flight_dt."""
    previous_hour = row['flight_dt'] - timedelta(hours=1)
    # Same digits as strftime("%Y%m%d%H"), assembled arithmetically
    return (
        previous_hour.year * 10 ** 6
        + previous_hour.month * 10 ** 4
        + previous_hour.day * 10 ** 2
        + previous_hour.hour
    )

# Lagged hourly key: matches the 'flight_ts' key of the hourly stat frames for the previous hour
feature_df['flight_ts'] = feature_df.apply(get_last_time_series_val, axis=1)

In [48]:
def get_last_day_time_series_val(row):
    """Return the key YYYYMMDDBB of the 2-hour bin preceding the row's flight_2_hour_bin.

    Note: this function name is redefined by later cells of the notebook, so the
    name always refers to whichever definition ran last.
    """
    previous_bin = row['flight_2_hour_bin'] - 1
    reference_dt = row['flight_dt']
    if previous_bin < 0:
        # First bin of the day: wrap to the last 2-hour bin (11) of the previous day
        reference_dt = reference_dt - timedelta(days=1)
        previous_bin = 11
    return int(reference_dt.strftime("%Y%m%d")) * 100 + previous_bin


# Lagged 2-hour key: matches the 'flight_2_hour_ts' key of the 2-hour stat frames for the previous bin
feature_df['flight_2_hour_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [49]:
def get_last_day_time_series_val(row):
    """Return the key YYYYMMDDBB of the 4-hour bin preceding the row's flight_4_hour_bin.

    Note: this function name is defined several times in the notebook, so the
    name always refers to whichever definition ran last.
    """
    previous_bin = row['flight_4_hour_bin'] - 1
    reference_dt = row['flight_dt']
    if previous_bin < 0:
        # First bin of the day: wrap to the last 4-hour bin (5) of the previous day
        reference_dt = reference_dt - timedelta(days=1)
        previous_bin = 5
    return int(reference_dt.strftime("%Y%m%d")) * 100 + previous_bin


# Lagged 4-hour key: matches the 'flight_4_hour_ts' key of the 4-hour stat frames for the previous bin
feature_df['flight_4_hour_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [50]:
# Do the same for day/week
def get_last_day_time_series_val(row):
    """Return the daily key YYYYMMDD (int) of the day before the row's flight_dt.

    Note: this function name is defined several times in the notebook, so the
    name always refers to whichever definition ran last.
    """
    previous_day = row['flight_dt'] - timedelta(days=1)
    # Same digits as strftime("%Y%m%d"), assembled arithmetically
    return previous_day.year * 10 ** 4 + previous_day.month * 10 ** 2 + previous_day.day

# Lagged daily key: matches the 'flight_day_ts' key of the daily stat frames for the previous day
feature_df['flight_day_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [51]:
def get_last_day_time_series_val(row):
    """Return the key YYYYMMBB of the 2-day bin preceding the row's flight_2_day_bin.

    Note: this function name is defined several times in the notebook, so the
    name always refers to whichever definition ran last.
    """
    previous_bin = row['flight_2_day_bin'] - 1
    anchor_dt = row['flight_dt']
    if previous_bin < 0:
        # Stepping back 20 days from the first bin of a month lands in the previous month;
        # that month's last 2-day bin is (number of days in the month) // 2
        anchor_dt = anchor_dt - timedelta(days=20)
        days_in_month = calendar.monthrange(anchor_dt.year, anchor_dt.month)[1]
        previous_bin = days_in_month // 2
    return int(anchor_dt.strftime("%Y%m")) * 100 + previous_bin

# Lagged 2-day key: matches the 'flight_2_day_ts' key of the 2-day stat frames for the previous bin
feature_df['flight_2_day_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [52]:
def get_last_day_time_series_val(row):
    """Return the key YYYYMMBB of the 4-day bin preceding the row's flight_4_day_bin.

    Note: this function name is defined several times in the notebook, so the
    name always refers to whichever definition ran last.
    """
    previous_bin = row['flight_4_day_bin'] - 1
    anchor_dt = row['flight_dt']
    if previous_bin < 0:
        # Stepping back 20 days from the first bin of a month lands in the previous month;
        # that month's last 4-day bin is (number of days in the month) // 4
        anchor_dt = anchor_dt - timedelta(days=20)
        days_in_month = calendar.monthrange(anchor_dt.year, anchor_dt.month)[1]
        previous_bin = days_in_month // 4
    return int(anchor_dt.strftime("%Y%m")) * 100 + previous_bin

# Lagged 4-day key: matches the 'flight_4_day_ts' key of the 4-day stat frames for the previous bin
feature_df['flight_4_day_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [53]:
def get_number_of_weeks_in_year(year):
    """Return the highest ``Week`` value that occurs in ``year`` under the dataset's week numbering.

    The ``Week`` column is not the ISO week: every sample row shown in this notebook
    satisfies Week == ceil(day_of_year / 7) (e.g. 2016-07-01 has Week 27, while its ISO
    week is 26). The previous ISO-based count (52 for most years) therefore pointed the
    year wrap-around at the wrong bin.
    NOTE(review): the numbering is inferred from the sample rows only; confirm against
    how the dataset's Week column was produced.
    """
    days_in_year = datetime(year, 12, 31).timetuple().tm_yday
    # Week of the last day of the year: ceil(day_of_year / 7) written with integer arithmetic
    return (days_in_year - 1) // 7 + 1

def get_last_wk_time_series_val(row):
    """Return the weekly key YYYYWW of the week preceding the row's (flight_year, Week).

    Parameters
    ----------
    row : mapping-like with the keys ``'flight_year'`` and ``'Week'``.

    Returns
    -------
    ``flight_year * 100 + previous_week``; for week 1 the key wraps to the last
    week of the previous year.
    """
    flight_year = row['flight_year']
    flight_week = row['Week'] - 1

    if flight_week < 1:
        # Shift to last year end
        flight_year -= 1
        flight_week = get_number_of_weeks_in_year(flight_year)

    return flight_year * (10 ** 2) + flight_week

# Lagged weekly key: matches the 'flight_wk_ts' key of the weekly stat frames for the previous week
feature_df['flight_wk_ts'] = feature_df.apply(get_last_wk_time_series_val, axis=1)

In [54]:
# Start the merged table from the flights plus their lagged keys; the statistic columns are joined onto this copy
merged_feature_df = feature_df.copy()
merged_feature_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,...,flight_4_day_bin,flight_2_hour_bin,flight_4_hour_bin,flight_ts,flight_2_hour_ts,flight_4_hour_ts,flight_day_ts,flight_2_day_ts,flight_4_day_ts,flight_wk_ts
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,...,0,5,2,2016070109,2016070104,2016070101,20160630,20160615,20160607,201626
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,...,5,5,2,2015042310,2015042304,2015042301,20150422,20150410,20150404,201516
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,...,2,5,2,2014040810,2014040804,2014040801,20140407,20140403,20140401,201413
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,...,3,1,0,2013091502,2013091500,2013091405,20130914,20130906,20130902,201336
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,...,1,4,2,2015100508,2015100503,2015100501,20151004,20151001,20151000,201539
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,...,2,11,5,2015031022,2015031010,2015031004,20150309,20150304,20150301,201509
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,...,4,3,1,2015121906,2015121902,2015121900,20151218,20151208,20151203,201550
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,...,5,6,3,2014022311,2014022305,2014022302,20140222,20140210,20140204,201407
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,...,5,10,5,2014102120,2014102109,2014102104,20141020,20141009,20141004,201441
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,...,1,0,0,2015030423,2015030411,2015030405,20150304,20150301,20150300,201509


In [55]:
# Merge generated stat to original dataset
# The merge operation is based on last hr/day/wk from current row's datetime, thus it is assumed those statistics can be calculated for new predictions

# Lagged key column that goes with each time bin
ts_col_by_bin = {'hr': 'flight_ts',
                 '2_hr': 'flight_2_hour_ts',
                 '4_hr': 'flight_4_hour_ts',
                 'day': 'flight_day_ts',
                 '2_day': 'flight_2_day_ts',
                 '4_day': 'flight_4_day_ts',
                 'wk': 'flight_wk_ts'}

# Delay time
perspective_time_dfs = delay_time_dfs
for p_key, p in perspectives.items():
    for t_key in time_bins:
        pt_key = p_key + "_" + t_key
        ts_val = ts_col_by_bin[t_key]

        # Join on the entity column(s) plus the lagged key, bringing in only the statistic column
        perspective_time = p + [ts_val]
        to_add_col = pt_key + '_delay_time'
        to_merge_df = perspective_time_dfs[pt_key][perspective_time + [to_add_col]]
        merged_feature_df = merged_feature_df.merge(
            to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time
        )

In [56]:
# Delay only time
# Lagged key column that goes with each time bin
ts_col_by_bin = {'hr': 'flight_ts',
                 '2_hr': 'flight_2_hour_ts',
                 '4_hr': 'flight_4_hour_ts',
                 'day': 'flight_day_ts',
                 '2_day': 'flight_2_day_ts',
                 '4_day': 'flight_4_day_ts',
                 'wk': 'flight_wk_ts'}

perspective_time_dfs = delay_only_time_dfs
for p_key, p in perspectives.items():
    for t_key in time_bins:
        pt_key = p_key + "_" + t_key
        ts_val = ts_col_by_bin[t_key]

        # Join on the entity column(s) plus the lagged key, bringing in only the statistic column
        perspective_time = p + [ts_val]
        to_add_col = pt_key + '_delay_only_time'
        to_merge_df = perspective_time_dfs[pt_key][perspective_time + [to_add_col]]
        merged_feature_df = merged_feature_df.merge(
            to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time
        )

In [57]:
# Delay count
# Lagged key column that goes with each time bin
ts_col_by_bin = {'hr': 'flight_ts',
                 '2_hr': 'flight_2_hour_ts',
                 '4_hr': 'flight_4_hour_ts',
                 'day': 'flight_day_ts',
                 '2_day': 'flight_2_day_ts',
                 '4_day': 'flight_4_day_ts',
                 'wk': 'flight_wk_ts'}

perspective_time_dfs = delay_count_dfs
for p_key, p in perspectives.items():
    for t_key in time_bins:
        pt_key = p_key + "_" + t_key
        ts_val = ts_col_by_bin[t_key]

        # Join on the entity column(s) plus the lagged key, bringing in only the statistic column
        perspective_time = p + [ts_val]
        to_add_col = pt_key + '_delay_count'
        to_merge_df = perspective_time_dfs[pt_key][perspective_time + [to_add_col]]
        merged_feature_df = merged_feature_df.merge(
            to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time
        )

In [58]:
# Cancel count
# Lagged key column that goes with each time bin
ts_col_by_bin = {'hr': 'flight_ts',
                 '2_hr': 'flight_2_hour_ts',
                 '4_hr': 'flight_4_hour_ts',
                 'day': 'flight_day_ts',
                 '2_day': 'flight_2_day_ts',
                 '4_day': 'flight_4_day_ts',
                 'wk': 'flight_wk_ts'}

perspective_time_dfs = cancel_count_dfs
for p_key, p in perspectives.items():
    for t_key in time_bins:
        pt_key = p_key + "_" + t_key
        ts_val = ts_col_by_bin[t_key]

        # Join on the entity column(s) plus the lagged key, bringing in only the statistic column
        perspective_time = p + [ts_val]
        to_add_col = pt_key + '_cancel_count'
        to_merge_df = perspective_time_dfs[pt_key][perspective_time + [to_add_col]]
        merged_feature_df = merged_feature_df.merge(
            to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time
        )

In [38]:
merged_feature_df.columns

Index(['flight_id', 'flight_no', 'Week', 'Departure', 'Arrival', 'Airline',
       'std_hour', 'delay_time', 'flight_date', 'is_claim',
       ...
       'air_2_day_cancel_count', 'air_4_day_cancel_count',
       'air_wk_cancel_count', 'arr_air_hr_cancel_count',
       'arr_air_2_hr_cancel_count', 'arr_air_4_hr_cancel_count',
       'arr_air_day_cancel_count', 'arr_air_2_day_cancel_count',
       'arr_air_4_day_cancel_count', 'arr_air_wk_cancel_count'],
      dtype='object', length=142)

In [60]:
# A missing delay/cancel count means the left join found no matching row in the lagged bin,
# so treat it as 0 (i.e. no delayed/cancelled flights)
count_cols = [name for name in merged_feature_df.columns if name.endswith('_count')]

merged_feature_df[count_cols] = merged_feature_df[count_cols].fillna(0)

In [61]:
# Show final result, and save the transformed dataframe
merged_feature_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,...,air_2_day_cancel_count,air_4_day_cancel_count,air_wk_cancel_count,arr_air_hr_cancel_count,arr_air_2_hr_cancel_count,arr_air_4_hr_cancel_count,arr_air_day_cancel_count,arr_air_2_day_cancel_count,arr_air_4_day_cancel_count,arr_air_wk_cancel_count
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,...,4.0,7.0,15.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,...,14.0,16.0,32.0,0.0,0.0,0.0,5.0,1.0,1.0,0.0
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,...,5.0,15.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,...,3.0,5.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,...,4.0,4.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Save the transformed feature table next to the raw dataset
# NOTE(review): index=False is not passed, so the DataFrame index is written as an extra unnamed first column; confirm downstream readers expect it
merged_feature_df.to_csv('../datasets/flight_delays_data_transformed_new.csv')