Upon data study, this notebook will create new columns from flight delay claim dataset, that could be used as features in the prediction model.

In [1]:
# Used libraries
import pandas as pd
from datetime import datetime, timedelta

In [2]:
# Load the dataset
data_df = pd.read_csv('../datasets/flight_delays_data.csv')

# Check data size
data_df.shape

(899114, 10)

In [3]:
# Show some sample data
data_df.head()

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0


From the study, we are going to create different types of statistics that help with the prediction:

# Delay hours statistics

From the study, we are going to create statistics of delay hours from different perspectives:
- Departure
- Arrival
- Airline
- Arrival + Airline

Below are not tested because of the fixed Departure airport:
- Departure + Arrival
- Departure + Airline
- Departure + Arrival + Airline

For each perspective, the average delay_time of per (hour/day/week) is computed. In addition, counts of delay/cancelled flights are created per different times.

In [4]:
# Get flight datetime-related columns
data_df['flight_date_dt'] = data_df['flight_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
data_df['flight_dt'] = data_df.apply(lambda x: x['flight_date_dt'] + timedelta(hours=x['std_hour']), axis=1)
data_df['flight_year'] = data_df['flight_dt'].apply(lambda x: x.year)
data_df['flight_month'] = data_df['flight_dt'].apply(lambda x: x.month)
data_df['flight_day'] = data_df['flight_dt'].apply(lambda x: x.day)
data_df['flight_2_hour_bin'] = data_df['std_hour'].apply(lambda x: x // 2)
data_df['flight_4_hour_bin'] = data_df['std_hour'].apply(lambda x: x // 4)

In [5]:
data_df.sample(10)

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,flight_date_dt,flight_dt,flight_year,flight_month,flight_day,flight_2_hour_bin,flight_4_hour_bin
698783,2171914,EK383,52,HKG,DXB,EK,17,0.3,2013-12-30,0,2013-12-30,2013-12-30 17:00:00,2013,12,30,8,4
187244,580774,S7546,38,HKG,VVO,S7,16,0.0,2013-09-20,0,2013-09-20,2013-09-20 16:00:00,2013,9,20,8,4
487812,1517509,CX466,26,HKG,TPE,CX,13,-0.1,2015-06-29,0,2015-06-29,2015-06-29 13:00:00,2015,6,29,6,3
535576,1666592,EY7121,3,HKG,CGK,EY,17,0.3,2014-01-16,0,2014-01-16,2014-01-16 17:00:00,2014,1,16,8,4
450150,1400199,CI922,26,HKG,TPE,CI,8,-0.1,2014-06-29,0,2014-06-29,2014-06-29 08:00:00,2014,6,29,4,2
33163,102534,MU8519,20,HKG,HGH,MU,16,1.5,2014-05-17,0,2014-05-17,2014-05-17 16:00:00,2014,5,17,8,4
551323,1715535,KA786,24,HKG,CAN,KA,20,1.3,2016-06-12,0,2016-06-12,2016-06-12 20:00:00,2016,6,12,10,5
579963,1804392,KA740,33,HKG,CGO,KA,12,0.2,2014-08-15,0,2014-08-15,2014-08-15 12:00:00,2014,8,15,6,3
376159,1170533,GA863,40,HKG,CGK,GA,17,0.4,2015-10-07,0,2015-10-07,2015-10-07 17:00:00,2015,10,7,8,4
581673,1809751,QR5844,37,HKG,NRT,QR,1,0.3,2015-09-15,0,2015-09-15,2015-09-15 01:00:00,2015,9,15,0,0


In [6]:
# Before processing delay_time statistics, remove cancalled entries first
non_cancel_df = data_df[~(data_df['delay_time'] == "Cancelled")].copy()

# Translate delay_time to float
non_cancel_df['delay_time'] = non_cancel_df['delay_time'].apply(lambda x: float(x))

# Note: groupby preserves the order of rows within each group
non_cancel_df = non_cancel_df.sort_values(['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],ascending=False)

In [7]:
# Aggregate per different perspectives by different bins
perspectives = {'dep': ['Departure'],
                'arr': ['Arrival'],
                'air': ['Airline'],
                'arr_air': ['Arrival', 'Airline']}

time_bins = {'hr': ['flight_year', 'flight_month', 'flight_day', 'std_hour'],
             '2_hr': ['flight_year', 'flight_month', 'flight_day', 'flight_2_hour_bin'],
             '4_hr': ['flight_year', 'flight_month', 'flight_day', 'flight_4_hour_bin'],
             'day': ['flight_year', 'flight_month', 'flight_day'],
             'wk': ['flight_year', 'Week']}

# Creation of different stat dfs
perspective_time_dfs = {}
for p_key in perspectives:
    for t_key in time_bins:
        p = perspectives[p_key]
        t = time_bins[t_key]
        
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        perspective_time_dfs[pt_key] = non_cancel_df.groupby(perspective_time).mean()['delay_time'].reset_index()

In [8]:
# Helper functions for getting timestamp values by different bins
def get_hr_ts_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    std_hour = int(row['std_hour'])
    return flight_year * (10 ** 6) + flight_month * (10 ** 4) + flight_day * (10 ** 2) + std_hour

def get_2_hr_ts_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    flight_2_hour_bin = row['flight_2_hour_bin']
    return flight_year * (10 ** 6) + flight_month * (10 ** 4) + flight_day * (10 ** 2) + flight_2_hour_bin

def get_4_hr_ts_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    flight_4_hour_bin = row['flight_4_hour_bin']
    return flight_year * (10 ** 6) + flight_month * (10 ** 4) + flight_day * (10 ** 2) + flight_4_hour_bin

def get_day_ts_val(row):
    flight_year = row['flight_year']
    flight_month = row['flight_month']
    flight_day = row['flight_day']
    return flight_year * (10 ** 4) + flight_month * (10 ** 2) + flight_day

def get_wk_ts_val(row):
    flight_year = row['flight_year']
    flight_wk = row['Week']
    return flight_year * (10 ** 2) + flight_wk

In [9]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training
for pt_key in perspective_time_dfs:
    print(pt_key)
    if pt_key.endswith('_2_hr'):
        perspective_time_dfs[pt_key]['flight_2_hour_ts'] = perspective_time_dfs[pt_key].apply(get_2_hr_ts_val, axis=1)
    elif pt_key.endswith('_4_hr'):
        perspective_time_dfs[pt_key]['flight_4_hour_ts'] = perspective_time_dfs[pt_key].apply(get_4_hr_ts_val, axis=1)
    elif pt_key.endswith('_day'):
        perspective_time_dfs[pt_key]['flight_day_ts'] = perspective_time_dfs[pt_key].apply(get_day_ts_val, axis=1)
    elif pt_key.endswith('_wk'):
        perspective_time_dfs[pt_key]['flight_wk_ts'] = perspective_time_dfs[pt_key].apply(get_wk_ts_val, axis=1)
    elif pt_key.endswith('_hr'):
        perspective_time_dfs[pt_key]['flight_ts'] = perspective_time_dfs[pt_key].apply(get_hr_ts_val, axis=1)

    # Rename column for join operation
    perspective_time_dfs[pt_key].rename(columns={'delay_time': '_'.join([pt_key, 'delay_time'])}, inplace=True)

dep_hr
dep_2_hr
dep_4_hr
dep_day
dep_wk
arr_hr
arr_2_hr
arr_4_hr
arr_day
arr_wk
air_hr
air_2_hr
air_4_hr
air_day
air_wk
arr_air_hr
arr_air_2_hr
arr_air_4_hr
arr_air_day
arr_air_wk


In [10]:
delay_time_dfs = perspective_time_dfs.copy()

# Delay status statistics

In [11]:
# Before processing delay_time statistics, remove cancalled entries first
non_cancel_df = data_df[~(data_df['delay_time'] == "Cancelled")].copy()

# Translate delay_time to float
non_cancel_df['is_delayed'] = non_cancel_df['delay_time'].apply(lambda x: float(x) >= 3.0)

# Note: groupby preserves the order of rows within each group
non_cancel_df = non_cancel_df.sort_values(['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],ascending=False)

In [12]:
# Creation of different stat dfs
perspective_time_dfs = {}
for p_key in perspectives:
    for t_key in time_bins:
        p = perspectives[p_key]
        t = time_bins[t_key]
        
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        perspective_time_dfs[pt_key] = non_cancel_df.groupby(perspective_time).sum()['is_delayed'].reset_index()

In [13]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training
for pt_key in perspective_time_dfs:
    print(pt_key)
    if pt_key.endswith('_2_hr'):
        perspective_time_dfs[pt_key]['flight_2_hour_ts'] = perspective_time_dfs[pt_key].apply(get_2_hr_ts_val, axis=1)
    elif pt_key.endswith('_4_hr'):
        perspective_time_dfs[pt_key]['flight_4_hour_ts'] = perspective_time_dfs[pt_key].apply(get_4_hr_ts_val, axis=1)
    elif pt_key.endswith('_day'):
        perspective_time_dfs[pt_key]['flight_day_ts'] = perspective_time_dfs[pt_key].apply(get_day_ts_val, axis=1)
    elif pt_key.endswith('_wk'):
        perspective_time_dfs[pt_key]['flight_wk_ts'] = perspective_time_dfs[pt_key].apply(get_wk_ts_val, axis=1)
    elif pt_key.endswith('_hr'):
        perspective_time_dfs[pt_key]['flight_ts'] = perspective_time_dfs[pt_key].apply(get_hr_ts_val, axis=1)

    # Rename column for join operation
    perspective_time_dfs[pt_key].rename(columns={'is_delayed': '_'.join([pt_key, 'delay_count'])}, inplace=True)

dep_hr
dep_2_hr
dep_4_hr
dep_day
dep_wk
arr_hr
arr_2_hr
arr_4_hr
arr_day
arr_wk
air_hr
air_2_hr
air_4_hr
air_day
air_wk
arr_air_hr
arr_air_2_hr
arr_air_4_hr
arr_air_day
arr_air_wk


In [14]:
delay_count_dfs = perspective_time_dfs.copy()

# Cancelled status statistics

In [15]:
# Before processing delay_time statistics, remove cancalled entries first
cancel_df = data_df[(data_df['delay_time'] == "Cancelled")].copy()

# Translate delay_time to float
cancel_df['is_cancel'] = cancel_df['delay_time'].apply(lambda x: x == "Cancelled")

# Note: groupby preserves the order of rows within each group
cancel_df = cancel_df.sort_values(['Departure', 'Arrival', 'Airline', 'flight_year', 'flight_month', 'flight_day', 'std_hour'],ascending=False)

In [16]:
# Creation of different stat dfs
perspective_time_dfs = {}
for p_key in perspectives:
    for t_key in time_bins:
        p = perspectives[p_key]
        t = time_bins[t_key]
        
        pt_key = p_key + "_" + t_key
        perspective_time = p + t
        perspective_time_dfs[pt_key] = cancel_df.groupby(perspective_time).sum()['is_cancel'].reset_index()

In [17]:
# For each stat df, create a time series key such that it is easier to select "most recent" value in later stage
# This key will not become a feature in model training
for pt_key in perspective_time_dfs:
    print(pt_key)
    if pt_key.endswith('_2_hr'):
        perspective_time_dfs[pt_key]['flight_2_hour_ts'] = perspective_time_dfs[pt_key].apply(get_2_hr_ts_val, axis=1)
    elif pt_key.endswith('_4_hr'):
        perspective_time_dfs[pt_key]['flight_4_hour_ts'] = perspective_time_dfs[pt_key].apply(get_4_hr_ts_val, axis=1)
    elif pt_key.endswith('_day'):
        perspective_time_dfs[pt_key]['flight_day_ts'] = perspective_time_dfs[pt_key].apply(get_day_ts_val, axis=1)
    elif pt_key.endswith('_wk'):
        perspective_time_dfs[pt_key]['flight_wk_ts'] = perspective_time_dfs[pt_key].apply(get_wk_ts_val, axis=1)
    elif pt_key.endswith('_hr'):
        perspective_time_dfs[pt_key]['flight_ts'] = perspective_time_dfs[pt_key].apply(get_hr_ts_val, axis=1)

    # Rename column for join operation
    perspective_time_dfs[pt_key].rename(columns={'is_cancel': '_'.join([pt_key, 'cancel_count'])}, inplace=True)

dep_hr
dep_2_hr
dep_4_hr
dep_day
dep_wk
arr_hr
arr_2_hr
arr_4_hr
arr_day
arr_wk
air_hr
air_2_hr
air_4_hr
air_day
air_wk
arr_air_hr
arr_air_2_hr
arr_air_4_hr
arr_air_day
arr_air_wk


In [18]:
cancel_count_dfs = perspective_time_dfs.copy()

Merging the statistics to original dataset ...

In [19]:
def get_last_time_series_val(row):
    flight_dt = row['flight_dt'] - timedelta(hours=1)
    return int(flight_dt.strftime("%Y%m%d%H"))

In [20]:
feature_df = data_df.copy()
feature_df['flight_ts'] = feature_df.apply(get_last_time_series_val, axis=1)

In [21]:
# Do the same for day/week
def get_last_day_time_series_val(row):
    flight_dt = row['flight_dt']
    flight_hour_bin = row['flight_2_hour_bin']
    flight_hour_bin -= 1
    if flight_hour_bin < 0:
        flight_dt = row['flight_dt'] - timedelta(days=1)
    return int(flight_dt.strftime("%Y%m%d")) * 100 + flight_hour_bin


feature_df['flight_2_hour_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [22]:
def get_last_day_time_series_val(row):
    flight_dt = row['flight_dt']
    flight_hour_bin = row['flight_4_hour_bin']
    flight_hour_bin -= 1
    if flight_hour_bin < 0:
        flight_dt = row['flight_dt'] - timedelta(days=1)
    return int(flight_dt.strftime("%Y%m%d")) * 100 + flight_hour_bin


feature_df['flight_4_hour_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [23]:
def get_last_day_time_series_val(row):
    flight_dt = row['flight_dt'] - timedelta(days=1)
    return int(flight_dt.strftime("%Y%m%d"))

feature_df['flight_day_ts'] = feature_df.apply(get_last_day_time_series_val, axis=1)

In [24]:
def get_number_of_weeks_in_year(year):
    last_week = datetime(year, 12, 28)
    return last_week.isocalendar()[1]

def get_last_wk_time_series_val(row):
    flight_year = row['flight_year']
    flight_week = row['Week'] - 1
    
    if flight_week < 1:
        # Shift to last year end
        flight_year -= 1
        flight_week = get_number_of_weeks_in_year(flight_year)
        
    return flight_year * (10 ** 2) + flight_week

feature_df['flight_wk_ts'] = feature_df.apply(get_last_wk_time_series_val, axis=1)

In [25]:
merged_feature_df = feature_df.copy()
merged_feature_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,...,flight_year,flight_month,flight_day,flight_2_hour_bin,flight_4_hour_bin,flight_ts,flight_2_hour_ts,flight_4_hour_ts,flight_day_ts,flight_wk_ts
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,...,2016,7,1,5,2,2016070109,2016070104,2016070101,20160630,201626
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,...,2015,4,23,5,2,2015042310,2015042304,2015042301,20150422,201516
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,...,2014,4,8,5,2,2014040810,2014040804,2014040801,20140407,201413
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,...,2013,9,15,1,0,2013091502,2013091500,2013091399,20130914,201336
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,...,2015,10,5,4,2,2015100508,2015100503,2015100501,20151004,201539
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,...,2015,3,10,11,5,2015031022,2015031010,2015031004,20150309,201509
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,...,2015,12,19,3,1,2015121906,2015121902,2015121900,20151218,201550
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,...,2014,2,23,6,3,2014022311,2014022305,2014022302,20140222,201407
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,...,2014,10,21,10,5,2014102120,2014102109,2014102104,20141020,201441
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,...,2015,3,5,0,0,2015030423,2015030399,2015030399,20150304,201509


In [26]:
# Merge generated stat to original dataset
# The merge operation is based on last hr/day/wk from current row's datetime, thus it is assumed those statistics can be calculated for new predictions

# Delay time
perspective_time_dfs = delay_time_dfs
for p_key in perspectives:
    for t_key in time_bins:
        p = perspectives[p_key]
        t = time_bins[t_key]
        
        pt_key = p_key + "_" + t_key
        
        if pt_key.endswith('_2_hr'):
            ts_val = 'flight_2_hour_ts'
        elif pt_key.endswith('_4_hr'):
            ts_val = 'flight_4_hour_ts'
        elif pt_key.endswith('_day'):
            ts_val = 'flight_day_ts'
        elif pt_key.endswith('_wk'):
            ts_val = 'flight_wk_ts'
        elif pt_key.endswith('_hr'):
            ts_val = 'flight_ts'
        
        perspective_time = p + [ts_val]
        to_add_col = '_'.join([pt_key, 'delay_time'])
        to_merge_cols = p + [ts_val] + [to_add_col]
        to_merge_df = perspective_time_dfs[pt_key][to_merge_cols]
        merged_feature_df = merged_feature_df.merge(to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time)

In [27]:
# Delay count
perspective_time_dfs = delay_count_dfs
for p_key in perspectives:
    for t_key in time_bins:
        p = perspectives[p_key]
        t = time_bins[t_key]
        
        pt_key = p_key + "_" + t_key
        
        print(pt_key)
        if pt_key.endswith('_2_hr'):
            ts_val = 'flight_2_hour_ts'
        elif pt_key.endswith('_4_hr'):
            ts_val = 'flight_4_hour_ts'
        elif pt_key.endswith('_day'):
            ts_val = 'flight_day_ts'
        elif pt_key.endswith('_wk'):
            ts_val = 'flight_wk_ts'
        elif pt_key.endswith('_hr'):
            ts_val = 'flight_ts'
        
        perspective_time = p + [ts_val]
        to_add_col = '_'.join([pt_key, 'delay_count'])
        to_merge_cols = p + [ts_val] + [to_add_col]
        to_merge_df = perspective_time_dfs[pt_key][to_merge_cols]
        merged_feature_df = merged_feature_df.merge(to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time)

dep_hr
dep_2_hr
dep_4_hr
dep_day
dep_wk
arr_hr
arr_2_hr
arr_4_hr
arr_day
arr_wk
air_hr
air_2_hr
air_4_hr
air_day
air_wk
arr_air_hr
arr_air_2_hr
arr_air_4_hr
arr_air_day
arr_air_wk


In [28]:
# Cancel count
perspective_time_dfs = cancel_count_dfs
for p_key in perspectives:
    for t_key in time_bins:
        p = perspectives[p_key]
        t = time_bins[t_key]
        
        pt_key = p_key + "_" + t_key
        
        if pt_key.endswith('_2_hr'):
            ts_val = 'flight_2_hour_ts'
        elif pt_key.endswith('_4_hr'):
            ts_val = 'flight_4_hour_ts'
        elif pt_key.endswith('_day'):
            ts_val = 'flight_day_ts'
        elif pt_key.endswith('_wk'):
            ts_val = 'flight_wk_ts'
        elif pt_key.endswith('_hr'):
            ts_val = 'flight_ts'
        
        perspective_time = p + [ts_val]
        to_add_col = '_'.join([pt_key, 'cancel_count'])
        to_merge_cols = p + [ts_val] + [to_add_col]
        to_merge_df = perspective_time_dfs[pt_key][to_merge_cols]
        merged_feature_df = merged_feature_df.merge(to_merge_df, how='left', left_on=perspective_time, right_on=perspective_time)

In [29]:
merged_feature_df.columns

Index(['flight_id', 'flight_no', 'Week', 'Departure', 'Arrival', 'Airline',
       'std_hour', 'delay_time', 'flight_date', 'is_claim', 'flight_date_dt',
       'flight_dt', 'flight_year', 'flight_month', 'flight_day',
       'flight_2_hour_bin', 'flight_4_hour_bin', 'flight_ts',
       'flight_2_hour_ts', 'flight_4_hour_ts', 'flight_day_ts', 'flight_wk_ts',
       'dep_hr_delay_time', 'dep_2_hr_delay_time', 'dep_4_hr_delay_time',
       'dep_day_delay_time', 'dep_wk_delay_time', 'arr_hr_delay_time',
       'arr_2_hr_delay_time', 'arr_4_hr_delay_time', 'arr_day_delay_time',
       'arr_wk_delay_time', 'air_hr_delay_time', 'air_2_hr_delay_time',
       'air_4_hr_delay_time', 'air_day_delay_time', 'air_wk_delay_time',
       'arr_air_hr_delay_time', 'arr_air_2_hr_delay_time',
       'arr_air_4_hr_delay_time', 'arr_air_day_delay_time',
       'arr_air_wk_delay_time', 'dep_hr_delay_count', 'dep_2_hr_delay_count',
       'dep_4_hr_delay_count', 'dep_day_delay_count', 'dep_wk_delay_count',
 

In [30]:
merged_feature_df[['dep_hr_delay_count', 'dep_2_hr_delay_count',
       'dep_4_hr_delay_count', 'dep_day_delay_count', 'dep_wk_delay_count',
       'arr_hr_delay_count', 'arr_2_hr_delay_count', 'arr_4_hr_delay_count',
       'arr_day_delay_count', 'arr_wk_delay_count', 'air_hr_delay_count',
       'air_2_hr_delay_count', 'air_4_hr_delay_count', 'air_day_delay_count',
       'air_wk_delay_count', 'arr_air_hr_delay_count',
       'arr_air_2_hr_delay_count', 'arr_air_4_hr_delay_count',
       'arr_air_day_delay_count', 'arr_air_wk_delay_count',
       'dep_hr_cancel_count', 'dep_2_hr_cancel_count', 'dep_4_hr_cancel_count',
       'dep_day_cancel_count', 'dep_wk_cancel_count', 'arr_hr_cancel_count',
       'arr_2_hr_cancel_count', 'arr_4_hr_cancel_count',
       'arr_day_cancel_count', 'arr_wk_cancel_count', 'air_hr_cancel_count',
       'air_2_hr_cancel_count', 'air_4_hr_cancel_count',
       'air_day_cancel_count', 'air_wk_cancel_count',
       'arr_air_hr_cancel_count', 'arr_air_2_hr_cancel_count',
       'arr_air_4_hr_cancel_count', 'arr_air_day_cancel_count',
       'arr_air_wk_cancel_count']] = merged_feature_df[['dep_hr_delay_count', 'dep_2_hr_delay_count',
       'dep_4_hr_delay_count', 'dep_day_delay_count', 'dep_wk_delay_count',
       'arr_hr_delay_count', 'arr_2_hr_delay_count', 'arr_4_hr_delay_count',
       'arr_day_delay_count', 'arr_wk_delay_count', 'air_hr_delay_count',
       'air_2_hr_delay_count', 'air_4_hr_delay_count', 'air_day_delay_count',
       'air_wk_delay_count', 'arr_air_hr_delay_count',
       'arr_air_2_hr_delay_count', 'arr_air_4_hr_delay_count',
       'arr_air_day_delay_count', 'arr_air_wk_delay_count',
       'dep_hr_cancel_count', 'dep_2_hr_cancel_count', 'dep_4_hr_cancel_count',
       'dep_day_cancel_count', 'dep_wk_cancel_count', 'arr_hr_cancel_count',
       'arr_2_hr_cancel_count', 'arr_4_hr_cancel_count',
       'arr_day_cancel_count', 'arr_wk_cancel_count', 'air_hr_cancel_count',
       'air_2_hr_cancel_count', 'air_4_hr_cancel_count',
       'air_day_cancel_count', 'air_wk_cancel_count',
       'arr_air_hr_cancel_count', 'arr_air_2_hr_cancel_count',
       'arr_air_4_hr_cancel_count', 'arr_air_day_cancel_count',
       'arr_air_wk_cancel_count']].fillna(0)

In [31]:
merged_feature_df['is_claim'].value_counts()

0      859701
800     39413
Name: is_claim, dtype: int64

In [32]:
merged_feature_df.dropna()['is_claim'].value_counts()

0      73249
800     4178
Name: is_claim, dtype: int64

In [33]:
merged_feature_df

Unnamed: 0,flight_id,flight_no,Week,Departure,Arrival,Airline,std_hour,delay_time,flight_date,is_claim,...,air_hr_cancel_count,air_2_hr_cancel_count,air_4_hr_cancel_count,air_day_cancel_count,air_wk_cancel_count,arr_air_hr_cancel_count,arr_air_2_hr_cancel_count,arr_air_4_hr_cancel_count,arr_air_day_cancel_count,arr_air_wk_cancel_count
0,1582499,UO686,27,HKG,KIX,UO,10,0.4,2016-07-01,0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0
1,1582501,CI7868,17,HKG,TNN,CI,11,0.5,2015-04-23,0,...,0.0,1.0,0.0,4.0,15.0,0.0,0.0,0.0,1.0,2.0
2,1582504,PR301,14,HKG,MNL,PR,11,0.0,2014-04-08,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1582508,LD327,37,HKG,SIN,LD,3,0.1,2013-09-15,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1582509,KA5390,40,HKG,PEK,KA,9,0.5,2015-10-05,0,...,1.0,0.0,0.0,23.0,32.0,0.0,0.0,0.0,5.0,0.0
5,1582511,NZ4851,10,HKG,IST,NZ,23,0.2,2015-03-10,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1582512,CX5626,51,HKG,HGH,CX,7,0.4,2015-12-19,0,...,0.0,0.0,0.0,2.0,23.0,0.0,0.0,0.0,0.0,0.0
7,1582513,MH9725,8,HKG,KUL,MH,12,0.1,2014-02-23,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1582516,KA154,42,HKG,BLR,KA,21,0.0,2014-10-21,0,...,0.0,0.0,0.0,2.0,11.0,0.0,0.0,0.0,0.0,0.0
9,1582517,CX233,10,HKG,MXP,CX,0,0.3,2015-03-05,0,...,0.0,0.0,0.0,5.0,11.0,0.0,0.0,0.0,0.0,0.0


In [34]:
merged_feature_df[merged_feature_df['flight_id'] == 1582511].to_dict()

{'flight_id': {5: 1582511},
 'flight_no': {5: 'NZ4851'},
 'Week': {5: 10},
 'Departure': {5: 'HKG'},
 'Arrival': {5: 'IST'},
 'Airline': {5: 'NZ'},
 'std_hour': {5: 23},
 'delay_time': {5: '0.2'},
 'flight_date': {5: '2015-03-10'},
 'is_claim': {5: 0},
 'flight_date_dt': {5: Timestamp('2015-03-10 00:00:00')},
 'flight_dt': {5: Timestamp('2015-03-10 23:00:00')},
 'flight_year': {5: 2015},
 'flight_month': {5: 3},
 'flight_day': {5: 10},
 'flight_2_hour_bin': {5: 11},
 'flight_4_hour_bin': {5: 5},
 'flight_ts': {5: 2015031022},
 'flight_2_hour_ts': {5: 2015031010},
 'flight_4_hour_ts': {5: 2015031004},
 'flight_day_ts': {5: 20150309},
 'flight_wk_ts': {5: 201509},
 'dep_hr_delay_time': {5: 0.45},
 'dep_2_hr_delay_time': {5: 0.31979166666666653},
 'dep_4_hr_delay_time': {5: 0.14215686274509798},
 'dep_day_delay_time': {5: 0.16453900709219832},
 'dep_wk_delay_time': {5: 0.3085131292858329},
 'arr_hr_delay_time': {5: nan},
 'arr_2_hr_delay_time': {5: nan},
 'arr_4_hr_delay_time': {5: nan},


In [35]:
test_df = delay_time_dfs['dep_4_hr']
test_df[test_df['flight_4_hour_ts'] == 2015031004]

Unnamed: 0,Departure,flight_year,flight_month,flight_day,flight_4_hour_bin,dep_4_hr_delay_time,flight_4_hour_ts
3333,HKG,2015,3,10,4,0.142157,2015031004


In [40]:
merged_feature_df.to_csv('../datasets/flight_delays_data_transformed.csv')