## Flight Delay Claims Prediction – Model Prediction

This notebook contains the source code of the model prediction process of the Flight Delay Claims Prediction project.

### Import packages

In [None]:
import requests

import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from joblib import dump, load

### Configure variables

Before running the notebook, please configure the source CSV file path and the output CSV file path.

In [None]:
# Path of the CSV file with the flights to score; it is read with pandas further down.
source_csv_path = 'REPLACE THIS WITH SOURCE CSV PATH'
# Path the CSV with the added prediction columns is written to.
output_csv_path = 'REPLACE THIS WITH OUTPUT CSV PATH'

In [None]:
# Directory holding the joblib files (scaler and categorical encoding dictionaries)
# that are loaded when the dataset is transformed.
helper_files_path = '../models/helper_files'
# Saved state dict that is loaded into the network before prediction.
best_model_path = '../models/model_flights_r2_37.pth'

### Load data

The CSV file is first loaded as a pandas data frame, which allows it to be read and manipulated easily.

In [None]:
# Read the source CSV, parsing the flight_date column as datetimes.
flights_original = pd.read_csv(source_csv_path, parse_dates=['flight_date'])
# Work on a copy so the frame that is written out at the end keeps the original columns.
flights = flights_original.copy()

### Data cleaning

The reasoning behind the following data cleaning steps is covered in the EDA notebook.

#### Fill in missing airline

In [None]:
# Rows without an airline take the first two characters of their flight number.
_airline_missing = flights['Airline'].isnull()
flights.loc[_airline_missing, 'Airline'] = flights.loc[_airline_missing, 'flight_no'].str[:2]

### Feature engineering

The reasoning behind the following feature engineering steps is covered in the EDA notebook.

#### Year, month, day, and day of week

In [None]:
# Derive the calendar features from the flight date. The columns are appended
# in the same order as before: year, month, day, day of week.
_flight_dt = flights['flight_date'].dt
flights['flight_date_year'] = _flight_dt.year
flights['flight_date_month'] = _flight_dt.month
flights['flight_date_day'] = _flight_dt.day
flights['flight_date_dow'] = _flight_dt.dayofweek

# The week number and the calendar features are handled as categorical variables.
for _col in ['Week', 'flight_date_year', 'flight_date_month', 'flight_date_day', 'flight_date_dow']:
    flights[_col] = flights[_col].astype('category')

#### Hong Kong public holidays

In [None]:
def get_hk_holiday_data(years):
    """
    Download the Hong Kong public holiday dates for the given years.

    One page per year is requested from gov.hk. The first HTML table of the
    page is parsed (skipping its first row) and the values of its second
    column, suffixed with the year, are parsed as dates.

    Args:
        years: Iterable of calendar years to fetch.

    Returns:
        A list with the parsed holiday timestamps of all requested years.

    Raises:
        requests.HTTPError: If a holiday page answers with an error status.
        requests.Timeout: If a holiday page does not answer in time.
    """
    # Local import so the function does not depend on a file-level import.
    from io import StringIO

    public_holidays = []

    for year in years:
        holiday_url = f'https://www.gov.hk/en/about/abouthk/holiday/{year}.htm'
        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(holiday_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        # Fail here with a clear HTTP error instead of a confusing parsing error below.
        r.raise_for_status()
        # Passing literal HTML to read_html is deprecated; wrap it in a file-like object.
        holiday_table = pd.read_html(StringIO(r.text), skiprows=1)[0]
        holiday_dates = holiday_table[1].apply(lambda x: f'{x} {year}')
        # infer_datetime_format is deprecated; pandas infers the format by default.
        holiday_dates = pd.to_datetime(holiday_dates)
        public_holidays.extend(holiday_dates)

    return public_holidays

In [None]:
# Distinct flight years; passed to the holiday and weather download helpers below.
flight_years = flights.flight_date_year.unique()

In [None]:
# Flag rows whose flight_date is one of the downloaded public holidays (1) or not (0).
public_holidays = get_hk_holiday_data(flight_years)
flights['is_public_holiday'] = flights['flight_date'].isin(public_holidays).astype(int)

#### Hong Kong weather data

In [None]:
def get_weather_data(years):
    """
    Download daily Hong Kong Observatory weather readings for the given years.

    One JSON document per year is requested from hko.gov.hk and flattened
    into one row per day.

    Args:
        years: Iterable of calendar years to fetch.

    Returns:
        A DataFrame with the columns flight_date, mean_pressure, mean_temp,
        mean_dew_point, mean_humidity, mean_cloud and total_rainfall.

    Raises:
        requests.HTTPError: If the weather service answers with an error status.
        requests.Timeout: If the weather service does not answer in time.
        ValueError: If a reading cannot be converted to a float.
    """
    weather_list = []

    for year in years:
        weather_url = f'https://www.hko.gov.hk/cis/dailyExtract/dailyExtract_{year}'
        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(weather_url, params={'y': year}, timeout=30)
        # Fail here with a clear HTTP error instead of a confusing JSON decoding error.
        r.raise_for_status()
        weather_data = r.json()['stn']['data']

        for elem_month in weather_data:
            month = elem_month['month']
            # The last two entries of each month are skipped - presumably
            # monthly summary rows rather than daily readings; confirm against the feed.
            day_data = elem_month['dayData'][:-2]
            for elem_day in day_data:
                day = elem_day[0]
                # 'Trace' marks a non-numeric rainfall reading and is counted as 0.0.
                total_rainfall = float(elem_day[8]) if elem_day[8] != 'Trace' else 0.0
                weather_list.append({
                    'flight_date': pd.to_datetime(f'{year}-{month:02}-{day}'),
                    'mean_pressure': float(elem_day[1]),
                    'mean_temp': float(elem_day[3]),
                    'mean_dew_point': float(elem_day[5]),
                    'mean_humidity': float(elem_day[6]),
                    'mean_cloud': float(elem_day[7]),
                    'total_rainfall': total_rainfall,
                })

    weather_df = pd.DataFrame(weather_list)
    return weather_df

In [None]:
# Attach the daily weather readings to every flight; the left join keeps all flights.
weather_df = get_weather_data(flight_years)
flights = flights.merge(weather_df, how='left', on='flight_date')

#### Clean up

In [None]:
# The departure hour is handled as a categorical variable
flights['std_hour'] = flights['std_hour'].astype('category')

# flight_date and flight_id are not used as model features
flights = flights.drop(['flight_date', 'flight_id'], axis=1)

In [None]:
# Split the columns by dtype: non-numeric columns are categorical features,
# numeric columns are numerical features. Positions refer to the current column order.
_non_numeric_cols = flights.select_dtypes(exclude=[np.number]).columns
_numeric_cols = flights.select_dtypes(include=[np.number]).columns
cat_cols_idx = [flights.columns.get_loc(name) for name in _non_numeric_cols]
num_cols_idx = [flights.columns.get_loc(name) for name in _numeric_cols]

### Modeling

In [None]:
# Convert the feature frame to a NumPy array so columns can be selected by position.
X = np.array(flights)

In [None]:
def encode_cat_variables(x, help_dict = None):
    """
    Encodes a categorical variable as integer codes.
    The index 0 is left for values not in training.

    Args:
        x: One-dimensional array of category values.
        help_dict: Optional mapping from category value to its code. When
            omitted, a mapping is built from the sorted unique values of x,
            starting at code 1.

    Returns:
        A tuple (x_t, help_dict, levels): the encoded array, the mapping
        that was used, and the number of codes including the reserved 0.
    """
    # Only scan x for its unique values when a mapping has to be built. With a
    # given mapping the scan is wasted work, and np.unique cannot sort object
    # arrays that mix strings with NaN.
    if help_dict is None:
        help_dict = {v: k + 1 for k, v in enumerate(np.unique(x))}
    levels = len(help_dict) + 1
    # Values missing from the mapping fall back to the reserved code 0.
    x_t = np.array([help_dict.get(x_i, 0) for x_i in x])
    return x_t, help_dict, levels

In [None]:
def transform_dataset(X, cat_ind, num_ind):
    """
    Scale the numerical columns and integer-encode the categorical columns
    of X with the helper objects stored under helper_files_path.

    Returns ((X_cat, X_num), level_arr, scaler, help_dict), where level_arr
    holds the number of codes of every categorical column.
    """
    # Numerical part: apply the stored scaler
    scaler = load(f'{helper_files_path}/scaler.joblib')
    X_num = scaler.transform(X[:, num_ind])

    # Categorical part: apply the stored value -> code dictionaries column by column
    help_dict = load(f'{helper_files_path}/help_dict.joblib')
    X_cat = X[:, cat_ind]
    level_arr = []
    for col in range(X_cat.shape[1]):
        mapping = help_dict[col]
        # One extra level for the code 0
        level_arr.append(len(mapping.keys()) + 1)
        encoded, _, _ = encode_cat_variables(X_cat[:, col], mapping)
        X_cat[:, col] = encoded

    X_cat = np.array(X_cat).astype(int)

    return (X_cat, X_num), level_arr, scaler, help_dict

In [None]:
# Encode the features; the returned scaler and encoding dictionaries are not needed here.
(X_cat, X_num), level_arr, _, _ = transform_dataset(X, cat_cols_idx, num_cols_idx)

#### Dataset and Dataloader

In [None]:
class TabularDataSet(Dataset):
    """
    Dataset that serves the categorical and the numerical features
    of a tabular sample side by side.
    """
    def __init__(self, X_cat, X_num):
        # Both containers are stored as given and indexed in parallel.
        self.X_cat, self.X_num = X_cat, X_num

    def __len__(self):
        # The categorical container defines the number of samples.
        return len(self.X_cat)

    def __getitem__(self, index):
        cat_row = self.X_cat[index]
        num_row = self.X_num[index]
        return cat_row, num_row

In [None]:
# Number of rows passed to the model per batch.
batch_size = 10000
# Wrap the encoded arrays in a Dataset and batch them; shuffling is not requested,
# so batches follow the row order of the input.
X_ds = TabularDataSet(X_cat, X_num)
X_dl = DataLoader(X_ds, batch_size=batch_size)

#### Model definition

In [None]:
class TabularNet(nn.Module):
    """
    Fully connected network with two hidden layers for tabular data.

    Every categorical feature is mapped to a 2-dimensional embedding; the
    embeddings are concatenated with the numerical features and passed
    through the hidden layers to a single output value.
    """
    def __init__(self, num_cont, num_cat, level_arr, hidden_dim=1000, hidden_dim2=1000):
        super().__init__()
        # Each categorical feature contributes 2 embedding values to the input.
        in_dim = num_cont + 2 * num_cat
        self.embs = nn.ModuleList([nn.Embedding(n_levels, 2) for n_levels in level_arr])
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.linear1 = nn.Linear(in_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim2)
        self.linear3 = nn.Linear(hidden_dim2, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x_cat, x_cont):
        # Look up the embedding of every categorical column and join them.
        embedded = [self.embs[i](column) for i, column in enumerate(x_cat.unbind(dim=1))]
        embedded = self.dropout(torch.cat(embedded, dim=1))
        features = torch.cat([x_cont, embedded], dim=1)
        # Hidden layer 1: linear -> ReLU -> batch norm -> dropout
        hidden = F.relu(self.linear1(features))
        hidden = self.dropout(self.bn1(hidden))
        # Hidden layer 2: linear -> ReLU -> batch norm
        hidden = F.relu(self.linear2(hidden))
        hidden = self.bn2(hidden)
        return self.linear3(hidden)

#### Load saved model

In [None]:
def load_model(m, p):
    """
    Load the saved state dict stored at path p into the model m in place.

    Args:
        m: Module whose parameters are overwritten.
        p: Path of the file written with torch.save(state_dict, ...).
    """
    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    # map_location='cpu' lets a checkpoint saved from a GPU be read on a machine
    # without CUDA; load_state_dict then copies the values into m's own parameters.
    m.load_state_dict(torch.load(p, map_location='cpu'))

In [None]:
# Number of numerical and categorical feature columns; they define the network's input size.
num_cont = X_num.shape[1]
num_cat = X_cat.shape[1]

# Build the network and load the saved weights into it.
model = TabularNet(num_cont, num_cat, level_arr)
load_model(model, best_model_path)

#### Prediction

In [None]:
def predict(model, dl, threshold=3, claim_amount=800):
    """
    Run the model over all batches and derive the claim amount per row.

    Args:
        model: Module called as model(x_cat, x_num); it is switched to eval mode.
        dl: Iterable yielding (categorical, numerical) tensor batches.
        threshold: Predicted delays strictly above this value count as a claim.
        claim_amount: Value written for rows that count as a claim (0 otherwise).

    Returns:
        A tuple (delay_time, is_claim) of one-dimensional arrays with one
        entry per row: the predicted delay and the resulting claim amount.
    """
    model.eval()
    batches = []

    # Inference only: skip building autograd graphs to save memory and time.
    with torch.no_grad():
        for x1, x2 in dl:
            out = model(x1.long(), x2.float())
            batches.append(out.detach().cpu().numpy().ravel())

    # An empty loader yields empty results instead of failing in concatenate.
    delay_time = np.concatenate(batches) if batches else np.array([])
    is_claim = np.where(delay_time > threshold, claim_amount, 0)
    return delay_time, is_claim

In [None]:
# Score every row and attach both results to the frame that was loaded from the source CSV.
delay_time, is_claim = predict(model, X_dl)
flights_original['delay_time'] = delay_time
flights_original['is_claim'] = is_claim

In [None]:
# Write the original columns plus the two prediction columns, without the index.
flights_original.to_csv(output_csv_path, index=False)