In [141]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import joblib
import pickle

In [191]:
import joblib
import pandas as pd

# Load the saved encoders, models and lookup data that predict_flight_delay
# reads as module-level globals. All files are resolved relative to the
# current working directory.
airline_encoder = joblib.load('airline_encoder.pkl')  # encodes AIRLINE_CODE
dest_encoder = joblib.load('dest_encoder.pkl')  # encodes DEST
origin_encoder = joblib.load('origin_encoder.pkl')  # encodes ORIGIN
flight_duration_encoder = joblib.load('flight_duration_encoder.pkl')  # encodes the flight_duration category
xgb_model = joblib.load('xgb_model.pkl')  # model for the delay > 15 threshold
xgb_model_2 = joblib.load('xgb_model_2.pkl')  # model for the delay > 30 threshold
xgb_model_3 = joblib.load('xgb_model_3.pkl')  # model for the delay > 45 threshold
xgb_model_4 = joblib.load('xgb_model_4.pkl')  # model for the delay > 60 threshold
X_train_columns = joblib.load('X_train_columns.pkl')  # column layout the model input is reindexed to
centrality_data=joblib.load('centrality_data.pkl')  # table filtered on DEST_Encoded for the centrality features

# Create a function to preprocess the input data and predict
# Raw fields that every ``user_input`` record must provide.
_REQUIRED_INPUT_FIELDS = (
    'FL_DATE', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME',
    'FL_NUMBER', 'AIRLINE_CODE', 'DEST', 'ORIGIN',
)


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so these artifacts must only ever come from a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _categorize_time_of_day(hour):
    """Bucket an hour value into 'Morning', 'Afternoon', 'Evening' or 'Night'."""
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'


def _categorize_elapsed_time(mins):
    """Bucket an elapsed-time value into 'Short', 'Medium', 'Long' or 'Ultra-long'."""
    if mins <= 180:
        return 'Short'
    elif 180 < mins <= 360:
        return 'Medium'
    elif 360 < mins <= 720:
        return 'Long'
    else:
        return 'Ultra-long'


def _lookup_centrality(dest_encoded_value, column):
    """Return ``column`` from the first ``centrality_data`` row matching the destination, or 0."""
    matches = centrality_data.loc[centrality_data['DEST_Encoded'] == dest_encoded_value, column]
    return matches.iloc[0] if not matches.empty else 0


def predict_flight_delay(user_input, output_csv='transformed_flight_input_3.csv'):
    """Build the model features for one flight and run the four delay models.

    Parameters
    ----------
    user_input : dict
        One flight record with the keys 'FL_DATE' (a '%Y-%m-%d' string),
        'CRS_DEP_TIME', 'CRS_ARR_TIME' (numbers whose value // 100 is the
        hour), 'CRS_ELAPSED_TIME', 'FL_NUMBER', 'AIRLINE_CODE', 'DEST' and
        'ORIGIN'.
    output_csv : str or None, optional
        Where the transformed single-row feature frame is written after the
        predictions are made. Defaults to the path that was previously
        hard-coded; pass ``None` to skip both the file write and the
        confirmation message.

    Returns
    -------
    tuple
        ``(prediction_4, prediction_3, prediction_2, prediction)``: the first
        element of the ``predict`` output of ``xgb_model_4``, ``xgb_model_3``,
        ``xgb_model_2`` and ``xgb_model`` respectively.

    Raises
    ------
    KeyError
        If ``user_input`` lacks any of the required fields.

    Notes
    -----
    Relies on the module-level encoders, models, ``X_train_columns`` and
    ``centrality_data`` loaded earlier in the notebook, and reads four pickle
    files from the current working directory on every call. Errors raised by
    the encoders (for example for values they cannot transform) propagate
    unchanged.
    """
    # Convert user input into a single-row DataFrame
    input_df = pd.DataFrame([user_input])

    # Fail early with one message listing everything that is missing instead
    # of a KeyError for whichever column happens to be touched first.
    missing = [field for field in _REQUIRED_INPUT_FIELDS if field not in input_df.columns]
    if missing:
        raise KeyError(f"user_input is missing required field(s): {missing}")

    # Transform flight date into day_of_week, month
    input_df['FL_DATE'] = pd.to_datetime(input_df['FL_DATE'], format='%Y-%m-%d')
    input_df['day_of_week'] = input_df['FL_DATE'].dt.dayofweek  # Monday=0, Sunday=6
    input_df['month'] = input_df['FL_DATE'].dt.month

    # Extract hour of the day from scheduled departure and arrival time
    input_df['hour_of_day_Dep'] = input_df['CRS_DEP_TIME'] // 100  # Assuming CRS_DEP_TIME is in HHMM format
    input_df['hour_of_day_Arr'] = input_df['CRS_ARR_TIME'] // 100

    # Categorical views of the departure/arrival hour and the flight duration
    input_df['DEP_TIME_OF_DAY'] = input_df['hour_of_day_Dep'].apply(_categorize_time_of_day)
    input_df['ARR_TIME_OF_DAY'] = input_df['hour_of_day_Arr'].apply(_categorize_time_of_day)
    input_df['flight_duration'] = input_df['CRS_ELAPSED_TIME'].apply(_categorize_elapsed_time)

    # Apply label encoding for AIRLINE_CODE, DEST, ORIGIN and flight_duration
    input_df['AIRLINE_CODE_Encoded'] = airline_encoder.transform(input_df['AIRLINE_CODE'])
    input_df['DEST_Encoded'] = dest_encoder.transform(input_df['DEST'])
    input_df['ORIGIN_Encoded'] = origin_encoder.transform(input_df['ORIGIN'])
    input_df['flight_duration_Encoded'] = flight_duration_encoder.transform(input_df['flight_duration'])

    # Shift the encoding to start from 1 instead of 0
    input_df['AIRLINE_CODE_Encoded'] += 1
    input_df['DEST_Encoded'] += 1
    input_df['ORIGIN_Encoded'] += 1
    input_df['flight_duration_Encoded'] += 1

    # Interaction terms
    input_df['Origin_Dest_interaction'] = input_df['ORIGIN_Encoded'] * input_df['DEST_Encoded']
    input_df['flight_duration_AirlineCode_interaction'] = (
        input_df['flight_duration_Encoded'] * input_df['AIRLINE_CODE_Encoded']
    )
    input_df['AirlineCode_FlightNumber_Origin_Dest_interaction'] = (
        input_df['AIRLINE_CODE_Encoded'] * input_df['FL_NUMBER']
        * input_df['ORIGIN_Encoded'] * input_df['DEST_Encoded']
    )

    # NOTE(review): the lookups below use the codes *after* the +1 shift;
    # presumably the pickled dicts and centrality_data are keyed the same
    # way - confirm against the training notebook.
    origin_encoded_value = input_df['ORIGIN_Encoded'].iloc[0]
    dest_encoded_value = input_df['DEST_Encoded'].iloc[0]

    # Airport size per encoded airport; unknown codes yield None
    origin_size_by_code = _load_pickle('origin_dicts.pkl')
    dest_size_by_code = _load_pickle('dest_dicts.pkl')
    input_df['ORIGIN_AIRPORT_SIZE'] = origin_size_by_code.get(origin_encoded_value, None)
    input_df['DEST_AIRPORT_SIZE'] = dest_size_by_code.get(dest_encoded_value, None)

    # Country per encoded airport; unknown codes yield None
    origin_country_by_code = _load_pickle('origin_country_dicts.pkl')
    dest_country_by_code = _load_pickle('dest_country_dicts.pkl')
    origin_country = origin_country_by_code.get(origin_encoded_value, None)
    dest_country = dest_country_by_code.get(dest_encoded_value, None)
    input_df['ORIGIN_COUNTRY'] = origin_country
    input_df['DEST_COUNTRY'] = dest_country

    # Add "FL_Type" = Domestic or International
    # NOTE(review): when both countries are unknown (None) they compare equal
    # and the flight is labelled 'Domestic' - confirm this is intended.
    input_df['FL_Type'] = 'Domestic' if origin_country == dest_country else 'International'

    # Apply one-hot encoding to other categorical columns
    input_df = pd.get_dummies(
        input_df,
        columns=['DEP_TIME_OF_DAY', 'ARR_TIME_OF_DAY', 'ORIGIN_COUNTRY',
                 'ORIGIN_AIRPORT_SIZE', 'DEST_COUNTRY', 'DEST_AIRPORT_SIZE',
                 'FL_Type', 'flight_duration'],
        drop_first=False,
    )

    input_df = input_df.drop(['FL_DATE', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME',
                              'AIRLINE_CODE', 'DEST', 'ORIGIN'], axis=1)

    # Ensure the input dataframe has the same columns as the training data;
    # dummy columns that this single record did not produce are filled with 0.
    input_df = input_df.reindex(columns=X_train_columns, fill_value=0)

    # Centrality features for the destination, falling back to 0 when the
    # destination has no row in centrality_data.
    # NOTE(review): these are assigned after the reindex, so they overwrite
    # the columns only if X_train_columns already contains them; otherwise
    # they are appended as extra columns - confirm the training layout.
    input_df['dest_degree_centrality'] = _lookup_centrality(dest_encoded_value, 'dest_degree_centrality')
    input_df['dest_betweenness_centrality'] = _lookup_centrality(dest_encoded_value, 'dest_betweenness_centrality')

    # Make the predictions, highest delay threshold first
    prediction_4 = xgb_model_4.predict(input_df)
    prediction_3 = xgb_model_3.predict(input_df)
    prediction_2 = xgb_model_2.predict(input_df)
    prediction = xgb_model.predict(input_df)

    # Optionally save the transformed dataset to a CSV file
    if output_csv is not None:
        input_df.to_csv(output_csv, index=False)
        print(f"Transformed dataset has been saved to '{output_csv}'.")

    return prediction_4[0], prediction_3[0], prediction_2[0], prediction[0]


In [193]:
def get_user_input():
    """Return a fresh, hard-coded example flight record.

    The dictionary carries the raw fields consumed by the prediction step:
    flight date, scheduled departure/arrival times, scheduled elapsed time,
    flight number, airline code, destination and origin.
    """
    # Stand-in for real user input: a single fixed example flight.
    return dict(
        FL_DATE='2024-06-20',
        CRS_DEP_TIME=1230,
        CRS_ARR_TIME=1425,
        CRS_ELAPSED_TIME=60,
        FL_NUMBER=5535,
        AIRLINE_CODE='AS',
        DEST='LGB',
        ORIGIN='DHN',
    )

In [201]:
# Get user input (currently the hard-coded example record)
user_input = get_user_input()

# Predict flight delay: one prediction per model, returned in the order
# xgb_model_4, xgb_model_3, xgb_model_2, xgb_model
prediction_4, prediction_3, prediction_2, prediction = predict_flight_delay(user_input)


# Translate the four model predictions into one delay message
def prediction_outcome(prediction_4, prediction_3, prediction_2, prediction):
    """Return the message for the first prediction equal to 1.

    The predictions are examined in the order ``prediction_4``,
    ``prediction_3``, ``prediction_2``, ``prediction``; the first one that
    equals 1 selects the message. When none equals 1 the result is
    'On-time'.
    """
    ranked_messages = (
        (prediction_4, 'Estimated flight delay: Over 1 hour.'),
        (prediction_3, 'Estimated flight delay: Between 45 mins to 1 hour.'),
        (prediction_2, 'Estimated flight delay: Between 30 mins to 45 mins.'),
        (prediction, 'Estimated flight delay: Between 15 mins to 30 mins.'),
    )
    for flag, message in ranked_messages:
        if flag == 1:
            return message
    return 'On-time'


# Print a header followed by the single delay message selected from the
# four predictions
print("######Prediction Outcomes######")
print(prediction_outcome(prediction_4,prediction_3,prediction_2,prediction))



Transformed dataset has been saved to 'transformed_flight_input_3.csv'.
######Prediction Outcomes######
Estimated flight delay: Between 15 mins to 30 mins.


In [199]:
## The cells below are used for other testing