# User Start
## i. Overview
The User Start module runs our end-to-end use case. The user provides an origin, a destination, and the date (in 2022) on which they "plan" to fly. The program then predicts the arrival delay for every relevant flight on that date and recommends the flights with the lowest predicted delay. It assumes that the 2022 data has already been preprocessed into the format this program expects and that the model has already been built and saved.

## ii. Special Notes
1. The file is always expected to be loaded as a dense matrix. For better processing speed, we can convert the dense matrix into a sparse matrix.
2. We need to figure out how to improve processing speed while keeping up performance.

## iii. Methodology
### Load Model

### Request User for Input

### Predict Delay and Recommend Best Flights





In [17]:
import pandas as pd
import numpy as np
import sklearn as skl
import pickle
from sklearn.metrics import f1_score, recall_score, precision_score

#ignore warnings
# The three statements below replace warnings.warn with a function that does
# nothing, so anything reported through warnings.warn after this point is
# silently discarded for the rest of the session.
# NOTE(review): this also hides warnings that point at real problems (for
# example pandas' chained-assignment warnings) -- confirm that is intended.
def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a stand-in for warnings.warn."""
    pass
import warnings
# NOTE(review): code that bound `warn` by name before this assignment
# presumably keeps the original function -- verify if a warning still appears.
warnings.warn = warn

# Airport registry: the table GetRelevantAirports searches by 'AirportName' /
# 'CityName' and groups by 'CityMarketID'.
flight_registry_file = "data/airport_registry.parquet"

# Flight dataset that predictions are run on. Exactly one line is active; the
# commented alternatives are other datasets that can be swapped in.
# NOTE(review): the active dataset presumably has to match `target` and
# `model_file` below (binary vs. binned target) -- confirm before switching.
# flight_data_file = "data/testData_2022"
flight_data_file = "data/testDataBinaryTarget_2022"
# flight_data_file = "data/trainData_2018-2019"
# flight_data_file = "data/trainDataBinaryTarget_2018-2019"

# Label <-> encoded-value tables ('Label' / 'Translation' columns).
# The "Origin" table is used for both origin and destination airports.
airport_translation_file = "data/translation_Origin"
airline_translation_file = "data/translation_Airline"

# Pickled model to load; exactly one line is active.
# model_file = "models/final_models/model_logistic_regression.pkl"
model_file = "models/final_models/binary_model_naive_bayes.pkl"

# When True, GetInput skips the prompts and returns a canned
# LAX -> San Francisco query for quicker testing.
test=True

# Name of the column in the flight dataset that holds the actual outcome,
# used when reporting performance next to the predictions.
# target = 'BinArrDelayMinutes'
target = 'BinaryArrDelayMinutes'

In [18]:
#load flight registry (airport name / city name / city market lookup table)
flight_registry = pd.read_parquet(flight_registry_file, engine="fastparquet")

#load preprocessed 2022 dataset (only good from Dec 31, 2021 to Jul 30, 2022)
flight_data = pd.read_parquet(flight_data_file, engine="fastparquet")

#load the label <-> encoded value tables for airports and airlines
airport_translation_data = pd.read_parquet(airport_translation_file, engine="fastparquet")
airline_translation_data = pd.read_parquet(airline_translation_file, engine="fastparquet")

#load model
# NOTE: unpickling runs arbitrary code stored in the file, so model_file must
# only ever point at a model file produced by this project.
# The with-block closes the file handle as soon as the model is loaded.
with open(model_file, 'rb') as model_handle:
    model = pickle.load(model_handle)

# columns to predict on (the feature names the model was fitted with)
selectedFeatures = model.feature_names_in_

In [19]:
#### FUNCTIONS

## GetRelevantAirports
#retrieves relevant airports from the flight registry for the origin and destination provided
#possible inputs are airport names such as "SEA" or city names such as "Seattle"
#the travel date is not needed for this step

def _FindAirportMatches(query, data):
    """Return the registry rows matching ``query``, or ``None`` if there are none.

    An exact match on 'AirportName' is tried first; if that finds nothing,
    rows whose 'CityName' contains ``query`` are returned instead.
    """
    # An empty query would "match" every city and silently pick the first row.
    if not query:
        return None

    matches = data[data['AirportName'] == query]
    if len(matches):
        return matches

    # regex=False: treat the user's text literally, so "(" or "." cannot raise
    # or over-match. na=False: rows without a city name never match instead of
    # producing a NaN that breaks the boolean mask.
    matches = data[data['CityName'].str.contains(query, regex=False, na=False)]
    if len(matches):
        return matches
    return None


def GetRelevantAirports(departure, arrival, data):
    """Find every airport that shares a city market with the requested endpoints.

    Args:
        departure: Airport name (e.g. "LAX") or part of a city name.
        arrival: Airport name or part of a city name.
        data: Registry DataFrame with 'AirportName', 'CityName' and
            'CityMarketID' columns.

    Returns:
        ``{"origins": Series, "dests": Series}`` of airport names, or ``None``
        when either the departure or the arrival cannot be found.
    """
    print("--- Start GetRelevantAirports --- \n")

    #translate departure input to a market
    departure_search = _FindAirportMatches(departure, data)
    if departure_search is None:
        #error return nothing
        return None
    print(departure_search)
    # the first match decides which market the user meant
    departure_market = departure_search.iloc[0]['CityMarketID']

    #do the same for arrival
    arrival_search = _FindAirportMatches(arrival, data)
    if arrival_search is None:
        #error return nothing
        return None
    print(arrival_search)
    arrival_market = arrival_search.iloc[0]['CityMarketID']

    #filter the data for our airports
    departures = data[data['CityMarketID'] == departure_market]
    arrivals = data[data['CityMarketID'] == arrival_market]

    #we only care about airport name
    filtered_data = {
        "origins": departures['AirportName'],
        "dests": arrivals['AirportName'],
    }

    print(filtered_data)
    print("\n--- End GetRelevantAirports --- \n")
    return filtered_data

## EncodeAirport
# given translation data and the origin/destination airport names,
# returns the encoded values the flight dataset uses for those airports
def EncodeAirport(translation, airport_list):
    """Translate airport labels into their encoded values.

    Args:
        translation: DataFrame with 'Label' and 'Translation' columns.
        airport_list: Mapping with 'origins' and 'dests' collections of labels.

    Returns:
        ``{'origins': Series, 'dests': Series}`` holding the 'Translation'
        values of the rows whose 'Label' is in the corresponding collection.
    """
    print("--- Start TranslateAirport --- \n")

    # pick the translation rows for each side of the trip
    selected = {
        side: translation.loc[translation['Label'].isin(airport_list[side])]
        for side in ('origins', 'dests')
    }

    for heading, side in (("Origin Airports:", 'origins'), ("Dest Airports:", 'dests')):
        print(heading)
        print(selected[side])

    encoded = {side: rows['Translation'] for side, rows in selected.items()}
    print("\n--- End TranslateAirport --- \n")
    return encoded

## GetFlightsFromData
#narrows the dataset down to the flights between the relevant airports on the given date
#returns the filtered dataset, which is empty when nothing matches
#(in that case the user should be asked to try another date)
#date is an array where index 0 is month and index 1 is day of the month
def GetFlightsFromData(dataset, relevant_airports, date):
    """Return the rows of ``dataset`` matching the airports and the date.

    Args:
        dataset: DataFrame with 'Origin', 'Dest', 'Month' and 'DayofMonth' columns.
        relevant_airports: Mapping with 'origins' and 'dests' collections of
            encoded airport values.
        date: Sequence where index 0 is the month and index 1 the day of month.

    Returns:
        The matching rows (possibly none) with all original columns.
    """
    print("--- Start GetFlightsFromData --- \n")

    # apply one condition at a time, each on the result of the previous one
    from_origin = dataset['Origin'].isin(relevant_airports['origins'])
    matching = dataset.loc[from_origin]

    to_dest = matching['Dest'].isin(relevant_airports['dests'])
    matching = matching.loc[to_dest]

    in_month = matching['Month'] == date[0]
    matching = matching.loc[in_month]

    on_day = matching['DayofMonth'] == date[1]
    matching = matching.loc[on_day]

    print(matching.head())
    print("\n--- End GetFlightsFromData --- \n")
    return matching
    
## PredictBestFlight
#runs model on relevant flights
#finds the best (lowest) predicted delay
#returns the flights with that delay together with all predictions
def PredictBestFlight(model, features, flights, target_column=None):
    """Predict the delay of every flight and pick the flights with the lowest one.

    Args:
        model: Fitted estimator exposing ``predict``.
        features: Names of the columns of ``flights`` to predict on.
        flights: DataFrame of candidate flights. It is not modified.
        target_column: Column of ``flights`` holding the actual outcome.
            Defaults to the module-level ``target`` setting.

    Returns:
        dict with
        'recommended_flights': rows whose prediction equals the minimum,
        'predicted_data': copy of ``flights`` plus 'PredArrDelayMinutes',
        'predictions': the raw output of ``model.predict``,
        'actuals': the ``target_column`` values, for analysis.
    """
    print("--- Start PredictBestFlight --- \n")

    if target_column is None:
        # fall back to the notebook-wide setting
        target_column = target

    #run prediction
    X = flights[features]
    predictions = model.predict(X)
    print(predictions)

    # assign() returns a new frame, so the caller's `flights` (often a slice
    # of the full dataset) is left untouched
    predicted_data = flights.assign(PredArrDelayMinutes=predictions)

    #get recommended flights
    lowest_prediction = predicted_data['PredArrDelayMinutes'].min()
    recommended_flights = predicted_data[
        predicted_data['PredArrDelayMinutes'] == lowest_prediction
    ]

    #include actual delay in the return for analysis
    Y = flights[target_column]

    print("\n--- End PredictBestFlight --- \n")
    return {
        'recommended_flights': recommended_flights,
        'predicted_data': predicted_data,
        'predictions': predictions,
        'actuals': Y
    }

## DecodeToAirlineAndAirports
# given translation data and a dataset of encoded airline, origin, dest,
# returns the same flights with readable labels next to predicted and actual outcome

def _BuildLabelLookup(translation):
    """Map each 'Translation' value to its 'Label'; the first row wins on duplicates."""
    lookup = {}
    for code, label in zip(translation['Translation'], translation['Label']):
        lookup.setdefault(code, label)
    return lookup


def _DecodeLabel(lookup, code, kind):
    """Return the label for ``code``; raise IndexError naming the unknown value."""
    try:
        return lookup[code]
    except KeyError:
        # IndexError is what the per-row positional lookup raised before
        raise IndexError("no " + kind + " label found for encoded value " + repr(code)) from None


def DecodeToAirlineAndAirports(airline_translation, airport_translation, data, target_column=None):
    """Replace encoded airline/airport values with their labels.

    Args:
        airline_translation: DataFrame with 'Label' and 'Translation' columns.
        airport_translation: DataFrame with 'Label' and 'Translation' columns,
            used for both 'Origin' and 'Dest'.
        data: DataFrame with 'Airline', 'Origin', 'Dest',
            'PredArrDelayMinutes' and the target column.
        target_column: Column of ``data`` holding the actual outcome.
            Defaults to the module-level ``target`` setting.

    Returns:
        DataFrame with columns 'Airline', 'Origin', 'Dest', 'Pred', 'Actual'.

    Raises:
        IndexError: if an encoded value has no entry in its translation table.
    """
    print("--- Start DecodeToAirlineAndAirports ---")

    if target_column is None:
        # fall back to the notebook-wide setting
        target_column = target

    # build each lookup once instead of scanning the tables for every row
    airline_lookup = _BuildLabelLookup(airline_translation)
    airport_lookup = _BuildLabelLookup(airport_translation)

    airlines = []
    origins = []
    dests = []
    for airline_code, origin_code, dest_code in zip(data['Airline'], data['Origin'], data['Dest']):
        airline = _DecodeLabel(airline_lookup, airline_code, "airline")
        origin = _DecodeLabel(airport_lookup, origin_code, "airport")
        dest = _DecodeLabel(airport_lookup, dest_code, "airport")
        print(airline+origin+dest)
        airlines.append(airline)
        origins.append(origin)
        dests.append(dest)

    translated_data = pd.DataFrame.from_dict({
        'Airline': airlines,
        'Origin': origins,
        'Dest': dests,
        'Pred': data['PredArrDelayMinutes'],
        'Actual': data[target_column]
    })
    print("--- End DecodeToAirlineAndAirports --- \n")
    return translated_data

## GetPerformance
# scores the predictions against the actual values
def GetPerformance(Y_test, Y_predict):
    """Return macro-averaged F1, recall and precision as a dict.

    Args:
        Y_test: Actual labels.
        Y_predict: Predicted labels, in the same order as ``Y_test``.

    Returns:
        ``{'f1': ..., 'recall': ..., 'precision': ...}``.
    """
    scorers = (
        ('f1', f1_score),
        ('recall', recall_score),
        ('precision', precision_score),
    )
    return {
        name: scorer(Y_test, Y_predict, average='macro')
        for name, scorer in scorers
    }

## GetInput
#gets details from user
#date is an array where index 0 is month and index 1 is day of the month

def _PromptInt(prompt, low, high):
    """Ask until the user enters a whole number between ``low`` and ``high`` inclusive."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            print("Please enter a whole number.")
            continue
        if low <= value <= high:
            return value
        print("Please enter a number between " + str(low) + " and " + str(high) + ".")


def GetInput(test):
    """Return the trip the user wants to look up.

    Args:
        test: When truthy, skip the prompts and return a fixed
            LAX -> San Francisco query for quicker testing.

    Returns:
        dict with 'departure_city', 'arrival_city' and 'date', where 'date'
        is ``[month, day]``.
    """
    #for quicker testing
    if test:
        return {
            'departure_city': 'LAX',
            'arrival_city': 'San Francisco',
            'date': [3, 29]
            # 'date': [4,10]
        }

    #get user input
    # strip stray spaces: the airport lookup compares the text exactly
    departure_city = input("Please enter your Departure City or Airport:").strip()
    arrival_city = input("Please enter your Arrival City or Airport:").strip()
    # re-prompt instead of crashing on text or impossible month/day values
    month = _PromptInt("Please enter the NUMBER month you are flying:", 1, 12)
    day = _PromptInt("Please enter the day of the month you are flying:", 1, 31)
    return {
        'departure_city': departure_city,
        'arrival_city': arrival_city,
        'date': [month, day]
    }
    

In [20]:
#### MAIN
#introduction

#main menu

#request input from user
user_input = GetInput(test)

#translate input to relevant flights
relevant_airports = GetRelevantAirports(
    user_input['departure_city'],
    user_input['arrival_city'],
    flight_registry
)

# flights stays None when the airports could not be resolved
flights = None
if relevant_airports is None:
    #GetRelevantAirports returns None when either endpoint is unknown
    print("\nCould not find a matching departure or arrival airport. Please try again.\n")
else:
    relevant_airports = EncodeAirport(airport_translation_data, relevant_airports)
    flights = GetFlightsFromData(
        flight_data,
        relevant_airports,
        user_input['date']
    )
    if len(flights) == 0:
        #could not find any flights, try again
        print("\nCould not find any flights. Please try again.\n")

#only predict when there is something to predict on
if flights is not None and len(flights) > 0:
    #predict best flight
    prediction = PredictBestFlight(
        model,
        selectedFeatures,
        flights
    )

    print("\nLet's see how we performed on our predictions...")
    print(GetPerformance(prediction['actuals'], prediction['predictions']))
    print('\n\n')

    #shuffle the recommended flights before taking 5 so the selection is not
    #always the first rows of the dataset
    top_5 = prediction['recommended_flights'].sample(frac=1).head(5)
    print(top_5)
    top_5 = top_5[['Airline', 'Origin', 'Dest', 'PredArrDelayMinutes', target]]
    top_5 = DecodeToAirlineAndAirports(airline_translation_data, airport_translation_data, top_5)

    print("There were " + str(len(flights)) + " available flights.")
    print("We found " + str(len(prediction['recommended_flights'])) + " flights with the least delay.")
    print("Here are the first 5:")
    print(top_5)

#loop or exit

--- Start GetRelevantAirports --- 

            CityName  AirportID  CityMarketID AirportName
114  Los Angeles, CA      12892         32575         LAX
              CityName  AirportID  CityMarketID AirportName
115  San Francisco, CA      14771         32457         SFO
{'origins': 113    LGB
114    LAX
131    BUR
211    ONT
224    SNA
Name: AirportName, dtype: object, 'dests': 111    SJC
115    SFO
127    OAK
Name: AirportName, dtype: object}

--- End GetRelevantAirports --- 

--- Start TranslateAirport --- 

Origin Airports:
    Label  Translation
58    BUR           58
195   LAX          195
205   LGB          205
255   ONT          255
335   SNA          335
Dest Airports:
    Label  Translation
248   OAK          248
322   SFO          322
328   SJC          328

--- End TranslateAirport --- 

--- Start GetFlightsFromData --- 

        Airline  Origin  Dest  CRSDepTime  Distance  Year  Quarter  Month  \
112719        9     255   322        1715     438.0  2022        1      3   
