In [1]:
# File name: Common airline functions
# Purpose: Central file containing airline functions that I use nearly daily on airline studies
# Creator: Alex Deshowitz
# Project: Applicable to airline research and commonly performed actions on airline data
# Date created: 1/23/2018
# Date updated: 3/5/2020 


############################################################

In [2]:
# required libraries:

import pandas as pd
import numpy as np
import time as tm
import os
import glob
import datetime

from math import sin, cos, sqrt, atan2, radians

In [3]:
# functions for use in the notebook:

def coterminal_replacement(airport_code_array):

    '''Collapse airports that are treated as commercially "the same" into one
    coterminal code, so that a multi-airport city is analysed as a single point.

    airport_code_array: object exposing ``.isin`` (e.g. a pandas Series of airport codes).
    Returns a numpy array of the same length; codes that belong to no group pass
    through unchanged.
    '''

    # (member airports, code the whole group collapses to) - listed in priority order
    coterminal_groups = [
        (['IAH','HOU'], 'IAH'),
        (['MDW','ORD'], 'ORD'),
        (['DFW','DAL'], 'DFW'),
        (['LAX','SNA', 'ONT','BUR', 'LGB'], 'LAX'),
        (['JFK','LGA','EWR'], 'NYC'),
        (['DCA','IAD'], 'DCA'),
        (['FLL','MIA'], 'MIA'),
    ]

    # Start from the untouched codes and wrap one np.where per group around them,
    # last group first, so the first group in the table ends up outermost
    city_code = airport_code_array
    for member_airports, coterminal in reversed(coterminal_groups):
        city_code = np.where(airport_code_array.isin(member_airports), coterminal, city_code)

    return city_code

def three_domestic_sisters_coterminal_replacement(airport_code_array):

    '''Returns an array in which the three domestic sister-airport pairs
    (IAH/HOU, MDW/ORD, DFW/DAL) are each collapsed to a single code.

    airport_code_array: object exposing ``.isin`` (e.g. a pandas Series of airport codes).
    Returns a numpy array; every other code passes through unchanged.
    '''

    # (airports in the pair, code the pair collapses to)
    sister_pairs = (
        (['IAH','HOU'], 'IAH'),
        (['MDW','ORD'], 'ORD'),
        (['DFW','DAL'], 'DFW'),
    )

    # Build the nested np.where from the inside out
    replaced = airport_code_array
    for pair, shared_code in sister_pairs[::-1]:
        replaced = np.where(airport_code_array.isin(pair), shared_code, replaced)

    return replaced

def airline_code_fix(carrier_array):

    '''Override carrier codes for carriers that have combined (via merger or
    acquisition) during the analysis period, so each combined carrier is
    reported under a single surviving code.

    carrier_array: object exposing ``.isin`` (e.g. a pandas Series of carrier codes).
    Returns a numpy array; codes outside the listed groups are left as they are.
    '''

    # (codes that were combined, code they are all reported under)
    combined_carriers = [
        (['WN','FL'], 'WN'),
        (['AS','VX'], 'AS'),
        (['UA','CO'], 'UA'),
        (['DL','NW'], 'DL'),
        (['AA','US','HP'], 'AA'),
    ]

    # Apply the groups innermost-first so the first entry has the highest priority
    airline_code = carrier_array
    for combined_codes, surviving_code in reversed(combined_carriers):
        airline_code = np.where(carrier_array.isin(combined_codes), surviving_code, airline_code)

    return airline_code

def airline_code_category(carrier_array):

    '''Returns a numpy array with the airline category each carrier code maps to:
    'Legacy', 'LCC', 'ULCC', or 'Other' for any code not listed below.
    NOTE: only for domestic US cxrs right now.

    carrier_array: object exposing ``.isin`` (e.g. a pandas Series of carrier codes).
    '''

    # (carrier codes, category label) - checked in this order
    categories = [
        (['AA','UA','DL', 'NW','US', 'CO'], 'Legacy'),
        (['B6', 'VX', 'AS', 'WN', 'FL'], 'LCC'),
        (['NK', 'F9', 'G4'], 'ULCC'),
    ]

    # Anything that matches no category falls through to 'Other'
    airline_group = 'Other'
    for carrier_codes, label in reversed(categories):
        airline_group = np.where(carrier_array.isin(carrier_codes), label, airline_group)

    return airline_group

def rt_market(orig_array, dest_array):

    '''Returns an array of half-alpha (non-directional) markets built from an
    orig and dest array set: the two codes are joined with '-' with the
    alphabetically smaller code first, so A-B and B-A give the same market.
    '''

    orig_sorts_first = orig_array < dest_array

    as_flown = orig_array + '-' + dest_array
    flipped = dest_array + '-' + orig_array

    return np.where(orig_sorts_first, as_flown, flipped)

def dir_market(orig_array, dest_array):

    '''Returns the directional (one-way) markets for an orig and dest array set:
    origin and destination joined with '-' in the order given.
    '''

    separator = '-'
    return orig_array + separator + dest_array

def otp_ind(dot_delay_mins, ontime_cutoff = 14, ontime_minority_class = 1 ):

    '''Returns a 0/1 integer array flagging on-time vs "DOT Late" flights from
    DOT delay minutes; takes df['col'] as argument.

    dot_delay_mins: delay minutes per flight.
    ontime_cutoff: a flight counts as on time when its delay is <= this value.
    ontime_minority_class: when 1, on-time flights are labelled 1 and late flights 0;
        any other value flips the labels (added for classification target prep).
    '''

    within_cutoff = dot_delay_mins <= ontime_cutoff

    if ontime_minority_class == 1:
        ontime_label, late_label = 1, 0
    else:
        ontime_label, late_label = 0, 1

    return np.where(within_cutoff, ontime_label, late_label)

def split_diio_date_qtr(df, column):

    '''Returns the quarter portion of the poorly formatted Diio "date" column:
    the text before the first space in df[column].
    '''

    # Split once on the first space; piece 0 is the part before it
    date_pieces = df[column].str.split(' ', n = 1, expand = True)
    return date_pieces[0]

def split_diio_date_yr(df, column):

    '''Returns the year portion of the poorly formatted Diio "date" column:
    everything after the first space in df[column].
    '''

    # Split once on the first space; piece 1 is the remainder after it
    date_pieces = df[column].str.split(' ', n = 1, expand = True)
    return date_pieces[1]

def fix_columns(dataframe):
    
    '''function that takes a list of columns and modifies them to be easier to read -- assign to df.columns'''
    
    column_string_replace = ['\n','@',' ','__', '/', '-']

    columns = dataframe.columns

    columns = columns.map(lambda x: x.strip())
    columns = columns.map(lambda x : x.lower())

    for string in column_string_replace:
        columns = columns.map(lambda x : x.replace(string, '_') if isinstance (x, (str, bytes)) else x)

    return columns

def left(s, amount):

    '''Returns the left n characters of a string - use map and apply for an array.

    s: string (or any sliceable) to take from.
    amount: number of leading characters to keep.
    '''

    leading = slice(None, amount)
    return s[leading]

def right(s, amount):

    '''Returns the right n characters of a string - use map and apply for an array.

    s: string (or any sliceable) to take from.
    amount: number of trailing characters to keep; 0 returns an empty result.
    '''

    # s[-0:] is the same as s[0:], i.e. the WHOLE string, so zero needs its own case
    if amount == 0:
        return s[:0]

    return s[-amount:]

def mid(s, offset, amount):

    '''Returns the mid n characters of a string - use map and apply for an array.

    s: string (or any sliceable) to take from.
    offset: zero-based position of the first character to keep.
    amount: number of characters to keep starting at offset.
    '''

    stop = offset + amount
    return s[offset:stop]

def combine_sked_files(replaceable_path, combined_file_path, delimiter = 'None', header = 2):
    '''Combine the sked files from Diio into one CSV - easy to manipulate to convert different file types.

    Every file matching the glob pattern is read, its columns are cleaned with
    fix_columns, rows from the first null in the first column onward are dropped
    (SPECIFIC to Diio files), a 1-based 'file_index' column is added, and the
    result is written to combined_file_path. The first file creates the output
    (with header); later files are appended without a header.

    replaceable_path: glob pattern selecting the input files.
    combined_file_path: path of the combined CSV to write.
    delimiter: passed to pd.read_csv; the default 'None' (string) is treated as
        "no delimiter given", i.e. pandas' own default separator is used.
    header: row number of the header line, passed to pd.read_csv.

    Returns None; progress and elapsed minutes are printed.
    '''

    start = tm.time()

    # The default is the *string* 'None'; handed to pandas as-is it would be used
    # as a literal multi-character separator, so translate it to a real None
    if isinstance(delimiter, str) and delimiter == 'None':
        delimiter = None

    # NOTE: glob.glob gives no ordering guarantee, so file_index follows whatever
    # order the file system returns
    for counter, file in enumerate(glob.glob(replaceable_path), start = 1):

        print(counter, ":")
        print('reading file: ', file)

        df = pd.read_csv(file, delimiter = delimiter, header = header)
        df.columns = fix_columns(df)

        # cut out the null rows NOTE:SPECIFIC to Diio files:
        # keep only the rows above the first null in the first column; a file with
        # no null row is kept whole instead of raising on an empty .min()
        first_col_is_null = df.iloc[:, 0].isnull().values
        if first_col_is_null.any():
            df = df.iloc[0:first_col_is_null.argmax(), :]

        # assign() returns a new frame, avoiding a write onto the iloc slice
        df = df.assign(file_index = counter)

        if counter == 1:
            df.to_csv(combined_file_path, index = False)
            print('generated new file: ', combined_file_path)
        else:
            df.to_csv(combined_file_path, mode = 'a', header = False, index = False)
            print('appended data from: ', file, 'into: ', combined_file_path)

        del df

    end = tm.time()

    print('Files completed combining in : ', round((end - start) / 60), ' minutes')
    
def HHI (dataframe, level_of_detail, market_measure):

    """Calculate the HHI of each market and return it aligned to the input rows.
    NOTE THAT THIS ONLY PERTAINS TO OVERALL MARKET HHI - NO ADDL DIMENSIONALITY

    For every row the share of market_measure within its market is expressed in
    percent and squared; the squared shares are then summed per market, and that
    market total is returned on every row of the market.

    dataframe: pandas DataFrame with one row per market participant.
    level_of_detail: column name, or list of column names, that identifies a market.
    market_measure: name of the numeric column the shares are based on.

    Returns: pandas Series named 'market_share', indexed like dataframe.
    """

    # Accept a single column label or a list of them
    keys = level_of_detail if isinstance(level_of_detail, list) else [level_of_detail]

    # 'sum' (string) instead of the builtin: same result, and avoids the pandas
    # deprecation warning for passing builtin callables to transform
    total_market = dataframe.groupby(by = keys)[market_measure].transform('sum')
    market_share = (((dataframe[market_measure] / total_market) * 100)**2).rename('market_share')

    # Group the squared shares directly by the key columns (aligned on the index)
    hhi = market_share.groupby([dataframe[key] for key in keys]).transform('sum')

    return hhi

def calculate_haversine(lat_1 = 32.845945, lon_1 = -96.850876,
                        lat_2 = 29.645417, lon_2 = -95.278888):

    ''' Function that calculates the great-circle (Haversine) distance between 2 locations in miles.

    lat_1, lon_1: latitude / longitude of the first point, in decimal degrees.
    lat_2, lon_2: latitude / longitude of the second point, in decimal degrees.
    (The defaults are a fixed pair of coordinates - presumably two Texas airports; confirm.)

    Returns: distance in statute miles as a float. Scalar inputs only.
    '''
    R = 6373.0 # radius of the earth, in kilometres (result is converted to miles below)

    lat1 = radians(lat_1)
    lon1 = radians(lon_1)
    lat2 = radians(lat_2)
    lon2 = radians(lon_2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Rounding can push a marginally above 1 for (near-)antipodal points, which
    # would make sqrt(1 - a) raise ValueError; clamp to the mathematical maximum
    a = min(1.0, a)

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    distance = distance * 0.621371 # convert to miles
    return distance
