# Setup

In [1]:
import pandas as pd
import numpy as np
from numpy import nan as Nan
import xml.etree.ElementTree as ET # to read one dataset in XML format

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams

from IPython.display import Image, display

import requests
from string import digits
import wget 
import glob
import time

import statsmodels.api as sm
import scipy.stats as stats
from scipy.stats import ttest_ind, chisquare, normaltest
import patsy
import psutil

import plotly.graph_objects as go # to draw geospatial maps
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

# Data Cleaning

To explore the correlation between unpleasant flight status (delay, cancellation and diversion) and the airports where flights start and end in the U.S., we need to aggregate comprehensive airport attributes and the main outcome variables (delay, cancellation and diversion). Thus, in the Data Cleaning step, our goal is to wrangle useful data from the collected datasets and combine them into two dataframes indexed by the distinct 3-letter airport codes of U.S. airports (e.g. ABE), with the following columns:

   1. `merged_X`: attributes of each airport (23 columns)
         - total_departure: total number of flights originated from this airport 
         - total_arrival: total number of flights arriving at this airport
         - departure_distance_avg: average distance of all flights originated from this airport
         - arrival_distance_avg: average distance of all flights arriving at this airport
         - departure_taxi_avg: average taxi time of all flights originated from this airport
         - arrival_taxi_avg: average taxi time of all flights arriving at this airport
         - city_name: city where the airport locates
         - code4: 4-digit airport code
         - latitude: latitude of the airport
         - longitude: longitude of the airport
         - altitude_ft: altitude of the airport
         - city_id: index of this city in uscities_df
         - fips: county-level FIPS code of the airport
         - population: population of the city near the airport
         - temp_avg: average temperature of the county containing the airport in 2018
         - pcp_avg: average precipitation of the county containing the airport in 2018
         - strike_avg: yearly average number of bird strikes (whether or not they caused damage) during 2000~2011
         - damage_avg: yearly average number of bird strikes that caused damage during 2000~2011
         - enplanements: enplanements of the city near the airport in 2018
         - length_ft_sum: total length of all runways in the airport
         - width_ft_avg: average width of all runways in the airport
         - runway_count: number of runways in the airport
         - security_avg: average security wait time of the airport
         
         
   2. `Y`: unpleasant flight status of each airport.
         - departure_delay_avg: average delay of all flights originated from this airport 
         - arrival_delay_avg: average delay of all flights arriving at this airport
         - departure_cancelled_avg: average cancellation rate of all flights originating from this airport
         - arrival_diverted_avg: average diversion rate of all flights arriving at this airport


#### Generate 'total_departure', 'total_arrival', 'departure_distance_avg', 'arrival_distance_avg', 'departure_taxi_avg', 'arrival_taxi_avg' of `merged_X` and all columns of `Y`

Import `datasets/original/delay/2018.csv` which describes detailed unpleasant information of each flight in the U.S.. For the purpose of research question, we select useful columns and transform flight info to airport info.

In [2]:
# dataset 1: one row per flight.
# Only the columns of interest are parsed (usecols), so the unused columns of
# this large file are never loaded into memory. The trailing selection pins the
# column order, which usecols on its own does not guarantee.
delay_columns = ["ORIGIN","DEST","DEP_DELAY","TAXI_OUT","CANCELLED","DISTANCE","ARR_DELAY","TAXI_IN","DIVERTED"] # columns of interest
delay_2018_df = pd.read_csv("datasets/original/delay/2018.csv", usecols=delay_columns)[delay_columns]

In [3]:
delay_2018_df.head()

Unnamed: 0,ORIGIN,DEST,DEP_DELAY,TAXI_OUT,CANCELLED,DISTANCE,ARR_DELAY,TAXI_IN,DIVERTED
0,EWR,DEN,-5.0,15.0,0.0,1605.0,-23.0,10.0,0.0
1,LAS,SFO,-8.0,11.0,0.0,414.0,-24.0,7.0,0.0
2,SNA,DEN,-5.0,15.0,0.0,846.0,-13.0,5.0,0.0
3,RSW,ORD,6.0,19.0,0.0,1120.0,-2.0,6.0,0.0
4,ORD,ALB,20.0,13.0,0.0,723.0,14.0,10.0,0.0


First,we check the distribution of missing values:

In [4]:
delay_2018_df.isna().sum()

ORIGIN            0
DEST              0
DEP_DELAY    117234
TAXI_OUT     115830
CANCELLED         0
DISTANCE          0
ARR_DELAY    137040
TAXI_IN      119246
DIVERTED          0
dtype: int64

There are missing values in the delay and taxi columns. However, we found that about 95% of them belong to flights with `CANCELLED == 1`: it is logical for a flight that never took place to have no delay value. These flights already contribute to the cancellation-rate feature and therefore should not affect the delay averages. Since we use the `.mean()` function, which ignores all NaN values, we leave these rows unchanged. The same applies to diverted flights, which have no arrival delay because the flight did not arrive at the scheduled airport. We keep these rows because they give us more accurate total departure/arrival counts. This leaves only about 4,000 rows whose missing values we cannot explain. Since there are 7,213,446 rows in total, 4,000 is an acceptable amount of missing data, and we keep these rows as well for accurate total departure/arrival counts.
We then calculate departure_delay_avg, arrival_delay_avg and the other columns of interest.

In [5]:
# Departure side: group the flights once by origin airport and derive every
# per-airport statistic from that grouping. GroupBy.mean() skips NaN, so a
# flight with a missing delay/taxi value still counts towards total_departure
# but does not enter the averages.
flights_by_origin = delay_2018_df.groupby('ORIGIN')
unpleasant_2018_departure = pd.DataFrame({
    'total_departure': flights_by_origin.size(),
    'departure_delay_avg': flights_by_origin['DEP_DELAY'].mean(),
    'departure_taxi_avg': flights_by_origin['TAXI_OUT'].mean(),
    'departure_cancelled_avg': flights_by_origin['CANCELLED'].mean(),
    'departure_distance_avg': flights_by_origin['DISTANCE'].mean(),
})

# Arrival side: the same statistics with the airport as destination.
flights_by_dest = delay_2018_df.groupby('DEST')
unpleasant_2018_arrival = pd.DataFrame({
    'total_arrival': flights_by_dest.size(),
    'arrival_delay_avg': flights_by_dest['ARR_DELAY'].mean(),
    'arrival_taxi_avg': flights_by_dest['TAXI_IN'].mean(),
    'arrival_diverted_avg': flights_by_dest['DIVERTED'].mean(),
    'arrival_distance_avg': flights_by_dest['DISTANCE'].mean(),
})

# Inner join on the airport code keeps only airports that appear both as an
# origin and as a destination. The result replaces the per-flight frame in
# delay_2018_df, now indexed by airport_code.
delay_2018_df = (
    unpleasant_2018_departure
    .merge(unpleasant_2018_arrival, left_index=True, right_index=True)
    .rename_axis("airport_code")
)

In [6]:
delay_2018_df.head()

Unnamed: 0_level_0,total_departure,departure_delay_avg,departure_taxi_avg,departure_cancelled_avg,departure_distance_avg,total_arrival,arrival_delay_avg,arrival_taxi_avg,arrival_diverted_avg,arrival_distance_avg
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ABE,4168,11.945071,15.095051,0.020873,619.573417,4165,5.55826,5.03768,0.004562,619.518367
ABI,2022,8.027259,13.50631,0.020277,158.0,2022,5.784016,3.747347,0.000989,158.0
ABQ,24047,8.635997,12.688534,0.009897,678.490456,24048,5.599697,5.385894,0.001747,678.586452
ABR,745,7.742198,19.377205,0.010738,257.0,745,3.716621,4.771739,0.002685,257.0
ABY,1018,15.052261,15.779543,0.010806,145.0,1018,10.642137,3.673287,0.006876,145.0


By extracting 'departure_delay_avg', 'arrival_delay_avg', 'departure_cancelled_avg' and 'arrival_diverted_avg' columns in `delay_2018_df`, we get all columns of `Y`.

In [11]:
# Outcome variables: the four "unpleasant status" averages per airport.
outcome_columns = ["departure_delay_avg", "arrival_delay_avg", "departure_cancelled_avg", "arrival_diverted_avg"]
Y = delay_2018_df.loc[:, outcome_columns]

Y.head()

Unnamed: 0_level_0,departure_delay_avg,arrival_delay_avg,departure_cancelled_avg,arrival_diverted_avg
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABE,11.945071,5.55826,0.020873,0.004562
ABI,8.027259,5.784016,0.020277,0.000989
ABQ,8.635997,5.599697,0.009897,0.001747
ABR,7.742198,3.716621,0.010738,0.002685
ABY,15.052261,10.642137,0.010806,0.006876


In [12]:
len(Y)

358

Here we can conclude that there are 358 airports in the U.S. with complete outcome variables for the research question. Save airport codes of these airports (indices of `Y`) in `unpleasant_airport_code_df`. Then, when finding data of `X`, we only need to care about airports in `unpleasant_airport_code_df`.

In [15]:
# Selecting an empty list of columns keeps only the airport_code index of Y,
# giving a column-less frame that later cells join against.
unpleasant_airport_code_df = Y.loc[:, []]

unpleasant_airport_code_df

ABE
ABI
ABQ
ABR
ABY
...
WYS
XNA
YAK
YNG
YUM


The rest of columns in `delay_2018_df` are features of `merged_X`. Let's save them to `X` temporarily.

In [16]:
# extract X features: the traffic, distance and taxi-time columns of
# delay_2018_df, copied so that X is independent of the source frame
flight_feature_columns = ["total_departure","total_arrival","departure_distance_avg","arrival_distance_avg","departure_taxi_avg","arrival_taxi_avg"]
X = delay_2018_df[flight_feature_columns].copy()

In [18]:
X.head()

Unnamed: 0_level_0,total_departure,total_arrival,departure_distance_avg,arrival_distance_avg,departure_taxi_avg,arrival_taxi_avg
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABE,4168,4165,619.573417,619.518367,15.095051,5.03768
ABI,2022,2022,158.0,158.0,13.50631,3.747347
ABQ,24047,24048,678.490456,678.586452,12.688534,5.385894
ABR,745,745,257.0,257.0,19.377205,4.771739
ABY,1018,1018,145.0,145.0,15.779543,3.673287


#### Generate 'city_name', 'code4', 'latitude', 'longitude' and 'altitude_ft' of `merged_X`

Import `datasets/original/airport/airports-extended.csv`. Select the useful columns of U.S. airports and drop rows whose 3-letter airport code is missing (stored as a placeholder string in this dataset).

Note that the 4-letter airport code ('code4') is important: airports are identified by their 4-letter codes in the runways data, so we can only extract runway data through the 'code4' column.

In [28]:
airport_loc_df = pd.read_csv("datasets/original/airport/airports-extended.csv", names=["ID","name","city_name","country","airport_code","code4","latitude","longitude","altitude_ft","UTC_offset","DST","timezone","type","information_source"])

airport_loc_df.head()

Unnamed: 0,ID,name,city_name,country,airport_code,code4,latitude,longitude,altitude_ft,UTC_offset,DST,timezone,type,information_source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [29]:
# Keep U.S. airports whose airport_code is not the placeholder string, reduce
# the frame to the location columns and index it by airport_code.
location_columns = ["city_name","airport_code","code4", "latitude","longitude","altitude_ft"]
in_united_states = airport_loc_df["country"]=="United States"
has_airport_code = airport_loc_df["airport_code"]!="\\N"
airport_loc_df = (
    airport_loc_df
    .loc[in_united_states & has_airport_code, location_columns]
    .set_index("airport_code")
)

airport_loc_df.head()

Unnamed: 0_level_0,city_name,code4,latitude,longitude,altitude_ft
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BTI,Barter Island,PABA,70.134003,-143.582001,2
K03,Fort Wainwright,PAWT,70.613403,-159.860001,35
LUR,Cape Lisburne,PALU,68.875099,-166.110001,16
PIZ,Point Lay,PPIZ,69.732903,-163.005005,22
ITO,Hilo,PHTO,19.721399,-155.048004,38


We then apply the function clean_city_name to standardize the 'city_name' column.

In [30]:
def clean_city_name(input_city):
    """Reduce a raw city name to a single lowercase keyword for cross-dataset matching.

    The value is converted with str(), trimmed and lowercased. Periods and
    digits are removed, hyphens become spaces, "saint " is abbreviated to
    "st " and the prefixes "east " / "west " are dropped. Everything from the
    first "/", "(" or "," onwards is discarded. Of what remains, only one word
    is kept: the first word when it is longer than two characters, otherwise
    the second word (so short prefixes such as "st", "la" or "el" are skipped).

    Parameters
    ----------
    input_city : any
        Raw city name; non-string values are converted with str().

    Returns
    -------
    str
        The normalised keyword (may be empty).
    """
    city = str(input_city).strip().lower()

    city = city.replace(".","")
    city = city.replace("\\\\","")
    city = city.replace("-"," ")
    # Keep the space after "st" so that "Saint Louis" and "St. Louis" are
    # reduced the same way by the word selection below (both -> "louis").
    city = city.replace("saint ","st ")
    city = city.replace("east ","")
    city = city.replace("west ","")

    city = city.translate({ord(k): None for k in digits})

    # In case the city has multiple names like "cityname1/cityname2", or a
    # qualifier in parentheses or after a comma: keep only the leading part.
    for separator in ("/", "(", ","):
        city = city.partition(separator)[0]
    city = city.strip()

    words = city.split(" ")
    if len(words) > 1:
        # A first word of one or two characters is treated as a prefix and
        # skipped in favour of the word that follows it.
        city = words[0] if len(words[0]) > 2 else words[1]
    return city.strip()

In [31]:
airport_loc_df["city_name"] = airport_loc_df["city_name"].apply(clean_city_name)# clean city name

airport_loc_df

Unnamed: 0_level_0,city_name,code4,latitude,longitude,altitude_ft
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BTI,barter,PABA,70.134003,-143.582001,2
K03,fort,PAWT,70.613403,-159.860001,35
LUR,cape,PALU,68.875099,-166.110001,16
PIZ,point,PPIZ,69.732903,-163.005005,22
ITO,hilo,PHTO,19.721399,-155.048004,38
...,...,...,...,...,...
XMR,cocoa,KXMR,28.467600,-80.566597,10
ZZV,zanesville,KZZV,39.944401,-81.892097,900
ENN,nenana,PANN,64.547302,-149.074005,362
WWA,wasilla,PAWS,61.571701,-149.539993,354


#### 2.4 

In [None]:
us_cities_df = pd.read_csv("datasets/original/city/uscities.csv")

In [None]:
us_cities_df.head()

In [None]:
def get_county_code(input_county):
    """Return the last three digits of a combined FIPS code (the remainder modulo 1000)."""
    _, county_code = divmod(int(input_county), 1000)
    return county_code

# Kept latitude and longitude to differentiate cities with same name
city_columns = ["city","state_id","county_fips","county_name","population","lat","lng"]
us_cities_df = (
    us_cities_df
    .loc[:, city_columns]
    .rename(columns={"city": "city_name"})
    .assign(
        # assign() evaluates its keywords in order: "fips" captures the full
        # code before "county_fips" is reduced to its last three digits.
        fips=lambda cities: cities["county_fips"],
        county_fips=lambda cities: cities["county_fips"].apply(get_county_code),
        city_name=lambda cities: cities["city_name"].apply(clean_city_name),
    )
)

In [None]:
us_cities_df

#### 2.5 

In [None]:
airport_prop_df = unpleasant_airport_code_df.merge(airport_loc_df,how='inner',left_index=True,right_index=True)

# Largest accepted |lat difference| + |lng difference|, in degrees, between an
# airport and the city it is matched to.
MAX_CITY_ERROR_DEG = 1.5


def _closest_city(candidate_cities, target_lat, target_lng):
    """Return the row of candidate_cities nearest to the target coordinates.

    Distance is |lat difference| + |lng difference| in degrees. Returns None
    when there is no candidate with a usable distance or when the nearest one
    is not closer than MAX_CITY_ERROR_DEG.
    """
    error = (candidate_cities["lat"] - target_lat).abs() + (candidate_cities["lng"] - target_lng).abs()
    if error.isna().all():  # also true for an empty candidate set
        return None
    position = int(np.nanargmin(error.values))
    if not error.iloc[position] < MAX_CITY_ERROR_DEG:
        return None
    return candidate_cities.iloc[position]


def _match_city(city, target_lat, target_lng):
    """Return the us_cities_df row matched to an airport, or None when nothing is close enough.

    A unique name match is accepted as is. Several name matches are resolved
    by distance. With no acceptable name match, the nearest city of the whole
    table is used instead.
    """
    same_name_cities = us_cities_df[us_cities_df["city_name"]==city]
    if same_name_cities.shape[0] == 1:
        return same_name_cities.iloc[0]
    target_city = _closest_city(same_name_cities, target_lat, target_lng)
    if target_city is None:
        target_city = _closest_city(us_cities_df, target_lat, target_lng)
    return target_city


# Collect one record per airport and build the frame once at the end, instead
# of growing a DataFrame row by row.
city_search_records = []
for ind,row in airport_prop_df.iterrows():
    city = row["city_name"]
    target_city = _match_city(city, row["latitude"], row["longitude"])
    if target_city is None:
        print("No data for ",city)
        city_search_records.append({"airport_code":ind,"state_id":np.nan,"county_id":np.nan,"city_id":np.nan,"fips":np.nan,"population":np.nan})
        continue
    city_search_records.append({
        "airport_code":ind,
        "state_id":target_city["state_id"],
        "county_id":str(target_city["county_fips"]).zfill(3), # county code as a zero-padded 3-character string
        "city_id":target_city.name, # index label of the city in us_cities_df
        "fips":target_city["fips"],
        "population":target_city["population"],
    })
city_search_df = pd.DataFrame(city_search_records, columns=["airport_code","state_id","county_id","city_id","fips","population"])

#special case for DC
# A single .loc assignment on the frame itself; chained indexing such as
# df.iloc[i]["col"] = value may modify a temporary copy and be lost.
is_dca = city_search_df["airport_code"]=="DCA"
city_search_df.loc[is_dca, ["state_id","county_id"]] = ["MD","511"]

# Remove Hawaii and Alaska
city_search_df = city_search_df[(city_search_df["state_id"]!="HI") & (city_search_df["state_id"]!="AK")]
airport_prop_df = airport_prop_df.merge(city_search_df.set_index("airport_code").loc[:,["city_id","fips","population"]],left_index=True,right_index=True)

In [None]:
city_search_df

In [None]:
airport_prop_df # prop: property

#### 2.6 

In [None]:
def download_climate_data(state,county,year):
    """Download and cache the county temperature and precipitation time series.

    For each of the parameters "tavg" and "pcp" the CSV is downloaded once
    into datasets/original/weather/<parameter>/, its first four data rows are
    skipped, and the value column is kept. Both series are then written side
    by side to datasets/original/weather/<state><county>_<year>.csv. Nothing
    is done when that combined file already exists.

    Parameters
    ----------
    state : str
        State identifier used in the request URL and the file name.
    county : str
        County code used in the request URL and the file name.
    year : int or str
        Only used to build the file name.

    NOTE(review): the requested range is hard-coded to 2000-2020 in the URL, so
    the cached file appears to hold that whole range whatever `year` is.
    Confirm whether consumers should restrict it to `year`.
    """
    save_path = "datasets/original/weather/"
    fname = state + county + "_" + str(year) + ".csv"
    if glob.glob(save_path + fname):
        return # combined file already cached

    url_template = "https://www.ncdc.noaa.gov/cag/county/time-series/{}-{}-{}-all-1-2000-2020.csv?base_prd=true&begbaseyear=1901&endbaseyear=2000"
    climate_series = []
    for parameter in ("tavg", "pcp"):
        raw_path = save_path + parameter + "/" + parameter + "_" + fname
        # One request per parameter: wget performs the download itself.
        wget.download(url_template.format(state,county,parameter),out=raw_path)

        parameter_df = pd.read_csv(raw_path).iloc[4:]
        parameter_df.columns=["date",parameter,"comp"]
        climate_series.append(parameter_df.set_index("date")[parameter])

    pd.concat(climate_series, axis=1).to_csv(save_path + fname)
    time.sleep(1) # not requesting too frequently

In [None]:
counter = 0
# Download data according to city_search_df
for counter,(ind,row) in enumerate(city_search_df.iterrows(), start=1):
    try:
        download_climate_data(row["state_id"],row["county_id"],2018)
    except Exception:
        # Best effort: report the airport whose download failed and carry on.
        # Catching Exception (not a bare except) keeps Ctrl-C able to stop the loop.
        print(row)
    print("progress: {:.2f}%   Just done: {}".format(100 * counter / city_search_df.shape[0],row["airport_code"]),end="\r")

# Process downloaded data: one record per airport, assembled into a frame once
save_path = "datasets/original/weather/"
temp_pcp_records = []
for ind,row in city_search_df.iterrows():
    try:
        fname = row["state_id"] + row["county_id"] + "_2018.csv"
        climate_means = pd.read_csv(save_path + fname)[["tavg","pcp"]].mean()
        tavg, pcp = climate_means["tavg"], climate_means["pcp"]
    except Exception:
        # Missing state/county or missing/unreadable file: record NaN for this airport.
        tavg, pcp = np.nan, np.nan
    temp_pcp_records.append({"airport_code":row["airport_code"],"temp_avg":tavg,"pcp_avg":pcp})
temp_pcp_df = pd.DataFrame(temp_pcp_records, columns=["airport_code","temp_avg","pcp_avg"])

In [None]:
airport_prop_df = airport_prop_df.merge(temp_pcp_df.set_index("airport_code"),left_index=True,right_index=True)
airport_prop_df

#### 2.7 

In [None]:
bird_strike_df = pd.read_excel("datasets/original/airport/Bird Strikes.xlsx") #data6
airport_name_df = pd.read_excel("datasets/original/airport/airportcode.xlsx") #data7 

In [None]:
bird_strike_df.head()

In [None]:
# Keep the airport name and the damage indicator, give them short names, and
# drop incomplete rows before renumbering the index.
bird_strike_df = (
    bird_strike_df
    .loc[:, ["Airport: Name", "Effect: Indicated Damage"]]
    .rename(columns={"Airport: Name": "airport_name", "Effect: Indicated Damage": "bird_strike_effect"})
    .dropna()
    .reset_index(drop=True)
)
bird_strike_df.head()

In [None]:
airport_name_df.head()

In [None]:
airport_name_df = airport_name_df.dropna()
airport_name_df = airport_name_df.reset_index(drop = True)
airport_name_df.head()

In [None]:
def standardize_airport_name(string):
    """Normalise an airport name so the same airport matches across datasets.

    The name is lowercased and trimmed, the generic tokens listed below are
    removed wherever they occur, and the result is trimmed again. Whitespace
    left inside the name by a removed token is not collapsed.
    """
    name = string.lower().strip()
    # Tokens are removed in this fixed order.
    generic_tokens = ('intl', 'arpt', 'regional', 'airport', 'sunport',
                      'international', 'intercontinental')
    for token in generic_tokens:
        name = name.replace(token, '')
    return name.strip()

In [None]:
bird_strike_df['airport_name'] = bird_strike_df['airport_name'].apply(standardize_airport_name)
airport_name_df['airport_name'] = airport_name_df['airport_name'].apply(standardize_airport_name)

def check_strike (string):
    """Return 1 for any input: every record counts as one strike."""
    return 1

def check_damage (string):
    """Return 1 when the effect text contains 'Caused', otherwise 0."""
    return 1 if 'Caused' in string else 0

bird_strike_df['strike'] = bird_strike_df['bird_strike_effect'].apply(check_strike)
bird_strike_df['damage'] = bird_strike_df['bird_strike_effect'].apply(check_damage)
bird_strike_df.head()

In [None]:
# Total number of strike records per (standardized) airport name. Passing a
# list of functions to agg() yields two-level column labels, which is why the
# columns are renamed to flat names after the merge below.
grouped_strike = bird_strike_df.groupby('airport_name').agg({'strike':['sum']})
grouped_strike = grouped_strike.reset_index()
# Total number of damage-causing strikes per airport name (airport_name stays in the index).
grouped_damage = bird_strike_df.groupby('airport_name').agg({'damage':['sum']})

# Combine both totals into one row per airport name and flatten the column labels.
bird_strike_sum_df = pd.merge(grouped_strike, grouped_damage, on='airport_name')
bird_strike_sum_df.columns = ['airport_name', 'strike_sum','damage_sum']

# Attach the totals to the airport-name table; pd.merge defaults to an inner
# join, so only names present in both tables are kept.
bird_strike_avg_df = pd.merge(airport_name_df, bird_strike_sum_df, on='airport_name')

def average_sum(input):
    """Turn a total accumulated over 2000-2011 into a per-year average."""
    years_covered = 2011 - 2000 + 1  # both end years are counted
    return input / years_covered

# Replace the per-airport totals by yearly averages and drop the helper
# columns (the totals and the name used only for matching).
bird_strike_avg_df = (
    bird_strike_avg_df
    .assign(
        strike_avg=bird_strike_avg_df['strike_sum'].apply(average_sum),
        damage_avg=bird_strike_avg_df['damage_sum'].apply(average_sum),
    )
    .drop(columns=['strike_sum', 'damage_sum'])
    .drop(columns=["airport_name"])
)
bird_strike_avg_df.head()

We later need to merge this dataframe to `airport_prop_df`. However, not all airports have bird strike. Therefore, we used a right merge(that kept all rows from `airport_prop_df`) and filled nan with 0. 

In [None]:
# Right merge: keep every airport of unpleasant_airport_code_df (which has the
# same index as airport_prop_df); airports without bird strike records get 0.
bird_strike_final_df = (
    bird_strike_avg_df
    .merge(unpleasant_airport_code_df.reset_index(), how='right')
    .fillna(0)
    .set_index("airport_code")
)
bird_strike_final_df.head()

#### 2.8 

In [None]:
airport_runways_df=pd.read_csv("datasets/original/airport/runways.csv")

airport_runways_df.head()

In [None]:
# Runways are keyed by the 4-character code; join them to the airport table on
# code4 to obtain the airport_code of each runway, then keep the size columns.
airport_runways_df = (
    airport_runways_df
    .rename(columns={"airport_ident": "code4"})
    .merge(airport_loc_df.reset_index(), how='inner', on='code4')
    .loc[:, ["code4", "airport_code", "length_ft", "width_ft"]]
)
airport_runways_df.head()

In [None]:
# Aggregate only the numeric runway columns. Aggregating the whole frame would
# also process the text column code4 (string concatenation under sum, and an
# error under mean in pandas versions where numeric_only defaults to False).
runway_size_columns = ["length_ft", "width_ft"]
runways_by_airport = airport_runways_df.groupby(['airport_code'])[runway_size_columns]
runways_sum_df = runways_by_airport.sum()
runways_mean_df = runways_by_airport.mean()
runways_count_df = runways_by_airport.count()

# One row per airport: total runway length, mean runway width, and the number
# of runways with a recorded length.
runway_final_df = pd.DataFrame({
    "length_ft_sum": runways_sum_df["length_ft"],
    "width_ft_avg": runways_mean_df["width_ft"],
    "runway_count": runways_count_df["length_ft"],
})

runway_final_df.head()

#### 2.9 

In [None]:
enplanements_df = pd.read_excel('datasets/original/city/commercial_service_enplanements.xlsx')

In [None]:
enplanements_df.head()

In [None]:
# Index by airport code and keep only the 2018 enplanements column.
enplanements_df = (
    enplanements_df
    .rename(columns={"Locid": "airport_code", "CY 18 Enplanements": "enplanements"})
    .set_index("airport_code")
    .loc[:, ["enplanements"]]
)

enplanements_df.head()

#### 2.10 

In [None]:
security_df = pd.read_excel("datasets/original/airport/security_wait_times.xls")

In [None]:
security_df = security_df.fillna(0)
security_df = security_df.rename(columns={"Code":"airport_code"})
security_df

In [None]:
security_df["security_avg"] = 0.0

In [None]:
def calculate_security_avg(df):
    """Fill df["security_avg"] with each row's mean positive wait time.

    For every row, the values in column positions 3 to 127 are scanned and
    only those greater than 0 are averaged. A row without any positive value
    gets NaN instead of raising ZeroDivisionError.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "security_avg" column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, for convenience.
    """
    for index, row in df.iterrows():
        # .iloc makes the positional slice explicit.
        positive_waits = [wait for wait in row.iloc[3:128] if wait > 0]
        if positive_waits:
            df.at[index, "security_avg"] = sum(positive_waits) / len(positive_waits)
        else:
            df.at[index, "security_avg"] = float("nan")
    return df

In [None]:
# Compute the per-row averages, then add them up per airport code. The label 0
# is dropped afterwards: it is the group of rows whose code was missing and was
# filled with 0 earlier.
security_df = calculate_security_avg(security_df)
security_df = (
    security_df
    .loc[:, ["airport_code", "security_avg"]]
    .groupby("airport_code")
    .sum()
    .drop(0)
)
security_df

In [None]:
# Align security_df with the studied airports on the index. concat(axis=1)
# uses an outer join by default, and unpleasant_airport_code_df has no columns,
# so it only contributes index labels; dropna() then removes the airports that
# have no security_avg value.
# NOTE(review): because the join is outer, airports present only in security_df
# appear to survive this step; the restriction to studied airports seems to
# happen only in the final merge. Confirm this is intended.
security_df = pd.concat([security_df, unpleasant_airport_code_df], axis=1, sort=False)
security_df = security_df.dropna()
security_df

### Final Cleaning step: merging all dataframes together to get X and Y.

In [None]:
# Put all feature tables side by side on the airport_code index and keep only
# the airports for which every feature is available.
feature_frames = [X, airport_prop_df, bird_strike_final_df, enplanements_df, runway_final_df, security_df]
merged_X_df = pd.concat(feature_frames, axis=1, sort=False).dropna()
merged_X_df

In [None]:
merged_X_df.columns

In [None]:
# Rebuild the outcome table and restrict it to the airports that remain in
# merged_X_df: a right merge against its column-less index keeps exactly those rows.
final_outcome_columns = ["departure_delay_avg", "arrival_delay_avg", "departure_cancelled_avg", "arrival_diverted_avg"]
Y = delay_2018_df[final_outcome_columns].merge(
    merged_X_df[[]],
    how="right",
    left_index=True,
    right_index=True,
)
Y

In [None]:
merged_X_df.to_csv("datasets/merged/merged_X.csv")
Y.to_csv("datasets/merged/Y.csv")