# Bird Strike Cleaning

In [406]:
import pandas as pd
import numpy as np
import datetime as dt

In [407]:
# Read the raw bird-strike workbook and keep only the airport name,
# flight date and damage-indicator columns.
df = pd.read_excel("datasets/airports/Bird Strikes.xlsx").loc[
    :, ['Airport: Name', 'FlightDate', 'Effect: Indicated Damage']
]
df

Unnamed: 0,Airport: Name,FlightDate,Effect: Indicated Damage
0,LAGUARDIA NY,2000-11-23,Caused damage
1,DALLAS/FORT WORTH INTL ARPT,2001-07-25,Caused damage
2,LAKEFRONT AIRPORT,2001-09-14,No damage
3,SEATTLE-TACOMA INTL,2002-09-05,No damage
4,NORFOLK INTL,2003-06-23,No damage
...,...,...,...
25553,REDDING MUNICIPAL,2011-12-30,No damage
25554,ORLANDO INTL,2011-12-30,No damage
25555,,NaT,No damage
25556,DETROIT METRO WAYNE COUNTY ARPT,2011-12-31,No damage


In [408]:
# Discard every record that has a missing value in any of the kept columns.
df = df.dropna(axis=0, how='any')

In [409]:
# NOTE: no year filter is applied in this cell (it contains no code); records from every year in the data set are kept.


In [410]:
# Keep only the airport name and the damage indicator, under shorter column names.
df = df.reset_index(drop=True).drop(columns=['FlightDate'])
df.columns = ['airport_name', 'strike_effect']
df

Unnamed: 0,airport_name,strike_effect
0,LAGUARDIA NY,Caused damage
1,DALLAS/FORT WORTH INTL ARPT,Caused damage
2,LAKEFRONT AIRPORT,No damage
3,SEATTLE-TACOMA INTL,No damage
4,NORFOLK INTL,No damage
...,...,...
25424,SACRAMENTO INTL,No damage
25425,REDDING MUNICIPAL,No damage
25426,ORLANDO INTL,No damage
25427,DETROIT METRO WAYNE COUNTY ARPT,No damage


In [411]:
# Load the airport-code lookup table and discard rows with missing values.
df_code = pd.read_excel("datasets/airports/airportcode.xlsx").dropna()

In [412]:
#standardize airport names in df and df_code so we can merge them later

def standardize_name(string):
    """Normalize an airport name so names from different tables can be matched.

    The name is lower-cased, a fixed set of generic tokens ('intl', 'arpt',
    'regional', 'airport', 'sunport', 'international', 'intercontinental')
    is removed, and whitespace is cleaned up.

    Parameters
    ----------
    string : str
        Raw airport name.

    Returns
    -------
    str
        The normalized name, with no leading/trailing whitespace and with
        every internal run of whitespace collapsed to a single space.
    """
    # Tokens are removed as plain substrings, in this order.
    removable_tokens = (
        'intl',
        'arpt',
        'regional',
        'airport',
        'sunport',
        'international',
        'intercontinental',
    )

    cleaned = string.lower().strip()
    for token in removable_tokens:
        cleaned = cleaned.replace(token, '')

    # Removing a token from the middle of a name leaves consecutive spaces
    # behind (e.g. "george bush   houston"); split/join collapses them and
    # also trims both ends, so equal names compare equal when merging.
    return ' '.join(cleaned.split())

# Apply the same name normalization to both tables so their names are comparable.
for frame in (df_code, df):
    frame['airport_name'] = frame['airport_name'].apply(standardize_name)
df_code

Unnamed: 0,airport_code,airport_name
0,ABE,lehigh valley
1,ABI,abilene
2,ABQ,albuquerque
3,ABY,southwest georgia
4,ACK,nantucket memorial
...,...,...
754,WYS,west yellowstone
755,XNA,northwest arkansas
756,YAK,yakutat
757,YKM,yakima air terminal


In [413]:
# Quantify the strike data and the damage data separately

def check_strike (string):
    """Return 1 for any input; the argument's value is not inspected."""
    return 1

def check_damage (string):
    """Return 1 when the text contains the substring 'Caused', otherwise 0."""
    return 1 if 'Caused' in string else 0

# Derive two numeric indicator columns from the textual strike effect.
effect_text = df['strike_effect']
df['strike'] = effect_text.apply(check_strike)
df['damage'] = effect_text.apply(check_damage)


In [414]:
# Calculate how many strikes and strikes causing damage for each airport

# Build the per-airport grouping once and reuse it for both totals.
per_airport = df.groupby('airport_name')
grouped_strike = per_airport.agg({'strike': ['sum']}).reset_index()
grouped_damage = per_airport.agg({'damage': ['sum']})

In [415]:
# Combine the per-airport strike and damage totals into one table.
df_merged = grouped_strike.merge(grouped_damage, on='airport_name')
df_merged.columns = ['airport_name', 'strike_sum', 'damage_sum']
df_merged

Unnamed: 0,airport_name,strike_sum,damage_sum
0,aberdeen ar,17,1
1,abilene,9,1
2,abraham lincoln capital,30,4
3,adams county- legion field,1,1
4,adams field,67,5
...,...,...,...
1100,yellowstone,5,0
1101,youngstown-warren rgnl,1,0
1102,zamperini field,1,0
1103,zelienople municipal,1,1


In [416]:
# Attach airport codes; only names present in both tables are kept (inner join).
df_final = df_code.merge(df_merged, how='inner', on='airport_name')
df_final

Unnamed: 0,airport_code,airport_name,strike_sum,damage_sum
0,ABE,lehigh valley,63,2
1,ABI,abilene,9,1
2,ABQ,albuquerque,82,3
3,ABY,southwest georgia,5,1
4,ACK,nantucket memorial,7,1
...,...,...,...,...
212,TYS,mcghee tyson,41,2
213,UNV,university park,4,2
214,VLD,valdosta,3,2
215,YAK,yakutat,6,3


In [417]:
def average_sum(input, first_year=2000, last_year=2011):
    """Convert a total into a per-year average over an inclusive year span.

    Parameters
    ----------
    input : number
        Total to average. The name shadows the ``input`` builtin; it is kept
        unchanged so existing callers are unaffected.
    first_year : int, optional
        First year of the span (inclusive). Defaults to 2000.
    last_year : int, optional
        Last year of the span (inclusive). Defaults to 2011.

    Returns
    -------
    float
        ``input`` divided by the number of years in the span
        (12 with the default arguments).

    Raises
    ------
    ValueError
        If ``last_year`` is earlier than ``first_year``.
    """
    # Both end points are included, hence the + 1.
    n_years = last_year - first_year + 1
    if n_years <= 0:
        raise ValueError("last_year must not be earlier than first_year")
    return input / n_years

# Turn the totals into per-year averages, then discard the totals.
df_final = df_final.assign(
    strike_avg=df_final['strike_sum'].apply(average_sum),
    damage_avg=df_final['damage_sum'].apply(average_sum),
).drop(columns=['strike_sum', 'damage_sum'])
# Display only: the result is not assigned back, so df_final keeps
# airport_code as a regular column.
df_final.set_index('airport_code')

Unnamed: 0_level_0,airport_name,strike_avg,damage_avg
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABE,lehigh valley,5.250000,0.166667
ABI,abilene,0.750000,0.083333
ABQ,albuquerque,6.833333,0.250000
ABY,southwest georgia,0.416667,0.083333
ACK,nantucket memorial,0.583333,0.083333
...,...,...,...
TYS,mcghee tyson,3.416667,0.166667
UNV,university park,0.333333,0.166667
VLD,valdosta,0.250000,0.166667
YAK,yakutat,0.500000,0.250000


In [418]:
# Load the airport-code / city-name table from cities_need_bird.csv.
df_need = pd.read_csv("cities_need_bird.csv")
df_need

Unnamed: 0,airport_code,city_name
0,ABE,allentown
1,ABI,abilene
2,ABQ,albuquerque
3,ABY,albany
4,ACK,nantucket
...,...,...
291,WRG,wrangell
292,WYS,west yellowstone
293,XNA,bentonville
294,YAK,yakutat


In [419]:
# Left-join the averages onto every airport in df_need, replace missing values
# (NaN) with zero, and drop the columns that are not needed.
# The join key is named explicitly so the merge does not silently depend on
# whichever column names the two tables happen to share.
df_final = (
    pd.merge(df_need, df_final, how='left', on='airport_code')
    .fillna(0)
    .drop(columns=['city_name', 'airport_name'])
    .set_index('airport_code')
)
df_final

Unnamed: 0_level_0,strike_avg,damage_avg
airport_code,Unnamed: 1_level_1,Unnamed: 2_level_1
ABE,5.250000,0.166667
ABI,0.750000,0.083333
ABQ,6.833333,0.250000
ABY,0.416667,0.083333
ACK,0.583333,0.083333
...,...,...
WRG,0.000000,0.000000
WYS,0.000000,0.000000
XNA,0.000000,0.000000
YAK,0.500000,0.250000
