In [97]:
import numpy as np
import pandas as pd

flights = pd.read_csv("Cleaned_2018_Flights.csv")
airports = pd.read_csv("airports.csv")
trails = pd.read_csv("AllTrails data - nationalpark.csv")

In [55]:
# Finding all airports within the Flight data
arrs = list(flights['Origin'].unique())
deps = list(flights['Dest'].unique())

print("Not in Deps:", str(set(arrs) - set(deps)))
print("Not in Arrs:", str(set(deps) - set(arrs)))

total_airports = list(arrs + list(set(deps) - set(arrs)))
print("Number of airports:",str(len(total_airports)))

Not in Deps: {'JHM', 'SHD', 'RKS', 'FLG', 'SPS', 'COU', 'MKK', 'ABI', 'PRC', 'SLN'}
Not in Arrs: {'OTH', 'BRO', 'ITH', 'SBP', 'MBS', 'DRO', 'LWB'}
Number of airports: 270


In [61]:
# Finding all airports in the LAT LONG data
airports_latlong = airports[['IATA','LATITUDE','LONGITUDE']]
latlong_df = airports_latlong[airports_latlong['IATA'].isin(total_airports)]

In [71]:
# Filtering out flights with no lat long data
count = len(flights['Origin'])
flight_df = flights[flights['Origin'].isin(latlong_df['IATA']) & flights['Dest'].isin(latlong_df['IATA'])]
new = len(flight_df['Origin'])
print("Number of flights removed:", str(count - new),'.....', str(round((new/count)*100,2))+'% remaining.')

Number of flights removed: 210174 ..... 97.8% remaining.


In [99]:
# Extracting Lat and Long info from the Trails table
# Reference: https://stackoverflow.com/questions/39169718/convert-string-to-dict-then-access-keyvalues-how-to-access-data-in-a-class 
import ast

trails_loc = trails["_geoloc"].astype('str')
trails_loc = trails_loc.apply(lambda x: ast.literal_eval(x))
trails_loc = trails_loc.apply(pd.Series)
trails = trails.drop(['_geoloc'], axis=1)
trails['Lat'] = trails_loc['lat']
trails['Long'] = trails_loc['lng']

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,route_type,visitor_usage,avg_rating,num_reviews,features,activities,units,Lat,Long
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,out and back,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,60.18852,-149.63156
1,10236086,Mount Healy Overlook Trail,Denali National Park,Denali National Park,Alaska,United States,18.0311,6920.162,507.7968,3,out and back,1.0,4.5,260,"['dogs-no', 'forest', 'views', 'wild-flowers',...","['birding', 'camping', 'hiking', 'nature-trips...",i,63.73049,-148.91968
2,10267857,Exit Glacier Trail,Kenai Fjords National Park,Seward,Alaska,United States,17.7821,2896.812,81.9912,1,out and back,3.0,4.5,224,"['dogs-no', 'partially-paved', 'views', 'wildl...","['hiking', 'walking']",i,60.18879,-149.631
3,10236076,Horseshoe Lake Trail,Denali National Park,Denali National Park,Alaska,United States,16.2674,3379.614,119.7864,1,loop,2.0,4.5,237,"['dogs-no', 'forest', 'lake', 'kids', 'views',...","['birding', 'hiking', 'nature-trips', 'trail-r...",i,63.73661,-148.915
4,10236082,Triple Lakes Trail,Denali National Park,Denali National Park,Alaska,United States,12.5935,29772.79,1124.712,5,out and back,1.0,4.5,110,"['dogs-no', 'lake', 'views', 'wild-flowers', '...","['birding', 'fishing', 'hiking', 'nature-trips...",i,63.73319,-148.89682


In [117]:
# Finding nearest airport to each hike
# Reference: https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
from math import radians, cos, sin, asin, sqrt
import time

def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 3956 # Radius of earth in miles
    return c * r

start = time.time()
nearest_airport = ['test']
distance = [0]
for index, row in trails.iterrows():
    trail_lat = row['Lat']
    trail_lon = row['Long']
    min_dist = 0
    min_port = ''
    
    for index2, row2 in latlong_df.iterrows():
        airport_lat = row2['LATITUDE']
        airport_lon = row2['LONGITUDE']
        
        curr_dist = haversine(trail_lon, trail_lat, airport_lon, airport_lat)
        
        if min_dist == 0 or curr_dist < min_dist:
            min_dist = curr_dist
            min_port = row2['IATA']
            
    #print('Found:',str(min_dist),'-',min_port)
    
    nearest_airport.append(min_port)
    distance.append(min_dist)
    
end = time.time()
print('Found',str(len(distance)-1),'matches in',str(round(end-start,0)),'seconds.')

nearest_airport.pop(0)
distance.pop(0)

trails['Nearest Airport'] = nearest_airport
trails['Distance to Airport'] = distance

Found 3313 matches in 77.0 seconds.


In [128]:
# Review final dataframes

# Flights
flights = flight_df
flights.head(1)

Unnamed: 0.1,Unnamed: 0,ItinID,MktID,MktCoupons,Quarter,Origin,OriginWac,Dest,DestWac,Miles,ContiguousUSA,NumTicketsOrdered,AirlineCompany,PricePerTicket
0,0,20181767585,2018176758501,1,1,PHL,23,LAX,91,2402.0,2,1.0,AA,672.87


In [129]:
# Trails
trails = trails
trails.head(1)

Unnamed: 0,trail_id,name,area_name,city_name,state_name,country_name,popularity,length,elevation_gain,difficulty_rating,...,visitor_usage,avg_rating,num_reviews,features,activities,units,Lat,Long,Nearest Airport,Distance to Airport
0,10020048,Harding Ice Field Trail,Kenai Fjords National Park,Seward,Alaska,United States,24.8931,15610.598,1161.8976,5,...,3.0,5.0,423,"['dogs-no', 'forest', 'river', 'views', 'water...","['birding', 'camping', 'hiking', 'nature-trips...",i,60.18852,-149.63156,ANC,69.171875


In [130]:
# Airports
airports = latlong_df
airports.head(1)

Unnamed: 0,IATA,LATITUDE,LONGITUDE
0,ABQ,35.040222,-106.609194
