In [246]:
# load all modules
import numpy as np
import pandas as pd
import glob as glob
try: # the URL-handling module is named differently across Python versions
    import urllib2 as urllib # URL handling module (Python 2 name)
except ImportError:
    import urllib.request as urllib # Python 3 name, bound to the same alias
import json

In [268]:
# collect the paths of all raw DC trip csv files, in sorted order
all_trips = sorted(glob.glob("./data/raw/DC/*.csv"))

In [269]:
# the only columns needed from each trips file
trip_columns = ['start_lat', 'start_lng', 'end_lng', 'end_lat', 'start_station_id',
                'end_station_id']

# read every csv file into its own dataframe and collect the frames in a list
washingtonDC_trips = [
    pd.read_csv(csv_path, index_col=None, header=0, skipinitialspace=True, usecols=trip_columns)
    for csv_path in all_trips
]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [271]:
# stack the per-file frames row-wise into one dataframe with a fresh 0..n-1 index
washingtonDC_trips_df = pd.concat(
    washingtonDC_trips,
    axis=0,
    ignore_index=True,
)

In [272]:
# dimensions of the merged trips dataframe, as (rows, columns)
washingtonDC_trips_df.shape 

(1418520, 6)

In [273]:
# count the missing (NaN) values in each column
washingtonDC_trips_df.isna().sum() 

start_station_id    102989
end_station_id      116142
start_lat               10
start_lng               10
end_lat               4728
end_lng               4728
dtype: int64

In [274]:
# discard every trip row with a missing value in any column, then re-check the size
washingtonDC_trips_df = washingtonDC_trips_df.dropna()
washingtonDC_trips_df.shape

(1265145, 6)

In [275]:
# latitude and longitude co-ordinate limits for Washington DC
# NOTE: these must be plain floats. A trailing comma after the number
# (e.g. `lat_min = 38.8,`) silently creates a 1-element tuple instead.
lat_min = 38.806229986847676
lat_max = 38.999466837911626
lng_min = -77.12409668146095
lng_max = -76.90252862586087

In [276]:
# keep only trips whose start point lies inside the Washington DC bounding box
# (both limits inclusive); end co-ordinates are not filtered here
start_lat_in_bounds = washingtonDC_trips_df['start_lat'].between(lat_min, lat_max)
start_lng_in_bounds = washingtonDC_trips_df['start_lng'].between(lng_min, lng_max)
washingtonDC_trips_df = washingtonDC_trips_df[start_lat_in_bounds & start_lng_in_bounds]

In [277]:
# (rows, columns) remaining after the co-ordinate limits filter
washingtonDC_trips_df.shape 

(1232184, 6)

In [278]:
# cast the start station id column to integer
washingtonDC_trips_df['start_station_id'] = washingtonDC_trips_df['start_station_id'].astype(int)

# keep the first trip row seen for each start station: one row per station
washingtonDC_stations_df = washingtonDC_trips_df.drop_duplicates(subset=['start_station_id'])

In [279]:
# (rows, columns) of the de-duplicated stations dataframe
washingtonDC_stations_df.shape

(500, 6)

In [280]:
# order the stations by id and renumber the rows from zero
washingtonDC_stations_df = (
    washingtonDC_stations_df
    .sort_values('start_station_id')
    .reset_index(drop=True)
)
washingtonDC_stations_df

Unnamed: 0,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng
0,31000,31064.0,38.858971,-77.053230,38.865029,-77.039511
1,31001,31064.0,38.857250,-77.053320,38.865029,-77.039511
2,31002,31062.0,38.856425,-77.049232,38.896553,-77.067140
3,31003,31266.0,38.860132,-77.049541,38.905460,-77.027346
4,31004,31084.0,38.857866,-77.059490,38.802677,-77.063562
...,...,...,...,...,...,...
495,32420,32422.0,38.965742,-76.954803,38.968842,-76.954171
496,32421,32408.0,38.962119,-76.935578,38.959361,-76.946361
497,32422,32413.0,38.968887,-76.954183,38.939256,-76.955022
498,32900,32900.0,38.964406,-77.010759,38.964406,-77.010759


In [281]:
# Look up the elevation of each station's start point with the HERE Routing v8 API.
# Input : washingtonDC_stations_df, whose rows hold origin-destination lat/long pairs
# Output: s_elv_list, one start-point elevation per row in row order
#         (NaN when the response carries no elevation)
# NOTE(review): `apiKey` is not defined in the visible part of this notebook; it must
# already exist in the kernel when this cell runs. Confirm where it is set, and load
# it from the environment rather than hardcoding it.

# give up on a stalled connection instead of blocking the cell forever
REQUEST_TIMEOUT_S = 30

s_elv_list = []

for index, row in washingtonDC_stations_df.iterrows():
    start_lat = row['start_lat']
    start_lng = row['start_lng']
    end_lat = row['end_lat']
    end_lng = row['end_lng']

    url = 'https://router.hereapi.com/v8/routes?apiKey={}&origin={},{}&transportMode=car&destination={},{}&return=elevation'.format(apiKey, start_lat, start_lng, end_lat, end_lng)

    # close the response explicitly so connections are not leaked across the loop;
    # try/finally is used because the Python 2 urllib2 response is not a context manager
    response = urllib.urlopen(url, timeout=REQUEST_TIMEOUT_S)
    try:
        data = json.loads(response.read().decode('utf-8'))
    finally:
        response.close()

    # append elevation to 'start' list, nan if no elevation retrieved;
    # only a missing or oddly shaped payload is tolerated, so other errors still surface
    try:
        s_elv = data['routes'][0]['sections'][0]['departure']['place']['location']['elv']
    except (KeyError, IndexError, TypeError):
        s_elv = np.nan

    s_elv_list.append(s_elv)

In [282]:
# add the start point elevations as a new column; s_elv_list was built by iterating
# over this dataframe's rows, so its order matches the rows
washingtonDC_stations_df['start_station_elv'] = s_elv_list

In [283]:
# shape the dataframe for csv export: drop the trip-end columns and
# give the remaining station columns their final names
export_column_names = {
    'start_station_id': 'station_id',
    'start_lat': 'station_latitude',
    'start_lng': 'station_longitude',
    'start_station_elv': 'station_elevation',
}
washingtonDC_stations_df = (
    washingtonDC_stations_df
    .drop(columns=['end_station_id', 'end_lat', 'end_lng'])
    .rename(columns=export_column_names)
)
washingtonDC_stations_df

Unnamed: 0,station_id,station_latitude,station_longitude,station_elevation
0,31000,38.858971,-77.053230,-16.0
1,31001,38.857250,-77.053320,-16.0
2,31002,38.856425,-77.049232,-16.0
3,31003,38.860132,-77.049541,-17.0
4,31004,38.857866,-77.059490,-11.0
...,...,...,...,...
495,32420,38.965742,-76.954803,8.0
496,32421,38.962119,-76.935578,-5.0
497,32422,38.968887,-76.954183,5.0
498,32900,38.964406,-77.010759,42.0


In [284]:
# write the station table (including its row index) to a utf-8 encoded csv file
stations_export_path = './data/exports/washington_dc_stations_elevation.csv'
washingtonDC_stations_df.to_csv(stations_export_path, encoding='utf-8')