<a href="https://colab.research.google.com/github/chloebs4590/Metis-Engineering/blob/main/car_emissions_data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description of Notebook
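This notebook collects the data needed to estimate car emissions between Amtrak stations: it computes the driving distance for every pair of stations on each route with Google's Distance Matrix API, then requests an emissions estimate for each distance from the Climatiq API.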

In [None]:
reset -fs

In [None]:
!pip install googlemaps

In [None]:
import pandas as pd
import numpy as np
import os
from itertools import combinations
import itertools
import time
import pickle
import requests
import googlemaps

In [None]:
# Mount Google Drive so the project files are reachable from this runtime.
from os.path import join
from google.colab import drive

ROOT = "/content/drive"  # mount point used for the drive
print(ROOT)              # show the mount point (optional)

drive.mount(ROOT)

/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Authenticate the Colab user, then build a gspread client from the
# resulting default credentials.
import gspread
from google.auth import default
from google.colab import auth

auth.authenticate_user()

# default() returns a pair; only the credentials (first item) are used here.
creds, _ = default()

gc = gspread.authorize(creds)

In [None]:
# Switch to the project folder on the mounted Drive so the relative file
# names used by the pickle reads/writes below resolve there.
os.chdir('/content/drive/MyDrive/Data Science Metis/Engineering')

Read in data

In [None]:
# Pull every cell of the first sheet of the routes/stations spreadsheet.
spreadsheet = gc.open('Amtrak Routes and Train Stations_rev')
worksheet = spreadsheet.sheet1
rows = worksheet.get_all_values()

# The first row is skipped (presumably the sheet's header row); the rest
# become the records of the frame.
routes_stations = pd.DataFrame.from_records(rows[1:])

In [None]:
# Name the two columns of the sheet data (the frame was built without column labels).
routes_stations.columns = ['route','station_name']

In [None]:
# Load the geocoded station locations saved earlier as a pickle.
# NOTE: pickle.load executes arbitrary code from the file; only load files you trust.
with open('stations_locations_geocoded.pkl', mode='rb') as pkl_file:
  stations_locations = pickle.load(pkl_file)

In [None]:
# Make sure latitude and longitude are numeric before they are combined.
stations_locations = stations_locations.assign(
    latitude=pd.to_numeric(stations_locations['latitude']),
    longitude=pd.to_numeric(stations_locations['longitude']),
)

In [None]:
# Pair each station's latitude and longitude into one (latitude, longitude) tuple.
stations_locations['coordinates'] = list(
    zip(stations_locations['latitude'], stations_locations['longitude'])
)

In [None]:
# Discard the columns that are not needed; latitude/longitude now live in
# the 'coordinates' column.
unused_columns = [
    'longitude',
    'latitude',
    'objectid_1',
    'objectid',
    'station_descripton',
    'bus_or_train',
    'address_2',
    'name',
]
stations_locations = stations_locations.drop(columns=unused_columns)

In [None]:
# Also remove the 'x' and 'y' columns.
stations_locations = stations_locations.drop(['x', 'y'], axis=1)

In [None]:
# Check the (rows, columns) shape of the trimmed station table.
stations_locations.shape

(540, 9)

In [None]:
# Preview the first rows of the station table.
stations_locations.head()

Unnamed: 0,zip_code,state,city,address_1,code,station_name,state_rev,full_address,coordinates
0,21001,MD,Aberdeen,18 East Bel Air Avenue,ABE,"Aberdeen, MD",Maryland,"18 East Bel Air Avenue, Aberdeen, Maryland 21001","(39.5084117, -76.163203)"
1,12144,NY,Rensselaer,525 East Street,ALB,"Albany-Rensselaer, NY",New York,"525 East Street, Rensselaer, New York 12144","(42.6413525, -73.7411492)"
2,97321,OR,Albany,110 10th Avenue SW,ALY,"Albany, OR",Oregon,"110 10th Avenue SW, Albany, Oregon 97321","(44.6301042, -123.1029538)"
3,49224,MI,Albion,300 North Eaton Street,ALI,"Albion, MI",Michigan,"300 North Eaton Street, Albion, Michigan 49224","(42.2472581, -84.7556711)"
4,87102,NM,Albuquerque,320 1st Street SW,ABQ,"Albuquerque, NM",New Mexico,"320 1st Street SW, Albuquerque, New Mexico 87102","(35.0819655, -106.6476898)"


Clean up routes_stations for merging with stations_locations

In [None]:
# create code column

def create_code_col(x):
  """Return the station code embedded at the end of a station name.

  Takes the last five characters of ``x`` and strips the first and last of
  them, e.g. a trailing "(BOS)" yields "BOS".
  """
  return x[-5:][1:-1]

# Apply create_code_col to every station name to build the 'code' column
# (assumes each name ends with a 5-character "(XXX)" suffix — confirm against the sheet).
routes_stations['code'] = routes_stations['station_name'].map(create_code_col)

In [None]:
# Strip the trailing six characters (the code suffix) from each station name.
routes_stations['station_name'] = routes_stations['station_name'].str[:-6]

Left merge routes_stations with stations_locations on the station code (every route/station row is kept)

In [None]:
# Attach location details to each route/station row by station code.
# Columns present in both frames get the '_route' / '_location' suffixes.
routes_stations_locations = pd.merge(
    routes_stations,
    stations_locations,
    on='code',
    how='left',
    suffixes=('_route', '_location'),
)

In [None]:
# Check the (rows, columns) shape of the merged table.
routes_stations_locations.shape

(819, 11)

In [None]:
# Stations only matter at the city level here, so keep a single row per
# (station name, route) pair.
dedupe_keys = ['station_name_location', 'route']

# number of duplicate rows that will be removed
print(sum(routes_stations_locations.duplicated(subset=dedupe_keys)))

routes_stations_locations = routes_stations_locations.drop_duplicates(subset=dedupe_keys)

19


In [None]:
# count the distinct routes
routes_stations_locations['route'].nunique()

42

In [None]:
# collect the distinct route names into a plain list
routes = list(routes_stations_locations['route'].unique())
len(routes)

42

Create a dictionary where key = route and value = list of tuples covering every combination of two station codes within that route

In [None]:
# Map each route to every unordered pair of station codes on that route.
route_combos_dict = {}

for route in routes:
  on_route = routes_stations_locations['route'] == route
  route_codes = routes_stations_locations.loc[on_route, 'code']
  route_combos_dict[route] = list(combinations(route_codes, 2))

Calculate driving distances between station pairs using Google's Distance Matrix API. The code below is adapted from: https://www.linkedin.com/pulse/calculating-distances-using-python-google-maps-r%C3%A9gis-nisengwe/?articleId=6625061973447053312

In [None]:
# Build a list of lists, one per station pair, holding both stations' codes,
# names and coordinates plus the driving distance (in meters) returned by the
# Google Distance Matrix API.

# Read the key from the GOOGLE_MAPS_API_KEY environment variable when it is
# set, so a real key never has to be saved in the notebook; the 'my_key'
# placeholder remains the fallback.
API_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'my_key')
gmaps = googlemaps.Client(key=API_key)

# Build the per-code lookups once instead of filtering the whole frame four
# times per pair. Keeping the first row per code matches taking `.values[0]`
# of the filtered frame.
first_row_per_code = routes_stations_locations.drop_duplicates(subset='code')
coords_by_code = dict(zip(first_row_per_code['code'], first_row_per_code['coordinates']))
name_by_code = dict(zip(first_row_per_code['code'], first_row_per_code['station_name_location']))

routes_combos_list = []

# Pairs are appended route by route, in dictionary order; the route itself is
# not stored in the record.
for key, value in route_combos_dict.items():
    for station_1_code, station_2_code in value:
      station_1_coords = coords_by_code[station_1_code]
      station_2_coords = coords_by_code[station_2_code]

      station_1_name = name_by_code[station_1_code]
      station_2_name = name_by_code[station_2_code]

      # One origin and one destination per request, so the result is the single
      # element of the single row.
      # NOTE(review): this lookup raises KeyError if the element carries no
      # "distance" entry — confirm how the API reports pairs with no driving route.
      element = gmaps.distance_matrix(station_1_coords, station_2_coords, mode='driving')["rows"][0]["elements"][0]
      distance = element["distance"]["value"]

      routes_combos_list.append([station_1_code, station_1_name, station_1_coords,
                                 station_2_code, station_2_name, station_2_coords, distance])

In [None]:
# Turn the per-pair records into a dataframe with named columns.
combo_columns = [
    'station_1_code', 'station_1_name', 'station_1_coords',
    'station_2_code', 'station_2_name', 'station_2_coords',
    'distance_meters',
]
stations_combos_df = pd.DataFrame(routes_combos_list, columns=combo_columns)

# Convert meters to miles using 1609 meters per mile.
stations_combos_df['distance_mi'] = stations_combos_df['distance_meters'].div(1609)

In [None]:
# the distance in miles is kept; the meters column is no longer needed
stations_combos_df = stations_combos_df.drop(columns=['distance_meters'])

In [None]:
# Preview the first station pairs and their distances in miles.
stations_combos_df.head()

Unnamed: 0,station_1_code,station_1_name,station_1_coords,station_2_code,station_2_name,station_2_coords,distance_mi
0,BOS,"Boston, MA","(42.348695, -71.059861)",RTE,"Route 128, MA","(42.2111905, -71.148665)",17.250466
1,BOS,"Boston, MA","(42.348695, -71.059861)",PVD,"Providence, RI","(41.8305099, -71.4131785)",48.474208
2,BOS,"Boston, MA","(42.348695, -71.059861)",NHV,"New Haven, CT","(41.2973604, -72.9267668)",137.650093
3,BOS,"Boston, MA","(42.348695, -71.059861)",STM,"Stamford, CT","(41.0468938, -73.5429146)",173.993163
4,BOS,"Boston, MA","(42.348695, -71.059861)",NYP,"New York, NY","(40.7509973, -73.9962784)",212.117464


In [None]:
# Add a column holding the route that each station pair belongs to.
# This relies on the rows of stations_combos_df being in the same order as the
# pairs in route_combos_dict (route by route, in dictionary order).

route_combos_dict_keys = list(route_combos_dict.keys())
route_combos_dict_keys_lengths = [len(v) for v in route_combos_dict.values()]

# Repeat every route name once per pair on that route. Chaining the repeats
# avoids the repeated list copying of sum(list_of_lists, []).
routes_column = list(itertools.chain.from_iterable(
    itertools.repeat(route_name, n_pairs)
    for route_name, n_pairs in zip(route_combos_dict_keys, route_combos_dict_keys_lengths)
))

stations_combos_df['route'] = routes_column

In [None]:
# Check the (rows, columns) shape after adding the route column.
stations_combos_df.shape

(10030, 8)

In [None]:
# Save stations_combos_df to disk as a pickle.
with open('train_stations_combos_df_cars_gmaps.pkl', mode='wb') as pkl_file:
     pickle.dump(stations_combos_df, pkl_file)

Make all requests to the Climatiq API

In [None]:
# Reload stations_combos_df from the pickle written above.
with open('train_stations_combos_df_cars_gmaps.pkl', mode='rb') as pkl_file:
  stations_combos_df = pickle.load(pkl_file)

In [None]:
# Headers for the emissions API requests. The bearer token is read from the
# CLIMATIQ_API_KEY environment variable when it is set, so a real key never
# has to be saved in the notebook; the 'my_key' placeholder remains the fallback.
headers = {'Authorization': 'Bearer ' + os.environ.get('CLIMATIQ_API_KEY', 'my_key'),
           'Content-type': 'application/json'}

In [None]:
# Driving distances in miles, one per station pair, as a plain list.
distances = stations_combos_df['distance_mi'].tolist()

In [None]:
# Number of distances, i.e. the number of API requests the loop below will make.
len(distances)

10030

In [None]:
# Request an emissions estimate for every station-pair distance and keep the
# parsed JSON response, keyed by the distance's position in `distances`.
ESTIMATE_URL = 'https://beta2.api.climatiq.io/estimate'
CAR_EMISSION_FACTOR = "passenger_vehicle-vehicle_type_car-fuel_source_na-engine_size_na-vehicle_age_na-vehicle_weight_na"

responses_dict = {}

for idx, distance in enumerate(distances):
  # `json=` lets requests serialise the payload itself, so this cell does not
  # depend on the json module (which the notebook never imports).
  response_dict = requests.post(
      ESTIMATE_URL,
      json={"emission_factor": CAR_EMISSION_FACTOR,
            "parameters": {"passengers": 1, "distance": distance, "distance_unit": "mi"}},
      headers=headers,
      timeout=60,  # raise instead of hanging forever on a stalled connection
  ).json()
  responses_dict[idx] = response_dict
  time.sleep(3)  # pause between requests (presumably to respect the API's rate limit)

# pickle responses_dict
with open('climatiq_car_responses_dict_gmaps.pkl', 'wb') as fid:
     pickle.dump(responses_dict, fid)