<a href="https://colab.research.google.com/github/chloebs4590/Metis-Engineering/blob/main/train_emissions_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
reset -fs

In [2]:
import pandas as pd
import os
import pickle
import json

In [3]:
# Mount Google Drive inside the Colab VM so the project files can be read from it
from os.path import join
from google.colab import drive

ROOT = "/content/drive"  # Colab's default Drive mount point
print(ROOT)              # show where the drive will be mounted (optional)

drive.mount(ROOT)

/content/drive
Mounted at /content/drive


In [4]:
# Authenticate this Colab session, then build a gspread client from the default credentials
import gspread
from google.auth import default
from google.colab import auth

auth.authenticate_user()

creds, _ = default()
gc = gspread.authorize(creds)

In [5]:
# check where the notebook is currently running from
os.getcwd()

'/content'

In [6]:
# work from the project folder on the mounted Drive so the pickle files can be opened by bare filename
os.chdir('/content/drive/MyDrive/Data Science Metis/Engineering')

Read in and prepare stations locations data for merging

In [7]:
# load the pickled, geocoded station locations
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created or trust
with open('stations_locations_geocoded.pkl', mode='rb') as pkl_file:
    stations_locations = pickle.load(pkl_file)

In [8]:
# preview the first two station rows and the available columns
stations_locations.head(2)

Unnamed: 0,x,y,objectid_1,objectid,station_descripton,bus_or_train,zip_code,state,city,address_2,address_1,name,code,station_name,state_rev,full_address,longitude,latitude
0,-8478455.55,4794767.297,4,4,Station Building (with waiting room),TRAIN,21001,MD,Aberdeen,,18 East Bel Air Avenue,,ABE,"Aberdeen, MD",Maryland,"18 East Bel Air Avenue, Aberdeen, Maryland 21001",-76.163203,39.508412
1,-8208827.1,5257492.746,14,16,Station Building (with waiting room),TRAIN,12144,NY,Rensselaer,,525 East Street,,ALB,"Albany-Rensselaer, NY",New York,"525 East Street, Rensselaer, New York 12144",-73.741149,42.641353


In [9]:
# build a "City, State" label from the city and the full state name (state_rev)
stations_locations['city_state'] = (
    stations_locations['city'] + ', ' + stations_locations['state_rev']
)

In [10]:
# list the column names available after adding city_state
stations_locations.columns

Index(['x', 'y', 'objectid_1', 'objectid', 'station_descripton',
       'bus_or_train', 'zip_code', 'state', 'city', 'address_2', 'address_1',
       'name', 'code', 'station_name', 'state_rev', 'full_address',
       'longitude', 'latitude', 'city_state'],
      dtype='object')

In [11]:
# narrow the frame to the station code (merge key) plus the location fields pulled in by the merges
stations_locations = stations_locations.loc[
    :, ['code', 'city_state', 'latitude', 'longitude']
]

In [12]:
# confirm the frame now holds only the four merge columns
stations_locations.shape

(540, 4)

Read in train stations combos data

In [13]:
# load the pickled station-pair combinations
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created or trust
with open('train_stations_combos_df_gmaps.pkl', mode='rb') as pkl_file:
    stations_combos_df = pickle.load(pkl_file)

In [14]:
# preview the first two station-pair rows
stations_combos_df.head(2)

Unnamed: 0,station_1_code,station_1_name,station_1_coords,station_2_code,station_2_name,station_2_coords,distance_mi,route
0,BOS,"Boston, MA","(42.348695, -71.059861)",BOS,"Boston, MA","(42.348695, -71.059861)",0.0,Acela
1,BOS,"Boston, MA","(42.348695, -71.059861)",RTE,"Route 128, MA","(42.2111905, -71.148665)",17.250466,Acela


Read in routes locations data and clean it up

In [15]:
# load the pickled routes/locations dataframe
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created or trust
with open('routes_locations_df.pkl', mode='rb') as pkl_file:
    routes_locations_df = pickle.load(pkl_file)

In [16]:
# turn each route's list of locations into a single display string


def list_to_string(x):
    """Join the items of ``x`` into one string separated by '; '."""
    return "; ".join(x)


routes_locations_df['locations_per_route_str'] = (
    routes_locations_df['locations_per_route'].map(list_to_string)
)

In [17]:
# prefix each locations string with its route name, e.g. "Acela: Boston, MA; Route 128, MA; ..."
routes_locations_df['route_locations'] = routes_locations_df['route'].str.cat(
    routes_locations_df['locations_per_route_str'], sep=': '
)

In [18]:
# the raw list and its intermediate string are now folded into route_locations, so remove both
# (columns= already names the axis, so no separate axis argument is needed)
routes_locations_df = routes_locations_df.drop(
    columns=['locations_per_route', 'locations_per_route_str']
)

In [19]:
# give every route the same hex color value
routes_locations_df = routes_locations_df.assign(color='#ed1c24')

In [20]:
# inspect the last five routes with their new route_locations and color columns
routes_locations_df.tail()

Unnamed: 0,route,coordinates_rev,route_locations,color
37,Sunset Limited,"[[-90.0795131, 29.9465427], [-90.8133297, 29.7...","Sunset Limited: New Orleans, LA; Schriever, LA...",#ed1c24
38,Texas Eagle,"[[-87.6403072, 41.8786625], [-88.0788927, 41.5...","Texas Eagle: Chicago, IL; Joliet, IL; Pontiac,...",#ed1c24
39,Valley Flyer,"[[-72.600656, 42.5858644], [-72.6274835, 42.31...","Valley Flyer: Greenfield, MA; Northampton, MA;...",#ed1c24
40,Vermonter,"[[-73.0860737, 44.8122767], [-73.1101592, 44.4...","Vermonter: St. Albans, VT; Essex Junction-Burl...",#ed1c24
41,Wolverine,"[[-87.6403072, 41.8786625], [-87.5066324, 41.6...","Wolverine: Chicago, IL; Hammond-Whiting, IN; M...",#ed1c24


Download a CSV of routes_locations_df for the map in the Streamlit app

In [None]:
# Optional export (kept commented out): write routes_locations_df to CSV and download it from Colab
# routes_locations_df.to_csv('routes_locations.csv',index=False)
# from google.colab import files
# files.download("routes_locations.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Read in pickled train emissions data

In [21]:
# load the pickled Climatiq train-emissions responses
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created or trust
with open('climatiq_train_responses_dict_2_gmaps.pkl', mode='rb') as pkl_file:
    train_responses_dict = pickle.load(pkl_file)

Convert train emissions data to dataframe and merge with station combos

In [22]:
# flatten the emissions responses into a dataframe, one row per response
train_responses_dict_vals = train_responses_dict.values()
climatiq_df = pd.json_normalize(list(train_responses_dict_vals))

# pd.concat(axis=1) aligns rows on the index and pads any mismatch with NaN,
# so fail fast if the responses do not line up one-to-one with the station pairs
# NOTE(review): this pairing also assumes the dict's order matches the row order of stations_combos_df — confirm
if not climatiq_df.index.equals(stations_combos_df.index):
    raise ValueError(
        f"cannot combine {len(stations_combos_df)} station pairs with "
        f"{len(climatiq_df)} emissions responses: their indexes do not match"
    )

stations_combos_c02 = pd.concat([stations_combos_df, climatiq_df], axis=1)
stations_combos_c02.shape

(842, 16)

Merge stations combos co2 df with routes locations df

In [23]:
# attach each route's details to every station pair on that route (left join keeps all pairs)
stations_combos_c02 = pd.merge(
    stations_combos_c02,
    routes_locations_df,
    how='left',
    on='route',
)

Merge stations locations df with stations combos co2 df to pull in location data for origin station

In [24]:
# look up the origin station's location by matching its code against the stations table
stations_combos_c02_locs = pd.merge(
    stations_combos_c02,
    stations_locations,
    how='left',
    left_on='station_1_code',
    right_on='code',
)

In [25]:
# preview the result of the origin-station merge
stations_combos_c02_locs.head(2)

Unnamed: 0,station_1_code,station_1_name,station_1_coords,station_2_code,station_2_name,station_2_coords,distance_mi,route,co2e,co2e_unit,...,region,category,lca_activity,coordinates_rev,route_locations,color,code,city_state,latitude,longitude
0,BOS,"Boston, MA","(42.348695, -71.059861)",BOS,"Boston, MA","(42.348695, -71.059861)",0.0,Acela,0.0,kg,...,US,Rail Travel,unspecified,"[[-71.059861, 42.348695], [-71.148665, 42.2111...","Acela: Boston, MA; Route 128, MA; Providence, ...",#ed1c24,BOS,"Boston, Massachusetts",42.348695,-71.059861
1,BOS,"Boston, MA","(42.348695, -71.059861)",RTE,"Route 128, MA","(42.2111905, -71.148665)",17.250466,Acela,3.143035,kg,...,US,Rail Travel,unspecified,"[[-71.059861, 42.348695], [-71.148665, 42.2111...","Acela: Boston, MA; Route 128, MA; Providence, ...",#ed1c24,BOS,"Boston, Massachusetts",42.348695,-71.059861


In [26]:
# label the merged-in location columns so it is clear they describe the origin station
stations_combos_c02_locs = stations_combos_c02_locs.rename(
    columns={
        'city_state': 'origin_location',
        'latitude': 'origin_lat',
        'longitude': 'origin_lon',
    }
)

In [27]:
# remove the emissions-response metadata fields and the merge key, none of which are needed
stations_combos_c02_locs = stations_combos_c02_locs.drop(
    ['co2e_unit', 'id', 'source', 'year', 'region', 'category', 'lca_activity', 'code'],
    axis='columns',
)

Merge stations locations df with stations combos co2 df to pull in location data for destination station

In [28]:
# look up the destination station's location by matching its code against the stations table
stations_combos_c02_locs = pd.merge(
    stations_combos_c02_locs,
    stations_locations,
    how='left',
    left_on='station_2_code',
    right_on='code',
)

In [29]:
# preview the result of the destination-station merge
stations_combos_c02_locs.head(2)

Unnamed: 0,station_1_code,station_1_name,station_1_coords,station_2_code,station_2_name,station_2_coords,distance_mi,route,co2e,coordinates_rev,route_locations,color,origin_location,origin_lat,origin_lon,code,city_state,latitude,longitude
0,BOS,"Boston, MA","(42.348695, -71.059861)",BOS,"Boston, MA","(42.348695, -71.059861)",0.0,Acela,0.0,"[[-71.059861, 42.348695], [-71.148665, 42.2111...","Acela: Boston, MA; Route 128, MA; Providence, ...",#ed1c24,"Boston, Massachusetts",42.348695,-71.059861,BOS,"Boston, Massachusetts",42.348695,-71.059861
1,BOS,"Boston, MA","(42.348695, -71.059861)",RTE,"Route 128, MA","(42.2111905, -71.148665)",17.250466,Acela,3.143035,"[[-71.059861, 42.348695], [-71.148665, 42.2111...","Acela: Boston, MA; Route 128, MA; Providence, ...",#ed1c24,"Boston, Massachusetts",42.348695,-71.059861,RTE,"Westwood, Massachusetts",42.211191,-71.148665


In [30]:
# rename, drop and add columns

# Rename by column name rather than overwriting .columns with a positional list:
# a positional list silently mislabels data if an upstream merge changes the column order.
stations_combos_c02_locs = stations_combos_c02_locs.rename(columns={
    'station_1_code': 'origin_code',
    'station_1_name': 'origin_name',
    'station_1_coords': 'origin_coords',
    'station_2_code': 'dest_code',
    'station_2_name': 'dest_name',
    'station_2_coords': 'dest_coords',
    'co2e': 'co2e_kg',
    'city_state': 'dest_location',
})

# drop the destination merge key and coordinates, plus the route color and coordinates columns
stations_combos_c02_locs = stations_combos_c02_locs.drop(
    columns=['code', 'latitude', 'longitude', 'color', 'coordinates_rev']
)

# whole-number versions of the emissions figure; int() truncates toward zero
# NOTE(review): despite its name, co2e_kg_round is truncated rather than rounded, and 2.2 is an
# approximate kg-to-lb factor — confirm both are intended
stations_combos_c02_locs['co2e_kg_round'] = stations_combos_c02_locs['co2e_kg'].map(int)
stations_combos_c02_locs['co2e_lb'] = (stations_combos_c02_locs['co2e_kg'] * 2.2).map(int)

In [31]:
# preview the renamed columns and the new integer emissions columns
stations_combos_c02_locs.head(2)

Unnamed: 0,origin_code,origin_name,origin_coords,dest_code,dest_name,dest_coords,distance_mi,route,co2e_kg,route_locations,origin_location,origin_lat,origin_lon,dest_location,co2e_kg_round,co2e_lb
0,BOS,"Boston, MA","(42.348695, -71.059861)",BOS,"Boston, MA","(42.348695, -71.059861)",0.0,Acela,0.0,"Acela: Boston, MA; Route 128, MA; Providence, ...","Boston, Massachusetts",42.348695,-71.059861,"Boston, Massachusetts",0,0
1,BOS,"Boston, MA","(42.348695, -71.059861)",RTE,"Route 128, MA","(42.2111905, -71.148665)",17.250466,Acela,3.143035,"Acela: Boston, MA; Route 128, MA; Providence, ...","Boston, Massachusetts",42.348695,-71.059861,"Westwood, Massachusetts",3,6


In [32]:
# put the columns in reading order: origin fields, destination fields, route fields, then distance and emissions
stations_combos_c02_locs = stations_combos_c02_locs.loc[:, [
    'origin_code', 'origin_name', 'origin_coords', 'origin_location', 'origin_lat', 'origin_lon',
    'dest_code', 'dest_name', 'dest_coords', 'dest_location',
    'route', 'route_locations',
    'distance_mi', 'co2e_kg', 'co2e_kg_round', 'co2e_lb',
]]

In [33]:
# confirm the new column order
stations_combos_c02_locs.head(2)

Unnamed: 0,origin_code,origin_name,origin_coords,origin_location,origin_lat,origin_lon,dest_code,dest_name,dest_coords,dest_location,route,route_locations,distance_mi,co2e_kg,co2e_kg_round,co2e_lb
0,BOS,"Boston, MA","(42.348695, -71.059861)","Boston, Massachusetts",42.348695,-71.059861,BOS,"Boston, MA","(42.348695, -71.059861)","Boston, Massachusetts",Acela,"Acela: Boston, MA; Route 128, MA; Providence, ...",0.0,0.0,0,0
1,BOS,"Boston, MA","(42.348695, -71.059861)","Boston, Massachusetts",42.348695,-71.059861,RTE,"Route 128, MA","(42.2111905, -71.148665)","Westwood, Massachusetts",Acela,"Acela: Boston, MA; Route 128, MA; Providence, ...",17.250466,3.143035,3,6


In [34]:
# confirm the row and column counts of the cleaned table
stations_combos_c02_locs.shape

(842, 16)

In [None]:
# Optional export (kept commented out): write the cleaned table to CSV and download it from Colab
# NOTE(review): the file written below is 'trains_emissions.csv' but the download call asks for
# 'trains_emissions_42.csv' — make the two names match before uncommenting
# from google.colab import files
# stations_combos_c02_locs.to_csv('trains_emissions.csv', encoding = 'utf-8-sig', index=False) 
# files.download('trains_emissions_42.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>