In [1]:
import dask.dataframe as dd
import multiprocessing as mp
from dask import distributed
import pandas as pd
import numpy as np
colNames = ['lon', 'lat', 'speed', 'uuid', 'roadtype', 'level', 'delay', 'length', 'epoch', 'datetime', 
        'CO2-Atm_base', 'CO2-Eq_base','CO_base','NOx_base','VOC_base','SO2_base','NH3_base','PM10_base','PM2.5_base',
        'CO2-Atm_ldv', 'CO2-Eq_ldv','CO_ldv','NOx_ldv','VOC_ldv','SO2_ldv','NH3_ldv','PM10_ldv','PM2.5_ldv',
        'CO2-Atm_bus', 'CO2-Eq_bus','CO_bus','NOx_bus','VOC_bus','SO2_bus','NH3_bus','PM10_bus','PM2.5_bus',
        'CO2-Atm_taxi', 'CO2-Eq_taxi','CO_taxi','NOx_taxi','VOC_taxi','SO2_taxi','NH3_taxi','PM10_taxi','PM2.5_taxi']
cluster = distributed.LocalCluster(n_workers=mp.cpu_count(), threads_per_worker = 1)

We run this after doing the speed curve creation -> pollutant calculation. The goal of this code is simply to create buckets for visualization.

In [2]:
df = dd.read_csv('./analysis_all_policies.csv', header=None, names=colNames)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/gaoag/Documents/Junior/urap-erg/all notebooks, start to end/analysis_all_policies.csv'

# From the dask dataframe, do vol scaling. next, do bucketing, groupby and agg to sum by bucket. finally, do percentage diffs on the buckets


## VOLUME SCALING

In the original pollutant analysis, we scaled the emissions of 1 car to the emissions of X cars (where X is the # of vehicles which fit on the line segment). However, of those X vehicles, we have to assume that some of them are no longer emitting, since we passed these electrification policies. The amount of vehicles effectively 'taken off the road' are calculated in https://paper.dropbox.com/doc/WAZE-API-5yK5F5OGXKAna1tGlJbYD under 'Vehicle Distribution Data'.



In [3]:
for col in colNames:
    if '_' not in col:
        continue
    elif col.split('_')[1] == 'ldv':
        df[col] = df[col]*0.4803
    elif col.split('_')[1] == 'bus':
        df[col] = df[col]*0.9919
    elif col.split('_')[1] == 'taxi':
        df[col] = df[col]*0.9636

In [4]:
#bucketing, 500x500. origin is the top left of the map
#these boundaries are the max/min lat/lon in the entire dataset.
#max_lat = 20.108879999999999 
#origin_lat = 18.807134
#max_lon = -98.195071
#origin_lon = -99.662641

#these boundaries are the max/min lat/lon for mexico city only.
max_lat = 19.595892
origin_lat = 19.220471
max_lon = -98.828109
origin_lon = -99.438230

In [7]:
#first, we cut out all the entries in our dataframe which don't fit the desired latitude/longitude.
df = df[(df['lat'] < max_lat) & (df['lat'] > origin_lat) & (df['lon'] < max_lon) & (df['lon'] > origin_lon)]

#next, we assign each row an x_bucket or y_bucket based upon its latitude and longitude.
lon_step_size = abs(max_lon - origin_lon)/500
lat_step_size = abs(max_lat - origin_lat)/500
df['x_bucket'] = df['lat'].apply(lambda x: abs(x - origin_lat)//lat_step_size)
df['y_bucket'] = df['lon'].apply(lambda y: abs(y - origin_lon)//lon_step_size)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


now, we group by and sum across buckets. What this does: there will be many rows which fit within each bucket. For example, the bucket (0, 0) may have 6 data points that fit into that bucket. This sums those datapoints together to come up with a cumulative amount of emissions within that bucket.

In [6]:

cols_to_agg = ['CO2-Atm_base', 'CO2-Eq_base','CO_base','NOx_base','VOC_base','SO2_base','NH3_base','PM10_base','PM2.5_base',
        'CO2-Atm_ldv', 'CO2-Eq_ldv','CO_ldv','NOx_ldv','VOC_ldv','SO2_ldv','NH3_ldv','PM10_ldv','PM2.5_ldv',
        'CO2-Atm_bus', 'CO2-Eq_bus','CO_bus','NOx_bus','VOC_bus','SO2_bus','NH3_bus','PM10_bus','PM2.5_bus',
        'CO2-Atm_taxi', 'CO2-Eq_taxi','CO_taxi','NOx_taxi','VOC_taxi','SO2_taxi','NH3_taxi','PM10_taxi','PM2.5_taxi']

grouped = df.groupby(['x_bucket', 'y_bucket'])[cols_to_agg].agg('sum')

Now, our dataframe has one row for each of the 500x500 = 250,000 buckets. In each 'bucket', designated by a unique x and y coordinate, there is a value for the total amount of emissions within that bucket, across all the different pollutant types and policy scenarios. What we care about is percentage differences between the policies and the baseline scenario. Thus, we add some columns that reflect percentage differences.

In [7]:
for pollutant in ['CO2-Atm', 'CO2-Eq', 'CO', 'NOx', 'VOC', 'SO2', 'NH3', 'PM10', 'PM2.5']:
    grouped['ldv_diff_' + pollutant] = (grouped[pollutant+'_ldv'] - grouped[pollutant+'_base'])/grouped[pollutant+'_base']
    grouped['bus_diff_' + pollutant] = (grouped[pollutant+'_bus'] - grouped[pollutant+'_base'])/grouped[pollutant+'_base']
    grouped['taxi_diff_' + pollutant] = (grouped[pollutant+'_taxi'] - grouped[pollutant+'_base'])/grouped[pollutant+'_base']

Now, we just write out to files.

In [8]:
tempdf = grouped[['bus_diff_CO2-Atm', 'bus_diff_CO2-Eq', 'bus_diff_CO', 'bus_diff_NOx', 'bus_diff_VOC', 'bus_diff_SO2', 'bus_diff_NH3', 'bus_diff_PM10', 'bus_diff_PM2.5','CO2-Atm_bus', 'CO2-Eq_bus','CO_bus','NOx_bus','VOC_bus','SO2_bus','NH3_bus','PM10_bus','PM2.5_bus']]
tempdf.to_csv('./emissions_diffs/bus/bus_mxcity1-*.csv')

['./emissions_diffs/bus/bus-0.csv']

In [9]:
tempdf = grouped[['ldv_diff_CO2-Atm', 'ldv_diff_CO2-Eq', 'ldv_diff_CO', 'ldv_diff_NOx', 'ldv_diff_VOC', 'ldv_diff_SO2', 'ldv_diff_NH3', 'ldv_diff_PM10', 'ldv_diff_PM2.5']]
tempdf.to_csv('./emissions_diffs/ldv/ldv_mxcity1-*.csv')

['./emissions_diffs/ldv/ldv-0.csv']

In [10]:
tempdf = grouped[['taxi_diff_CO2-Atm', 'taxi_diff_CO2-Eq', 'taxi_diff_CO', 'taxi_diff_NOx', 'taxi_diff_VOC', 'taxi_diff_SO2', 'taxi_diff_NH3', 'taxi_diff_PM10', 'taxi_diff_PM2.5']]
tempdf.to_csv('./emissions_diffs/taxi/taxi_mxcity1-*.csv')

['./emissions_diffs/taxi/taxi-0.csv']