In [None]:
from pandas.io.json import json_normalize
import time
import os
from scipy import interpolate
import json
import numpy as np
import pandas as pd
import csv
import dask.dataframe as dd


In [None]:
#this function takes in a speed curve file (with speed and pollutants in the rows) and returns a set of 'functions', 
#which are built by linear interpolation using scipy's interp1d function. they do NOT extrapolate, only interpolate between 0-80 KPH
#functions take speed or a vector of speeds as an input to return a pollution amount. 
#can be arbitrarily interchanged w any pollutant/speed function.
def emissionsFunctions(file):
    """Build one speed -> emissions interpolator per pollutant from a speed-curve CSV.

    Parameters
    ----------
    file : str or path-like
        Path to a CSV whose header row contains ``speed`` plus one column for
        each of the pollutants listed below. Every cell must parse as a float.

    Returns
    -------
    dict
        Maps each pollutant name ('CO2-Atm', 'CO2-Eq', 'CO', 'NOx', 'VOC',
        'SO2', 'NH3', 'PM10', 'PM2.5') to a ``scipy.interpolate.interp1d``
        object built from the ``speed`` column and that pollutant's column.
        The interpolators are created with scipy's defaults (linear, no
        extrapolation), so calling one with a speed outside the range present
        in the file raises ``ValueError``.

    Raises
    ------
    KeyError
        If the CSV has a column that is not ``speed`` or a known pollutant.
    ValueError
        If a cell is not a valid float, or a pollutant column is missing so
        its data length does not match the ``speed`` column.
    """
    pollutants = ['CO2-Atm', 'CO2-Eq', 'CO', 'NOx', 'VOC', 'SO2', 'NH3', 'PM10', 'PM2.5']
    interpolationData = {column: [] for column in ['speed'] + pollutants}

    # The with-block guarantees the file handle is released even if a row
    # fails to parse; newline='' is what the csv module expects of its input.
    with open(file, mode='r', newline='') as pollutantFile:
        for row in csv.DictReader(pollutantFile):
            for key, value in row.items():
                interpolationData[key].append(float(value))

    # One interpolator per pollutant, all sharing the same speed axis.
    return {
        pollutant: interpolate.interp1d(interpolationData['speed'], interpolationData[pollutant])
        for pollutant in pollutants
    }

In [None]:
# Speed-curve interpolators, indexed as
#     interpolationFunctions[policy][moves_roadtype][pollutant]
# which is the shape the emissions loop further down looks them up in. Every
# policy / road type / pollutant combination has its own function.
# NOTE(review): only the 'base' curve file names appear in the original
# notebook; the 'ldv', 'bus' and 'taxi' files are assumed to follow the same
# '<policy>_roadtype_<n>_curve.csv' naming -- confirm against ./speed_curves.
policies = ['base', 'ldv', 'bus', 'taxi']
movesRoadtypes = [2, 3, 4, 5]

interpolationFunctions = {
    policy: {
        movesRoadtype: emissionsFunctions(
            './speed_curves/{}_roadtype_{}_curve.csv'.format(policy, movesRoadtype)
        )
        for movesRoadtype in movesRoadtypes
    }
    for policy in policies
}

pollutantTypes = ['CO2-Atm', 'CO2-Eq', 'CO', 'NOx', 'VOC', 'SO2', 'NH3', 'PM10', 'PM2.5']

# Maps the road type found in the Waze data (key) to the MOVES road type whose
# speed curve should be used for it (value). Waze road types missing from this
# dict have no curve assigned; MOVES road type 2 curves are loaded above but no
# Waze road type currently maps to them.
# mapping can be found at https://paper.dropbox.com/doc/WAZE-API-5yK5F5OGXKAna1tGlJbYD, scrolling down to the road mapping portion
roadtypemapping = {1: 5, 2: 5, 3: 4, 4: 4, 5: 3, 6: 5, 7: 5, 17: 4}

# Calculating pollutants from raw Waze data and speed curves

Now, we try to apply our speed curves to the Waze data to get emissions output. While we used 100 cars to construct our speed curves, we need to estimate how many cars will fit on each segment given to us by Waze. To do so, we use this formula:

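$$pollution = function(speed)/100 * (length/1609.34) * (length*3/6) * 1000*speed/length$$, which outputs the emissions from that stretch of road over 1 hour. Collecting the constants, this simplifies to $function(speed) * length * speed * 0.00310686368$, which is the form used in the code. The explanation is below.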

The interpolate function gives us the emissions for a particular speed, given the parameters shown above. We divide it by 100 because the original speed curves were constructed assuming 100 cars, but we want to treat this as only one car for now. I can later go back and remake the speed curves to be for 1 car only (this is temporary; we will have to remake the curves anyways, since the moves_mexico DB was not complete on my dropbox).

Then, scale by $length/1609.34$. The original speed curves were done for a 1 mile road, but the Waze data is in meters. Since the emissions scale linearly with road length, we can just scale the emissions to a 1 meter road (the division by 1609.34, the number of meters in a mile), and multiply by the length of the Waze segment. Now, we have the emissions for 1 car on the Waze segment.

Now, we scale up from one car to however many cars fit on the segment. The $length*3/6$ term assumes that each car takes up 6 meters, and there are on average 3 lanes (3 ‘rows’ of cars). After multiplying by that term, we have the emissions for the total number of cars that fit on the road at once. 

However, the speed curves/MOVES return the emissions for every vehicle that goes through the ENTIRE stretch of road. Even if, say, 1000 cars fit on the segment given to us by Waze, not all of them may make it through the road. We need to finally multiply by $1000*speed/length$. The speed is Km/h, so we multiply by 1000 to get meters per hour. Then, we divide by length to get $1/(time it takes to go through the road)$. This scales the amount of cars by how long it actually takes to traverse the road. For example: if there are 10 cars on a 100 meter stretch of road, and it takes 0.5 hours to go through that road, then after 1 hour, we would actually have had 20 cars go through that road.

The result is the total amount of emissions from the line segment in 1 hour.

In [None]:
#adjust 'chunksize' based on what the computer can handle. It was 1200000 on pedro's computer.
# Streams input.csv in chunks, adds one emissions column per policy+pollutant
# combination to every row, and appends the result to ./output.csv.
columnNames = ['lat', 'lon', 'speed', 'uuid', 'roadtype', 'level', 'delay', 'length', 'epoch', 'datetime']

# All constant factors of the formula in the markdown cell above, collapsed:
#   (1/100 cars) * (1/1609.34 m per mile) * (3 lanes / 6 m per car) * (1000 m per km)
#   = 5 / 1609.34
emissionScale = 0.00310686368

addHeader = True  # the header row is written once, with the first group only
for chunk in pd.read_csv('input.csv', chunksize=100000, index_col=False, names=columnNames):
    # NOTE: duplicates are only removed within a chunk, not across chunk boundaries.
    chunk = chunk.drop_duplicates(['epoch', 'uuid'])
    # The formula multiplies by speed; presumably 0 is nudged to 0.01 so stopped
    # traffic does not come out as exactly zero emissions -- TODO confirm.
    chunk.loc[chunk['speed'] == 0, 'speed'] = 0.01

    for roadtype, group in chunk.groupby('roadtype'):
        # Work on an explicit copy: adding columns to a groupby slice is what
        # triggers pandas' SettingWithCopy warning.
        group = group.copy()
        speed = group['speed']
        length = group['length']

        for policy in ['base', 'ldv', 'bus', 'taxi']:
            #select set of functions to use based on the policy and the EPA/MOVES roadtype.
            # NOTE: roadtypemapping[roadtype] raises KeyError for a Waze road
            # type that is missing from the mapping.
            funcs = interpolationFunctions[policy][roadtypemapping[roadtype]]

            #specific method explained in markdown cell above.
            for pollutant in pollutantTypes:
                group[pollutant + '_' + policy] = funcs[pollutant](speed)*length*speed*emissionScale

        #now, write to csv. The dataframe written includes all the original columns as well as columns for every policy+pollutant combo (9*4=36 new columns)
        group.to_csv(path_or_buf='./output.csv', mode='a', index=False, header=addHeader)
        addHeader = False