# Compile EPA emissions data
Convert the data from hourly to monthly and export all years as a single file.

In [1]:
import pandas as pd
import numpy as np
import os
from os.path import join
from joblib import Parallel, delayed
import sys

# Input and output files live in the "Data storage" folder, one level above
# the directory the notebook is run from.
cwd = os.getcwd()
data_path = join(cwd, os.pardir, 'Data storage')

In [2]:
# Record the environment this notebook was run with: -iv prints the versions of
# the imported packages, -v prints the Python and IPython versions.
%load_ext watermark
%watermark -iv -v

pandas      0.22.0
numpy       1.14.1
CPython 3.6.3
IPython 6.2.1


In [3]:
# Load the "autoreload" extension so edits to project modules are picked up
# without restarting the kernel
%load_ext autoreload

# mode 1: always reload modules marked with "%aimport" before running code;
# modules that are not marked are left alone
%autoreload 1

# Add the 'src' directory as one where we can import modules.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run in the same kernel.
src_dir = join(os.getcwd(), os.pardir, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
# Mark each project module for autoreload with %aimport, then import the
# helpers from it. Of these names, only import_group_epa is called in the
# cells of this section.
%aimport Data.data_extraction
from Data.data_extraction import import_group_epa, unit_conversion

%aimport Analysis.index
from Analysis.index import add_datetime, add_quarter

## Change years if necessary

In [7]:
# Inclusive range of years to compile; one yearly CSV is read per year.
start_year = 2001
end_year = 2017

# NOTE(review): the __main__ guard is presumably there so process-based joblib
# workers do not re-run this block when the main module is imported — confirm
# it is still needed.
if __name__ == '__main__':
    base_path = join(data_path, 'EPA emissions')
    years = range(start_year, end_year + 1)
    paths = [join(base_path, 'EPA emissions {}.csv'.format(year))
             for year in years]

    # One import_group_epa call per yearly file; n_jobs=-1 lets joblib use all
    # available cores. The results list follows the order of `paths`.
    load_jobs = (delayed(import_group_epa)(path) for path in paths)
    df_list = Parallel(n_jobs=-1)(load_jobs)

In [8]:
# Stack the per-year frames into one long table. Each frame keeps its own row
# labels (axis=0 and ignore_index=False are the pandas defaults, spelled out).
df = pd.concat(df_list, axis=0, ignore_index=False)

In [9]:
# Spot-check the first rows of the combined table.
df.head(n=5)

Unnamed: 0,CO2_MASS (kg),GLOAD (MW),HEAT_INPUT (mmBtu),MONTH,OP_TIME,ORISPL_CODE,SLOAD (1000lb/hr),YEAR
0,962541100.0,1167596.0,11130000.0,1,4348.25,3,0.0,2001
1,820849500.0,880517.0,8951275.0,2,3391.0,3,0.0,2001
2,640292800.0,735523.0,7175850.0,3,3846.5,3,0.0,2001
3,741787900.0,898023.0,8519262.0,4,4292.75,3,0.0,2001
4,976724300.0,1230441.0,11386630.0,5,5029.75,3,0.0,2001


In [10]:
# Spot-check the last rows of the combined table.
df.tail(n=5)

Unnamed: 0,CO2_MASS (kg),GLOAD (MW),HEAT_INPUT (mmBtu),MONTH,OP_TIME,ORISPL_CODE,SLOAD (1000lb/hr),YEAR
16603,0.0,0.0,337871.418,8,1494.69,880107,200653.0,2017
16604,0.0,0.0,308866.407,9,1447.3,880107,207589.0,2017
16605,0.0,0.0,327482.62,10,1488.72,880107,216266.0,2017
16606,0.0,0.0,341459.3,11,1440.0,880107,231679.0,2017
16607,0.0,0.0,360386.7,12,1488.0,880107,243984.0,2017


In [11]:
# Write the combined monthly table to the 'Derived data' folder.
# NOTE(review): the dated file name is kept unchanged; presumably other
# notebooks read this exact file — confirm before renaming it.
out_dir = join(data_path, 'Derived data')
os.makedirs(out_dir, exist_ok=True)  # to_csv fails if the folder is missing
path = join(out_dir, 'Monthly EPA emissions 2018-03-06.csv')

# index=False: the row labels are not written to the CSV.
df.to_csv(path, index=False)