In [7]:
import pandas as pd
import requests

# Emissions Data Pull

The following Jupyter notebook will pull emissions data from the Environmental Protection Agency (EPA) website. This code will call the EPA API, and will return nationwide carbon monoxide levels. The API only allows the pull of one year of data at a time so we will consolidate at the end. 

**Disclaimer: This code takes a while to pull data(~2 hrs) from 2017 as the API only allows one request at a time, and no asynchronous calls. The weekly update airflow script runs much faster. This is a one time pull to our database, and then there are weekly updates in airflow. 

https://aqs.epa.gov/aqsweb/documents/data_api.html

To register for the API:
https://aqs.epa.gov/data/api/signup?email=myemail@example.com

In [8]:
def get_parameter_details():
    """This function fetches the different EPA measurement parameters and their corresponding codes for use later
    
    Returns:
        list: A list of tuples containing the parameter code and the parameter name
    """
    email = "test@aqs.api"
    key = "test"
    pc = "CRITERIA"
    endpoint = "https://aqs.epa.gov/data/api/list/parametersByClass"

    params = {
        "email": email,
        "key": key,
        "pc": pc
    }

    # Check if the response was successful, if so return the data, if not display status code
    response = requests.get(endpoint, params=params)
    if response.status_code == 200:
        data = response.json()['Data']
        # Extracting the parameter name using 'value_represented'
        return [(item['code'], item.get('value_represented', None)) for item in data]
    else:
        print(f"Failed to fetch parameter details. Status code: {response.status_code}")
        return []

parameter_details = get_parameter_details()
print(parameter_details)

[('14129', 'Lead (TSP) LC'), ('42101', 'Carbon monoxide'), ('42401', 'Sulfur dioxide'), ('42602', 'Nitrogen dioxide (NO2)'), ('44201', 'Ozone'), ('81102', 'PM10 Total 0-10um STP'), ('85129', 'Lead PM10 LC FRM/FEM'), ('88101', 'PM2.5 - Local Conditions')]


In [9]:
# Let's define a function to fetch data for a given state, date range, and parameter
def fetch_data_for_state(state, param, bdate, edate):
    """A function to fetch data for a given state, date range, and parameter
    Args:
        state ([String]): A list of designated state codes
        param (String): The parameter code to fetch data for (Carbon Monoxide, Lead, Ozone, etc.)
        bdate (String): The start date to fetch data for
        edate (String): The end date to fetch data for
    Returns:
        json: The data returned by the API in json format
    """
    # Define access parameters
    base_url = "https://aqs.epa.gov/data/api/dailyData/byState"
    email = "brandonmorrow1010@gmail.com"  
    api_key = "taupehawk58"   # Please dont steal my api key and put me on a watch list

    # Define the API request parameters
    params = {
        "email": email,
        "key": api_key,
        "param": param,
        "bdate": bdate,
        "edate": edate,
        "state": state
    }
    # Fetch the data
    response = requests.get(base_url, params=params)
    
    # Check if the response was successful, if so return the data, if not display status code
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed for state {state}. Status code: {response.status_code}")
        print(response.text)  # Print the actual content for debugging
        return None

We have everything so lastly we need to clean this up. We want to only keep certain features, so lets define the features and create a cleaning function to clean our data. 

In [10]:
def clean_emissions_data(df, selected_columns):
    """Takes in an emissions dataframe and converts it to weekly data
    Args:
        df (DataFrame): Takes in an emissions dataframe
        selected_columns (list): A list of columns to keep
    Returns:
        DataFrame: Returns a weekly emissions dataframe with filtered columns
    """    
    # Convert the "date_local" column to datetime format
    df['date_local'] = pd.to_datetime(df['date_local'], format="%Y-%m-%d")

    # Group the data by "date_local" and calculate daily averages for selected columns
    grouped = df.groupby(['date_local'])[selected_columns].mean(numeric_only=True)
    daily_aggregated_df = pd.DataFrame(grouped).round(4)

    # Disaggregate to week level
    weekly_epa = daily_aggregated_df.resample('W').ffill()

    # Reset the index to make "date_local" a regular column
    weekly_epa.reset_index(inplace=True)
    return weekly_epa

Now we have our cleaning function lets get data for all years from "2017" to now.  This takes a while as the API is a free government API which does not allow synchornous calls so the calls must be made one after the other. We can only get data by state by day. So to get the national average we must take all of this data and then clean it with our above cleaning function. We also define the selected columns that we want for analysis. We will then save this data into two files. One file for the total data, and one file for the processed weekly data. 

In [11]:
# Set Parameters
param = "42101"  # Carbon Monoxide
selected_columns = [
    'date_local', 'parameter', 'aqi', 'arithmetic_mean', 'first_max_value', 'observation_count', 
    'observation_percent'    
    ]
state_codes = [
    '01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
    '21', '22', '23', '24', '25', '26', '27', '28', '29', '30',
    '31', '32', '33', '34', '35', '36', '37', '38', '39', '40',
    '41', '42', '43', '44', '45', '46', '47', '48', '49', '50'
] # All states

# Get years 2017-2023
years = ["2017", "2018", "2019", "2020", "2021", "2022", "2023"]

weekly_emissions_all_years = pd.DataFrame()
daily_emissions_all_years = pd.DataFrame()
for year in years:
    bdate = year + "0101"
    edate = year + "1231"

    all_states_data = []
    for state_code in state_codes:
        data = fetch_data_for_state(state_code, param, bdate, edate)
        print(f"Fetching data for state {state_code} for year {year}")
        # Check if data is not null and contains key 'Data', if so then add to list of data
        if data and 'Data' in data: 
            all_states_data.extend(data['Data'])
        
    df = pd.DataFrame(all_states_data) # Convert to pandas DataFrame
    daily_emissions_all_years = pd.concat([daily_emissions_all_years, df], ignore_index=True) # Add to the main dataframe
    print(f"Finished data for year {year}")

Fetching data for state 01 for year 2017
Fetching data for state 02 for year 2017
Fetching data for state 03 for year 2017
Fetching data for state 04 for year 2017
Fetching data for state 05 for year 2017
Fetching data for state 06 for year 2017
Fetching data for state 07 for year 2017
Fetching data for state 08 for year 2017
Fetching data for state 09 for year 2017
Fetching data for state 10 for year 2017
Fetching data for state 11 for year 2017
Fetching data for state 12 for year 2017
Fetching data for state 13 for year 2017
Fetching data for state 14 for year 2017
Fetching data for state 15 for year 2017
Fetching data for state 16 for year 2017
Fetching data for state 17 for year 2017
Fetching data for state 18 for year 2017
Fetching data for state 19 for year 2017
Fetching data for state 20 for year 2017
Fetching data for state 21 for year 2017
Fetching data for state 22 for year 2017
Fetching data for state 23 for year 2017
Fetching data for state 24 for year 2017
Fetching data fo

In [12]:
# Clean the data
weekly_emissions_all_years = clean_emissions_data(daily_emissions_all_years, selected_columns)

Looks good.. lets check what features we got from our request:

In [13]:
weekly_emissions_all_years.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358 entries, 0 to 357
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date_local           358 non-null    datetime64[ns]
 1   aqi                  358 non-null    float64       
 2   arithmetic_mean      358 non-null    float64       
 3   first_max_value      358 non-null    float64       
 4   observation_count    358 non-null    float64       
 5   observation_percent  358 non-null    float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 16.9 KB


In [14]:
weekly_emissions_all_years.head(10)

Unnamed: 0,date_local,aqi,arithmetic_mean,first_max_value,observation_count,observation_percent
0,2017-01-01,5.2537,0.3205,0.549,21.4296,87.2596
1,2017-01-08,5.3433,0.3353,0.532,24.0056,98.0167
2,2017-01-15,5.6162,0.3451,0.5615,24.0184,98.0976
3,2017-01-22,5.0476,0.2946,0.4908,24.0804,98.3656
4,2017-01-29,5.4081,0.3288,0.5538,24.0697,98.3138
5,2017-02-05,4.9927,0.3165,0.4877,24.1107,98.5082
6,2017-02-12,5.0849,0.2977,0.4856,24.0276,98.136
7,2017-02-19,4.7815,0.2906,0.4717,24.122,98.5176
8,2017-02-26,3.8364,0.2428,0.386,24.1132,98.475
9,2017-03-05,3.7852,0.2457,0.3722,24.1534,98.647


In [15]:
# Export final df to csv
weekly_emissions_all_years.to_csv("emissions_data_2017_2023.csv", index=False)

In [16]:
daily_emissions_all_years.to_csv("detailed_emissions_data.csv", index=False)