In [1]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [3]:
# aqs_key.txt is text file with two lines.  The first line 
# is my aqs email and the second line has my aqs api key
with open('aqs_key.txt', 'r') as file:
    email = file.readline().strip()
    key = file.readline().strip()

In [6]:
list_url = 'https://aqs.epa.gov/data/api/list/'

## Get State FIPS codes

In [9]:
# STATE FIPS
url = list_url + 'states'
r = requests.get(url, params = {"email":email, "key":key})
r.status_code

200

In [21]:
states = pd.DataFrame(r.json()['Data'])
states[(states.value_represented=='Utah') | (states.value_represented=='Arizona')]
## Utah is 49
## Arizona is 04

Unnamed: 0,code,value_represented
2,4,Arizona
44,49,Utah


In [64]:
url = "https://aqs.epa.gov/data/api/list/cbsas"
params = {"email": email, "key": key}
r = requests.get(url, params=params)

In [65]:
r

<Response [200]>

## Get County FIPS

I am interested in Maricopa county in Arizona and Utah county in Utah

In [37]:
# County Codes for Arizona
url = list_url + 'countiesByState'
r = requests.get(url, params = {"email":email, "key":key, "state":"04"})
r.status_code

200

In [40]:
r.json()['Data']
## Maricopa county is 013

[{'code': '001', 'value_represented': 'Apache'},
 {'code': '003', 'value_represented': 'Cochise'},
 {'code': '005', 'value_represented': 'Coconino'},
 {'code': '007', 'value_represented': 'Gila'},
 {'code': '009', 'value_represented': 'Graham'},
 {'code': '011', 'value_represented': 'Greenlee'},
 {'code': '012', 'value_represented': 'La Paz'},
 {'code': '013', 'value_represented': 'Maricopa'},
 {'code': '015', 'value_represented': 'Mohave'},
 {'code': '017', 'value_represented': 'Navajo'},
 {'code': '019', 'value_represented': 'Pima'},
 {'code': '021', 'value_represented': 'Pinal'},
 {'code': '023', 'value_represented': 'Santa Cruz'},
 {'code': '025', 'value_represented': 'Yavapai'},
 {'code': '027', 'value_represented': 'Yuma'}]

In [42]:
# County Codes for Utah
url = list_url + 'countiesByState'
r = requests.get(url, params = {"email":email, "key":key, "state":"49"})
print(r.status_code)
r.json()['Data']
## Utah county is 049
## SLC county is 035

200


[{'code': '001', 'value_represented': 'Beaver'},
 {'code': '003', 'value_represented': 'Box Elder'},
 {'code': '005', 'value_represented': 'Cache'},
 {'code': '007', 'value_represented': 'Carbon'},
 {'code': '009', 'value_represented': 'Daggett'},
 {'code': '011', 'value_represented': 'Davis'},
 {'code': '013', 'value_represented': 'Duchesne'},
 {'code': '015', 'value_represented': 'Emery'},
 {'code': '017', 'value_represented': 'Garfield'},
 {'code': '019', 'value_represented': 'Grand'},
 {'code': '021', 'value_represented': 'Iron'},
 {'code': '023', 'value_represented': 'Juab'},
 {'code': '025', 'value_represented': 'Kane'},
 {'code': '027', 'value_represented': 'Millard'},
 {'code': '029', 'value_represented': 'Morgan'},
 {'code': '031', 'value_represented': 'Piute'},
 {'code': '033', 'value_represented': 'Rich'},
 {'code': '035', 'value_represented': 'Salt Lake'},
 {'code': '037', 'value_represented': 'San Juan'},
 {'code': '039', 'value_represented': 'Sanpete'},
 {'code': '041', '

I want the stations from 
- State 04 (Arizona), County 013 (Maricopa)  
- State 49 (Utah), County 049 (Utah) 
- State 49 (Utah), County 035 (Salt Lake) 

## Get AQI Pollutants

In [11]:
url = "https://aqs.epa.gov/data/api/list/parametersByClass"
params = {
    "email":email,
    "key":key,
    "pc": "AQI POLLUTANTS"}
pcs = requests.get(url, params=params)
print(pcs.ok)

True


In [12]:
pcs.json()['Data']

[{'code': '42101', 'value_represented': 'Carbon monoxide'},
 {'code': '42401', 'value_represented': 'Sulfur dioxide'},
 {'code': '42602', 'value_represented': 'Nitrogen dioxide (NO2)'},
 {'code': '44201', 'value_represented': 'Ozone'},
 {'code': '81102', 'value_represented': 'PM10 Total 0-10um STP'},
 {'code': '88101', 'value_represented': 'PM2.5 - Local Conditions'},
 {'code': '88502',
  'value_represented': 'Acceptable PM2.5 AQI & Speciation Mass'}]

# Get Data

In [7]:
# parameters that I want
pc = pd.DataFrame(pcs.json()['Data'])['code'][0:6]
slc = ('49', '035')
utc = ('49', '049')
mcc = ('04', '013')
loc = [slc,utc,mcc]
#loc = [utc,mcc]
years = ['2021','2020','2019','2018','2017']

In [8]:
# try for one combination of requests
url = "https://aqs.epa.gov/data/api/dailyData/byCounty"
params = {"email":email,"key":key,
          "param": pc[0],
          "bdate": years[0]+"0101",
          "edate": years[0]+"1231",
          "state": loc[0][0],
          "county": loc[0][1]}
            

In [9]:
r = requests.get(url, params=params)

In [10]:
r.ok

True

In [None]:
## Run loop through all parameter combinations of interest

In [157]:
# columns to keep
keep = ['state_code', 'county_code', 'site_number', 'parameter_code', 
       'latitude', 'longitude', 'parameter',
       'date_local',
       'event_type', 'observation_count',
       'observation_percent', 'validity_indicator', 'arithmetic_mean',
       'first_max_value', 'first_max_hour', 'aqi', 
       'local_site_name', 'state', 'county', 'city',
       'cbsa_code', 'cbsa']

In [158]:
#DF = pd.DataFrame()
for y in years:
    for l in loc:
        for p in pc:
            print((y,l,p))
            time.sleep(10)
            params = {"email":email,"key":key,"param": p,"bdate": y+"0101","edate": y+"1231","state": l[0],"county": l[1]}
            r = requests.get(url, params=params)
            if r.ok:
                if r.json()['Header'][0]['rows'] > 0:
                    df = pd.DataFrame(r.json()['Data'])
                    try: 
                        df = df[keep]
                        DF = pd.concat([df,DF])
                    except:
                        pass


('2021', ('49', '035'), '42101')
('2021', ('49', '035'), '42401')
('2021', ('49', '035'), '42602')
('2021', ('49', '035'), '44201')
('2021', ('49', '035'), '81102')
('2021', ('49', '035'), '88101')
('2021', ('49', '049'), '42101')
('2021', ('49', '049'), '42401')
('2021', ('49', '049'), '42602')
('2021', ('49', '049'), '44201')
('2021', ('49', '049'), '81102')
('2021', ('49', '049'), '88101')
('2021', ('04', '013'), '42101')
('2021', ('04', '013'), '42401')
('2021', ('04', '013'), '42602')
('2021', ('04', '013'), '44201')
('2021', ('04', '013'), '81102')
('2021', ('04', '013'), '88101')
('2020', ('49', '035'), '42101')
('2020', ('49', '035'), '42401')
('2020', ('49', '035'), '42602')
('2020', ('49', '035'), '44201')
('2020', ('49', '035'), '81102')
('2020', ('49', '035'), '88101')
('2020', ('49', '049'), '42101')
('2020', ('49', '049'), '42401')
('2020', ('49', '049'), '42602')
('2020', ('49', '049'), '44201')
('2020', ('49', '049'), '81102')
('2020', ('49', '049'), '88101')
('2020', (

In [159]:
DF.shape

(802618, 22)

In [165]:
DF.drop_duplicates(inplace=True, ignore_index=True)

In [166]:
DF.shape

(431028, 22)

In [169]:
DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431028 entries, 0 to 431027
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   state_code           431028 non-null  object 
 1   county_code          431028 non-null  object 
 2   site_number          431028 non-null  object 
 3   parameter_code       431028 non-null  object 
 4   latitude             431028 non-null  float64
 5   longitude            431028 non-null  float64
 6   parameter            431028 non-null  object 
 7   date_local           431028 non-null  object 
 8   event_type           431028 non-null  object 
 9   observation_count    431028 non-null  int64  
 10  observation_percent  431028 non-null  float64
 11  validity_indicator   431028 non-null  object 
 12  arithmetic_mean      431028 non-null  float64
 13  first_max_value      431028 non-null  float64
 14  first_max_hour       431028 non-null  int64  
 15  aqi              

In [172]:
# DF.to_csv('aqi_utah_arizona.csv', index=False)
# save data so I don't have to run again