In [14]:
import os
from six.moves import urllib
import zipfile
import pandas as pd

DOWNLOAD_EPA_ROOT = "https://aqs.epa.gov/aqsweb/airdata/annual_aqi_by_county_"
AQI_BASE_NAME = "annual_aqi_by_county_"
EPA_PATH = "epa_data"

def fetch_annual_aqi_data(years, epa_root=DOWNLOAD_EPA_ROOT, data_path=EPA_PATH):
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
        
    data_frames = []
    for year in years:
        # build download url and download zip file
        aqi_url = f"{epa_root}{year}.zip"
        aqi_name = f"{AQI_BASE_NAME}{year}.zip"
        zip_path = os.path.join(data_path, aqi_name)
        urllib.request.urlretrieve(aqi_url, zip_path)
        
        # extract zip file
        zip_ref = zipfile.ZipFile(zip_path, 'r')
        zip_ref.extractall(data_path)
        zip_ref.close()
        
        # read csv
        aqi_csv_name = f"{AQI_BASE_NAME}{year}.csv"
        aqi_csv_path = os.path.join(data_path, aqi_csv_name)
        data_frames.append(pd.read_csv(aqi_csv_path))
        
    # write out combined data
    all_csv_name = "all_years.csv"
    all_csv_path = os.path.join(data_path, all_csv_name)
    
    all_df = pd.concat(data_frames, ignore_index=True)
    all_df.to_csv(all_csv_path)
    
    return all_df
        

In [12]:
# This is the reference to the small area census data and the years available
#https://www.census.gov/cgi-bin/nbroker?_service=sas_serv1&_debug=0&_program=cedr.sasapp_main.sas&s_output=csv&s_orderBy=id%20asc,id%20asc,%20year%20desc&s_appName=saipe&menu=grid_proxy&s_year=2017,2016,2015,2013,2014,2012,2010,2011,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999,1998,1997,1996,1995,1993,1989&s_state=&s_county=&s_district=&s_USStOnly=n&s_inclUsTot=n&s_inclStTot=n&s_measures=aa_snc

all_years = fetch_annual_aqi_data([2017,2016,2015,2013,2014,2012,2010,2011,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000,1999,1998,1997,1996,1995,1993,1989])

In [13]:
all_years.shape

(26385, 19)

In [18]:
small_area_df = pd.read_csv("SAIPESNC_10APR19_19_47_46_33.csv")

In [19]:
small_area_df.shape

(78543, 44)

In [17]:
all_years.head()

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days SO2,Days PM2.5,Days PM10
0,Alabama,Baldwin,2017,270,241,28,1,0,0,0,108,51,36,0,0,206,0,64,0
1,Alabama,Clay,2017,118,104,14,0,0,0,0,66,52,30,0,0,0,0,118,0
2,Alabama,Colbert,2017,283,265,18,0,0,0,0,63,48,37,0,0,218,0,65,0
3,Alabama,DeKalb,2017,359,329,30,0,0,0,0,80,50,39,0,0,315,0,44,0
4,Alabama,Elmore,2017,226,221,5,0,0,0,0,58,45,35,0,0,226,0,0,0


In [40]:
small_area_df.loc[0, 'State / County Name'].replace(' County','')

'Autauga (AL)'

In [47]:
f"{all_years.loc[0,'County']} ({us_state_abbrev[all_years.loc[0,'State']]})"

'Baldwin (AL)'

In [51]:
def combine_state_county (df):
    new_column = []
    for index, row in df.iterrows():
        new_column.append(f"{df.loc[index,'County']} County, ({us_state_abbrev[df.loc[index,'State']]})")

    return new_column

In [52]:
combine_state_county(all_years)

KeyError: 'Country Of Mexico'

In [22]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

In [24]:
small_area_df.shape

(78543, 44)

In [39]:
small_area_df['State / County Name'].split()

0         [Autauga, County, (AL)]
1         [Autauga, County, (AL)]
2         [Autauga, County, (AL)]
3         [Autauga, County, (AL)]
4         [Autauga, County, (AL)]
5         [Autauga, County, (AL)]
6         [Autauga, County, (AL)]
7         [Autauga, County, (AL)]
8         [Autauga, County, (AL)]
9         [Autauga, County, (AL)]
10        [Autauga, County, (AL)]
11        [Autauga, County, (AL)]
12        [Autauga, County, (AL)]
13        [Autauga, County, (AL)]
14        [Autauga, County, (AL)]
15        [Autauga, County, (AL)]
16        [Autauga, County, (AL)]
17        [Autauga, County, (AL)]
18        [Autauga, County, (AL)]
19        [Autauga, County, (AL)]
20        [Autauga, County, (AL)]
21        [Autauga, County, (AL)]
22        [Autauga, County, (AL)]
23        [Autauga, County, (AL)]
24        [Autauga, County, (AL)]
25        [Baldwin, County, (AL)]
26        [Baldwin, County, (AL)]
27        [Baldwin, County, (AL)]
28        [Baldwin, County, (AL)]
29        [Bal