In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
# Pick the data directory. When the COLAB_GPU environment variable is present
# the notebook is assumed to run on Google Colab, so the shared Drive is
# mounted and used; otherwise the relative local `data` directory is used.
if os.environ.get('COLAB_GPU') is None:
    data_path = 'data'
else:
    from google.colab import drive
    drive.mount('/drive')
    data_path = '/drive/Shared drives/Capstone/notebooks/data'


### PREDICTOR: ENVIRONMENTAL QUALITY
**Metric:** Air quality index

The air quality index, or AQI, is an index for reporting daily air quality. It tells how clean or polluted the
air is and what associated health effects might be a concern in the community. The AQI includes five
major air pollutants regulated by the Clean Air Act: ground-level ozone, particle pollution (also known
as particulate matter), carbon monoxide, sulfur dioxide, and nitrogen dioxide. For each of these
pollutants, the Environmental Protection Agency has established national air quality standards to
protect public health. Ground-level ozone and airborne particles are the two pollutants that pose the
greatest threat to human health in the US. Values range from 0 to 500 and are categorized into a
six-point scale: good, moderate, unhealthy for sensitive groups, unhealthy, very unhealthy, and hazardous.

**Source:** https://aqs.epa.gov/aqsweb/airdata/annual_aqi_by_county_2019.zip, listed on the EPA download page https://aqs.epa.gov/aqsweb/airdata/download_files.html

**Documentation:** https://aqs.epa.gov/aqsweb/airdata/FileFormats.html#_daily_summary_files 

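**Notes:** The county score is the EPA's annual `Median AQI` value for each county (renamed to `AQI`); no further averaging is done in this notebook. Counties are matched to FIPS codes by state and upper-cased county name, and counties without a match in the crosswalk are dropped.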


In [3]:
# Load the raw 2019 annual AQI-by-county file from the data directory.
aqi_raw_path = f'{data_path}/raw/annual_aqi_by_county_2019.csv'
aqi_df = pd.read_csv(aqi_raw_path)

In [4]:
# Expose the 'Median AQI' column under the shorter name 'AQI'.
aqi_df = aqi_df.rename({'Median AQI': 'AQI'}, axis='columns')

In [5]:
# Upper-case the county names so they can be compared with the upper-case
# County_Name values used by the FIPS crosswalk, then preview the first rows.
aqi_df = aqi_df.assign(County=aqi_df['County'].str.upper())
aqi_df.head(n=5)

Unnamed: 0,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,Alabama,BALDWIN,2019,271,237,34,0,0,0,0,80,52,37,0,0,220,51,0
1,Alabama,CLAY,2019,107,97,10,0,0,0,0,67,50,30,0,0,0,107,0
2,Alabama,COLBERT,2019,263,252,11,0,0,0,0,61,47,37,0,0,228,35,0
3,Alabama,DEKALB,2019,361,324,37,0,0,0,0,90,51,39,0,0,331,30,0
4,Alabama,ELMORE,2019,228,208,20,0,0,0,0,100,50,39,0,0,228,0,0


In [6]:
# Take an independent snapshot of the AQI frame as it stands at this point.
aqi_fips_df = aqi_df.copy(deep=True)


In [7]:
# Display the column names of the AQI snapshot.
aqi_fips_df.columns

Index(['State', 'County', 'Year', 'Days with AQI', 'Good Days',
       'Moderate Days', 'Unhealthy for Sensitive Groups Days',
       'Unhealthy Days', 'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
       '90th Percentile AQI', 'AQI', 'Days CO', 'Days NO2', 'Days Ozone',
       'Days PM2.5', 'Days PM10'],
      dtype='object')

In [8]:
# Load the processed state/county -> FIPS crosswalk table.
crosswalk_path = f'{data_path}/processed/state_county_fips.csv'
crosswalk_df = pd.read_csv(crosswalk_path)

In [9]:
# Preview the first five crosswalk rows.
crosswalk_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,NAME,state,county,County_Name,State_Name,State_Abbreviation,State_Abbreviation_County,FIPS
0,1,"Sebastian County, Arkansas",5,131,SEBASTIAN,Arkansas,AR,AR-SEBASTIAN,5131
1,2,"Sevier County, Arkansas",5,133,SEVIER,Arkansas,AR,AR-SEVIER,5133
2,3,"Sharp County, Arkansas",5,135,SHARP,Arkansas,AR,AR-SHARP,5135
3,4,"Stone County, Arkansas",5,137,STONE,Arkansas,AR,AR-STONE,5137
4,5,"Union County, Arkansas",5,139,UNION,Arkansas,AR,AR-UNION,5139


In [10]:
# Attach FIPS codes by matching (county, state) names against the crosswalk.
# The merge starts from `aqi_fips_df` (the snapshot taken before any merge)
# rather than from `aqi_df` itself, so re-running this cell does not merge the
# crosswalk columns a second time (which would create `_x`/`_y` duplicates and
# break the later `aqi_df[['FIPS', 'AQI']]` selection).
# how='left' keeps AQI rows with no crosswalk match; their FIPS is NaN.
aqi_df = aqi_fips_df.merge(
    crosswalk_df,
    how='left',
    left_on=['County', 'State'],
    right_on=['County_Name', 'State_Name'],
)
# Fixed seed so the previewed rows are the same on every run.
aqi_df.sample(5, random_state=0)

Unnamed: 0.1,State,County,Year,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,...,Days PM10,Unnamed: 0,NAME,state,county,County_Name,State_Name,State_Abbreviation,State_Abbreviation_County,FIPS
783,South Carolina,AIKEN,2019,265,230,35,0,0,0,0,...,0,2360.0,"Aiken County, South Carolina",45.0,3.0,AIKEN,South Carolina,SC,SC-AIKEN,45003.0
606,North Carolina,DAVIDSON,2019,350,272,78,0,0,0,0,...,0,1108.0,"Davidson County, North Carolina",37.0,57.0,DAVIDSON,North Carolina,NC,NC-DAVIDSON,37057.0
892,Vermont,RUTLAND,2019,365,322,43,0,0,0,0,...,1,1705.0,"Rutland County, Vermont",50.0,21.0,RUTLAND,Vermont,VT,VT-RUTLAND,50021.0
635,North Dakota,BURKE,2019,365,354,11,0,0,0,0,...,1,1050.0,"Burke County, North Dakota",38.0,13.0,BURKE,North Dakota,ND,ND-BURKE,38013.0
755,Pennsylvania,LEHIGH,2019,360,293,64,3,0,0,0,...,2,1957.0,"Lehigh County, Pennsylvania",42.0,77.0,LEHIGH,Pennsylvania,PA,PA-LEHIGH,42077.0


In [11]:
# Keep only the join key and the metric, and drop counties that found no FIPS
# match in the crosswalk. Chaining .dropna() onto the selection (instead of
# calling dropna(inplace=True) on a slice of aqi_df) avoids pandas'
# SettingWithCopyWarning, which the global warnings filter would otherwise hide.
# The left merge introduced NaNs, which forced FIPS to float (e.g. 45003.0);
# once those rows are gone the codes are cast back to integers so the saved
# table holds 45003 rather than 45003.0.
nonas_df = (
    aqi_df[['FIPS', 'AQI']]
    .dropna(subset=['FIPS'])
    .astype({'FIPS': 'int64'})
)

In [12]:
# Write the FIPS/AQI table to the processed data folder, omitting the index.
output_path = f'{data_path}/processed/air_quality_index.csv'
nonas_df.to_csv(output_path, index=False)