## Time Series Analysis of US Air Quality by State and County

### Part II: Prepare the web-scraping URLs and zip files for the updated city and county data sets and the AQI components:

Author: Gem Ruby <br>
Date: April 2023

Reference weburl: https://aqs.epa.gov/aqsweb/airdata/daily_44201_1981.zip

In [None]:
#import libraries
import pandas as pd
import os
import zipfile

In [None]:
#connect to Google Colab: mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#change the working directory to the project's Data folder on the mounted drive
#(the download functions create their output folders relative to this directory)
os.chdir('/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone/Data')

In [None]:
# Map the code that appears in each EPA download file name to the label of
# the particulate it identifies (the label is later used as the folder name).
Particulates = {
    '44201': 'Ozone',
    '42401': 'SO2',
    '42101': 'CO',
    '42602': 'NO2',
    '88101': 'PM2.5_FRM',
    '88502': 'PM2.5_NON',
    '81102': 'PM10',
}

In [None]:
# Show every code next to the label it maps to
for key, pollutant in Particulates.items():
    print(key, pollutant)

42401 SO2
42101 CO
42602 NO2
88101 PM2.5_FRM
88502 PM2.5_NON
81102 PM10


In [None]:
#import all necessary libraries
import pandas as pd
import requests
import zipfile
import os

In [None]:
#define function that will download AQI components
def download_and_concat(kv_pairs, years=None):
    """Download the yearly daily-summary zip files and merge them per entry.

    For every ``key -> value`` pair, the archives
    ``https://aqs.epa.gov/aqsweb/airdata/daily_<key>_<year>.zip`` are
    downloaded into a folder named after the lower-cased *value*, every CSV
    found inside them is read, and the rows are written as one combined file
    ``<folder>/<folder>.csv``. All paths are relative to the current working
    directory.

    Args:
        kv_pairs: Mapping of the code used in the file name (e.g. ``'44201'``)
            to a label (e.g. ``'Ozone'``). The label, lower-cased, is the
            folder and combined-CSV name.
        years: Iterable of years to download. Defaults to
            ``range(2015, 2022)``, i.e. 2015 through 2021 inclusive.

    Returns:
        None. The zip files and the combined CSV are written to disk.

    Raises:
        requests.HTTPError: If the server answers a download with an error
            status (raised before anything is written for that file).
        zipfile.BadZipFile: If a downloaded file is not a valid zip archive.
    """
    if years is None:
        years = range(2015, 2022)
    # Materialize once so a one-shot iterable can be reused for every key.
    years = list(years)

    for key, value in kv_pairs.items():
        # The folder is named after the (lower-cased) value, not the key.
        folder_name = value.lower()
        os.makedirs(folder_name, exist_ok=True)

        dfs = []
        for year in years:
            filename = f'daily_{key.lower()}_{year}.zip'
            url = 'https://aqs.epa.gov/aqsweb/airdata/' + filename
            zip_path = os.path.join(folder_name, filename)

            # Fail loudly on 404/5xx instead of saving an error page as ".zip";
            # the timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=60)
            response.raise_for_status()

            with open(zip_path, 'wb') as f:
                f.write(response.content)

            # Read every CSV member straight from the archive.
            with zipfile.ZipFile(zip_path, 'r') as zip_file:
                for member in zip_file.namelist():
                    if member.lower().endswith('.csv'):
                        # Close the member handle once pandas has consumed it.
                        with zip_file.open(member) as csv_file:
                            dfs.append(pd.read_csv(csv_file))

        # pd.concat raises on an empty list, so skip entries with no data.
        if not dfs:
            print(f'No CSV data found for {value!r}; nothing written to {folder_name}.csv')
            continue

        combined_data = pd.concat(dfs, ignore_index=True)
        combined_data.to_csv(os.path.join(folder_name, folder_name + '.csv'), index=False)


In [None]:
#download all data: one folder and one combined CSV per entry in Particulates
download_and_concat(Particulates)

In [None]:
def download_and_concat2(kv_pairs, years=None):
    """Download the yearly "daily AQI by ..." zip files and merge them per entry.

    For every ``key -> value`` pair, the archives
    ``https://aqs.epa.gov/aqsweb/airdata/daily_aqi_by_<key>_<year>.zip`` are
    downloaded into a folder named after the lower-cased *value*, every CSV
    found inside them is read, and the rows are written as one combined file
    ``<folder>/<folder>.csv``. All paths are relative to the current working
    directory.

    Args:
        kv_pairs: Mapping of the grouping used in the file name (e.g.
            ``'cbsa'`` or ``'county'``) to a label. The label, lower-cased,
            is the folder and combined-CSV name.
        years: Iterable of years to download. Defaults to
            ``range(2015, 2022)``, i.e. 2015 through 2021 inclusive; pass
            ``range(2015, 2023)`` to include 2022 as well.

    Returns:
        None. The zip files and the combined CSV are written to disk.

    Raises:
        requests.HTTPError: If the server answers a download with an error
            status (raised before anything is written for that file).
        zipfile.BadZipFile: If a downloaded file is not a valid zip archive.
    """
    if years is None:
        years = range(2015, 2022)
    # Materialize once so a one-shot iterable can be reused for every key.
    years = list(years)

    for key, value in kv_pairs.items():
        # The folder is named after the (lower-cased) value, not the key.
        folder_name = value.lower()
        os.makedirs(folder_name, exist_ok=True)

        dfs = []
        for year in years:
            filename = f'daily_aqi_by_{key.lower()}_{year}.zip'
            url = 'https://aqs.epa.gov/aqsweb/airdata/' + filename
            zip_path = os.path.join(folder_name, filename)

            # Fail loudly on 404/5xx instead of saving an error page as ".zip";
            # the timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=60)
            response.raise_for_status()

            with open(zip_path, 'wb') as f:
                f.write(response.content)

            # Read every CSV member straight from the archive.
            with zipfile.ZipFile(zip_path, 'r') as zip_file:
                for member in zip_file.namelist():
                    if member.lower().endswith('.csv'):
                        # Close the member handle once pandas has consumed it.
                        with zip_file.open(member) as csv_file:
                            dfs.append(pd.read_csv(csv_file))

        # pd.concat raises on an empty list, so skip entries with no data.
        if not dfs:
            print(f'No CSV data found for {value!r}; nothing written to {folder_name}.csv')
            continue

        combined_data = pd.concat(dfs, ignore_index=True)
        combined_data.to_csv(os.path.join(folder_name, folder_name + '.csv'), index=False)


In [None]:
#Daily AQI by CBSA and by County (smaller and updated dataset).
#Keys are the file-name suffixes, values the folder labels; download_and_concat2
#fetches 2015 through 2021, because range(2015, 2022) excludes 2022.
Updated = {
    'cbsa': 'CBSA',
    'county': 'COUNTY',
}

In [None]:
#pull additional data for 2015 through 2021 (range(2015, 2022) in download_and_concat2 excludes 2022)
download_and_concat2(Updated)

In [None]:
#Meteorological information: file-name key -> folder label
Meteorological = {
    'WIND': 'Wind',
    'TEMP': 'Temperature',
    'PRESS': 'Barometric_Pressure',
    'RH_DP': 'RH_and_Dewpoint',
}

In [None]:
#download meteorological information
def download_and_concat3(kv_pairs, years=None):
    """Download the yearly meteorological zip files and merge them per entry.

    For every ``key -> value`` pair, the archives
    ``https://aqs.epa.gov/aqsweb/airdata/daily_<KEY>_<year>.zip`` (key
    upper-cased) are downloaded into a folder named after the lower-cased
    *value*, every CSV found inside them is read, and the rows are written as
    one combined file ``<folder>/<folder>.csv``. All paths are relative to the
    current working directory.

    Args:
        kv_pairs: Mapping of the key used in the file name (e.g. ``'WIND'``)
            to a label (e.g. ``'Wind'``). The label, lower-cased, is the
            folder and combined-CSV name.
        years: Iterable of years to download. Defaults to
            ``range(2015, 2022)``, i.e. 2015 through 2021 inclusive.

    Returns:
        None. The zip files and the combined CSV are written to disk.

    Raises:
        requests.HTTPError: If the server answers a download with an error
            status (raised before anything is written for that file).
        zipfile.BadZipFile: If a downloaded file is not a valid zip archive.
    """
    if years is None:
        years = range(2015, 2022)
    # Materialize once so a one-shot iterable can be reused for every key.
    years = list(years)

    for key, value in kv_pairs.items():
        # The folder is named after the (lower-cased) value, not the key.
        folder_name = value.lower()
        os.makedirs(folder_name, exist_ok=True)

        dfs = []
        for year in years:
            filename = f'daily_{key.upper()}_{year}.zip'
            url = 'https://aqs.epa.gov/aqsweb/airdata/' + filename
            zip_path = os.path.join(folder_name, filename)

            # Fail loudly on 404/5xx instead of saving an error page as ".zip";
            # the timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=60)
            response.raise_for_status()

            with open(zip_path, 'wb') as f:
                f.write(response.content)

            # Read every CSV member straight from the archive.
            with zipfile.ZipFile(zip_path, 'r') as zip_file:
                for member in zip_file.namelist():
                    if member.lower().endswith('.csv'):
                        # Close the member handle once pandas has consumed it.
                        with zip_file.open(member) as csv_file:
                            dfs.append(pd.read_csv(csv_file))

        # pd.concat raises on an empty list, so skip entries with no data.
        if not dfs:
            print(f'No CSV data found for {value!r}; nothing written to {folder_name}.csv')
            continue

        combined_data = pd.concat(dfs, ignore_index=True)
        combined_data.to_csv(os.path.join(folder_name, folder_name + '.csv'), index=False)

In [None]:
#download and combine the meteorological files: one folder and one combined CSV per entry
download_and_concat3(Meteorological)

# Next steps: 
- We will be cleaning and modeling the data once all the information has been entered into the system. 