## Time Series Analysis of US Air Quality by State and County

### Part V: Finalizing Data

Author: Gem Ruby <br>
Date: April 2023

In [None]:
#import applicable libraries
import numpy as np
import pandas as pd
import os
import requests
import warnings
warnings.filterwarnings("ignore")

# plotting
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive inside the Colab runtime; every data path below lives under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the capstone project folder on the mounted drive the working directory
project_dir = '/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone'
os.chdir(project_dir)

In [None]:
# Load the complete daily AQI file (the file name indicates coverage from 1980 to 2021)
aqi = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone/aqi_daily_1980_to_2021.csv'
)

In [None]:
# Load the county-level daily AQI subset that the rest of this notebook cleans and extends
sub_aqi = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone/Data/county 2015-2022.csv'
)

In [None]:
# Parse the 'Date' column of both frames into datetime64 values
for frame in (aqi, sub_aqi):
    frame['Date'] = pd.to_datetime(frame['Date'])

## I. Interpolation of missing AQI values

In [None]:
# Drop the columns that are not needed downstream ('Category' is recomputed after interpolation)
unneeded_columns = [
    'State Code',
    'County Code',
    'Category',
    'Defining Site',
    'Number of Sites Reporting',
]
sub_aqi = sub_aqi.drop(columns=unneeded_columns)

In [None]:
# Confirm the remaining columns and their dtypes (full per-column summary)
sub_aqi.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2271501 entries, 0 to 2271500
Data columns (total 5 columns):
 #   Column              Dtype         
---  ------              -----         
 0   State Name          object        
 1   county Name         object        
 2   Date                datetime64[ns]
 3   AQI                 int64         
 4   Defining Parameter  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 86.7+ MB


In [None]:
# Fill the gaps in every county's daily AQI record.
#
# For each (State Name, county Name) pair the record is expanded to one row per
# calendar day between that county's first and last observation. Rows added for
# missing days get a linearly interpolated AQI and 'EST' as Defining Parameter.
key_columns = ['State Name', 'county Name']

# Collect the per-county frames in a list and concatenate once at the end;
# concatenating onto a growing frame inside the loop re-copies all earlier rows
# on every iteration.
county_frames = []

# groupby slices the frame once instead of re-scanning every row with a boolean
# mask for each county. Rows whose state or county is NaN are skipped.
for (state_name, county_name), county_df in sub_aqi.groupby(key_columns, sort=False):
    # One row per calendar day between the county's first and last observation
    all_dates_df = pd.DataFrame({
        'Date': pd.date_range(
            start=county_df['Date'].min(),
            end=county_df['Date'].max(),
            freq='D',
        )
    })

    # Right merge keeps every calendar day; days without an observation come
    # back with NaN in all other columns
    current_df = pd.merge(county_df, all_dates_df, on='Date', how='right')

    # Label the added rows. The result is assigned back instead of calling
    # fillna(inplace=True) on a column selection, which is chained assignment
    # and does not update the frame under pandas copy-on-write.
    current_df['State Name'] = current_df['State Name'].fillna(state_name)
    current_df['county Name'] = current_df['county Name'].fillna(county_name)
    current_df['Defining Parameter'] = current_df['Defining Parameter'].fillna('EST')

    # Linear interpolation of the missing AQI values between observed days
    current_df['AQI'] = current_df['AQI'].interpolate(method='linear')

    county_frames.append(current_df)

if county_frames:
    interpolated_data = pd.concat(county_frames, ignore_index=True)
else:
    # No counties at all: keep an empty frame with the expected columns
    interpolated_data = pd.DataFrame(columns=sub_aqi.columns)

# Sort by State Name, county Name, and Date, then renumber the rows
interpolated_data = (
    interpolated_data
    .sort_values(by=key_columns + ['Date'])
    .reset_index(drop=True)
)

In [None]:
# Confirm the interpolated frame: row count, columns, and dtypes
interpolated_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2580199 entries, 0 to 2580198
Data columns (total 5 columns):
 #   Column              Dtype         
---  ------              -----         
 0   State Name          object        
 1   county Name         object        
 2   Date                datetime64[ns]
 3   AQI                 float64       
 4   Defining Parameter  object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 98.4+ MB


In [None]:
# Preview the last 10 rows of the interpolated frame
interpolated_data.tail(n=10)

Unnamed: 0,State Name,county Name,Date,AQI,Defining Parameter
2580189,Wyoming,Weston,2021-12-22,39.0,Ozone
2580190,Wyoming,Weston,2021-12-23,35.0,Ozone
2580191,Wyoming,Weston,2021-12-24,34.0,Ozone
2580192,Wyoming,Weston,2021-12-25,38.0,Ozone
2580193,Wyoming,Weston,2021-12-26,34.0,Ozone
2580194,Wyoming,Weston,2021-12-27,34.0,Ozone
2580195,Wyoming,Weston,2021-12-28,35.0,Ozone
2580196,Wyoming,Weston,2021-12-29,34.0,Ozone
2580197,Wyoming,Weston,2021-12-30,36.0,Ozone
2580198,Wyoming,Weston,2021-12-31,31.0,Ozone


In [None]:
# AQI category lookup keyed by value range.
#
# Each key is a right-closed pandas Interval (left, right], and the intervals
# are contiguous: interpolated and forecasted AQI values are fractional, so any
# gap between one range's upper bound and the next range's lower bound (e.g.
# 50 < AQI <= 50.01) would leave those rows without a category.
aqi_dict = {
    pd.Interval(left=-np.inf, right=50.0): 'Good',
    pd.Interval(left=50.0, right=100.0): 'Moderate',
    pd.Interval(left=100.0, right=150.0): 'Unhealthy for Sensitive Groups',
    pd.Interval(left=150.0, right=200.0): 'Unhealthy',
    pd.Interval(left=200.0, right=300.0): 'Very Unhealthy',
    pd.Interval(left=300.0, right=np.inf): 'Hazardous'
}

# Assign an AQI category to every row, observed and interpolated alike
aqi_values = interpolated_data['AQI']
interpolated_data['Category'] = aqi_values.map(aqi_dict)

In [None]:
# Spot-check that interpolated ('EST') rows received a category
interpolated_data.head(n=3)

Unnamed: 0,State Name,county Name,Date,AQI,Defining Parameter,Category
0,Alabama,Baldwin,2015-01-03,28.0,PM2.5,Good
1,Alabama,Baldwin,2015-01-04,34.666667,EST,Good
2,Alabama,Baldwin,2015-01-05,41.333333,EST,Good


In [None]:
# Persist the interpolated frame to Google Drive (row index omitted)
interpolated_data.to_csv(
    path_or_buf='/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone/Data/county 2015-2022(interpolated).csv',
    index=False,
)

## II. Importing Forecast into the final dataframe

In [None]:
# Load the forecasted AQI values
forecasted = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone/Data/Forecasted_AQI.csv'
)

In [None]:
# Inspect the forecast frame: columns, non-null counts, and dtypes
forecasted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7609 entries, 0 to 7608
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    7609 non-null   object 
 1   State   7609 non-null   object 
 2   County  7609 non-null   object 
 3   AQI     7609 non-null   float64
dtypes: float64(1), object(3)
memory usage: 237.9+ KB


In [None]:
# Align the forecast frame with the historical frame's layout: the same column
# names, with State Name, county Name, and Date leading.
#
# Renaming first and reordering by name keeps this cell safe to re-run: rename
# ignores labels that are already renamed, whereas indexing on the original
# 'State'/'County' labels fails on a second run.
forecasted = forecasted.rename(columns={'State': 'State Name', 'County': 'county Name'})

leading_cols = ['State Name', 'county Name', 'Date']
cols = leading_cols + [col for col in forecasted.columns if col not in leading_cols]

# Reorder the columns and renumber the rows from 0
forecasted = forecasted.loc[:, cols].reset_index(drop=True)

In [None]:
# Preview the first 10 forecast rows after renaming and reordering
forecasted.head(n=10)

Unnamed: 0,State Name,county Name,Date,AQI
0,Alabama,Baldwin,2021-12-31,27.210186
1,Alabama,Baldwin,2022-01-01,28.588186
2,Alabama,Baldwin,2022-01-02,29.524904
3,Alabama,Baldwin,2022-01-03,30.161652
4,Alabama,Baldwin,2022-01-04,30.594493
5,Alabama,Baldwin,2022-01-05,30.888723
6,Alabama,Baldwin,2022-01-06,31.08873
7,Alabama,Clay,2021-12-31,25.108759
8,Alabama,Clay,2022-01-01,25.802901
9,Alabama,Clay,2022-01-02,26.701189


In [None]:
# Tag every forecast row with 'FCAST' as its Defining Parameter so it can be told apart from other rows
forecasted = forecasted.assign(**{'Defining Parameter': 'FCAST'})


In [None]:
# Derive the AQI category of each forecast value from the aqi_dict lookup
forecast_aqi = forecasted['AQI']
forecasted['Category'] = forecast_aqi.map(aqi_dict)

In [None]:
# Look for forecast rows dated before 2021-01-01.
# 'Date' is still a string column at this point, so this is a lexicographic
# comparison of YYYY-MM-DD strings.
is_before_2021 = forecasted['Date'] < '2021-01-01'
new_df = forecasted[is_before_2021]

# Display the matching rows
new_df

Unnamed: 0,State Name,county Name,Date,AQI,Defining Parameter,Category
14,Alabama,Colbert,2019-11-01,27.268600,FCAST,Good
15,Alabama,Colbert,2019-11-02,29.046223,FCAST,Good
16,Alabama,Colbert,2019-11-03,30.121091,FCAST,Good
17,Alabama,Colbert,2019-11-04,31.746906,FCAST,Good
18,Alabama,Colbert,2019-11-05,32.162250,FCAST,Good
...,...,...,...,...,...,...
7520,Wyoming,Goshen,2016-12-21,37.627043,FCAST,Good
7521,Wyoming,Goshen,2016-12-22,36.601653,FCAST,Good
7522,Wyoming,Goshen,2016-12-23,36.036767,FCAST,Good
7523,Wyoming,Goshen,2016-12-24,35.725572,FCAST,Good


In [None]:
# Keep only the forecast rows dated after 2021-10-01 (string comparison on YYYY-MM-DD).
# NOTE(review): this cutoff differs from the 2021-01-01 used in the check cell,
# so rows dated between the two are dropped without being inspected - confirm
# that this is intended.
is_recent = forecasted['Date'] > '2021-10-01'
filtered_df = forecasted[is_recent]

In [None]:
# Show the first retained forecast row
filtered_df.head(n=1)

Unnamed: 0,State Name,county Name,Date,AQI,Defining Parameter,Category
0,Alabama,Baldwin,2021-12-31,27.210186,FCAST,Good


In [None]:
# Show the first interpolated row to compare its columns with the forecast frame
interpolated_data.head(n=1)

Unnamed: 0,State Name,county Name,Date,AQI,Defining Parameter,Category
0,Alabama,Baldwin,2015-01-03,28.0,PM2.5,Good


In [None]:
# Combine the observed/interpolated history with the retained forecast rows.
#
# The forecast dates are still strings (as read from CSV) while the history
# holds datetime64 values. Parsing them before concatenating keeps 'Date' a
# single dtype, so the sort below is chronological instead of comparing a mix
# of strings and timestamps.
forecast_rows = filtered_df.assign(Date=pd.to_datetime(filtered_df['Date']))

# History is listed first so that, where pandas keeps the input order among
# equal sort keys, an observed row precedes a forecast row for the same date.
merged_df = pd.concat([interpolated_data, forecast_rows], ignore_index=True)

# Sort by State Name, county Name, and Date; ignore_index renumbers the rows
# after sorting so the index follows the sorted order
merged_df = merged_df.sort_values(
    by=['State Name', 'county Name', 'Date'],
    ignore_index=True,
)

In [None]:
# Final check of the merged frame: row count, columns, and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2587188 entries, 6989 to 6988
Data columns (total 6 columns):
 #   Column              Dtype  
---  ------              -----  
 0   State Name          object 
 1   county Name         object 
 2   Date                object 
 3   AQI                 float64
 4   Defining Parameter  object 
 5   Category            object 
dtypes: float64(1), object(5)
memory usage: 138.2+ MB


In [None]:
# Make sure the merged 'Date' column is datetime64 (forecast dates are read from CSV as strings)
merged_dates = pd.to_datetime(merged_df['Date'])
merged_df['Date'] = merged_dates

In [None]:
# Persist the final merged frame to Google Drive (row index omitted)
merged_df.to_csv(
    path_or_buf='/content/drive/MyDrive/2022 - BrainStation/AirQuality_Capstone/Data/county 2015-2022_FINAL.csv',
    index=False,
)

In [None]:
# Report the final (rows, columns) of the merged frame
merged_df.shape

(2587188, 6)

# CONCLUSION
<br>
The final dataframe now combines the historical data from the EPA website (with missing days filled in by linear interpolation) and the 7-day forecasted AQI values. Further analysis will be conducted afterwards.


