In [17]:
#import dependencies
import matplotlib.pyplot as plot
import pandas as pd
import numpy as np

### Data Cleaning Process ###

* Create a list and a function that will allow us to save time
* Read csv files and append related tables to one another
* Rename column for easier reading 
* Add date columns that will be usefull when its time to plot

In [18]:
# import datetime
#create a list of our pollutants to use as identifiers within our function
pollutants = ["CO","NO2","PM2.5"]
#create a function to append dataframes based on the pollutant and year
#will allow us to assign independent variables to each dataframe
def pull(p):
    #read into each csv based on the pollutant and assign it a dataframe name
    df_2018 = pd.read_csv(f"resources/{p}_2018.csv")
    df_2019 = pd.read_csv(f"resources/{p}_2019.csv")
    df_2020 = pd.read_csv(f"resources/{p}_2020.csv")
    #append the 3 separate pollutant dataframes into one
    df = df_2018.append([df_2019,df_2020])
    #rename the name of the pollutant column
    df = df.rename(columns={"AQS_PARAMETER_DESC":"POLLUTANT"})
    #make the abbreviation the name of our pollutant
    df["ABBREVIATION"] = p
    #format our date columns
    df["Date"] = pd.to_datetime(df["Date"])
    df["MONTH"] = df["Date"].dt.month
    df["DAY"] = df["Date"].dt.day
    df["YEAR"] = df["Date"].dt.year
    #select the columns we want
    #the time period measurement of each pollutant varies, so must use df.iloc to select the column number instead of the column name
    df = df.iloc[:,np.r_[0,4:8,9,11,18:24]]
    #select the rows that we want to match the time period of traffic and that of air quality together, drop rows from 2018 Jan to 2018 Feb
    df = df.drop(df[(df["YEAR"]==2018)&((df["MONTH"]==1)|(df["MONTH"]==2))].index)
    df = df.drop(df[(df["YEAR"]==2020)&((df["MONTH"]==7)|(df["MONTH"]==6)|(df["MONTH"]==5))].index)
    return df

CO_df=pull(pollutants[0])
NO2_df=pull(pollutants[1])
PM25_df=pull(pollutants[2])

CO_df["Site Name"].value_counts()
CO_df

Unnamed: 0,Date,Daily Max 8-hour CO Concentration,UNITS,DAILY_AQI_VALUE,Site Name,PERCENT_COMPLETE,POLLUTANT,SITE_LATITUDE,SITE_LONGITUDE,ABBREVIATION,MONTH,DAY,YEAR
52,2018-03-07,0.3,ppm,3,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,7,2018
53,2018-03-08,0.6,ppm,7,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,8,2018
54,2018-03-09,0.6,ppm,7,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,9,2018
55,2018-03-10,0.5,ppm,6,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,10,2018
56,2018-03-11,0.3,ppm,3,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,11,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,2020-04-26,0.2,ppm,2,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,4,26,2020
214,2020-04-27,0.3,ppm,3,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,4,27,2020
215,2020-04-28,0.3,ppm,3,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,4,28,2020
216,2020-04-29,0.2,ppm,2,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,4,29,2020


In [23]:
#select only monitor checks that were 100% complete for the day to avoid having incomplete data
co_clean_df=CO_df.loc[CO_df["PERCENT_COMPLETE"] == 100.0,:]

#create a groupby function by the year and the month to get monthly data
co_time_group=co_clean_df.groupby(["YEAR","MONTH"])
co_monthly_mean=co_time_group['Daily Max 8-hour CO Concentration'].mean()  

co_clean_df["Date"]=co_clean_df['Date'].dt.date
% store co_monthly_mean
co_clean_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Date,Daily Max 8-hour CO Concentration,UNITS,DAILY_AQI_VALUE,Site Name,PERCENT_COMPLETE,POLLUTANT,SITE_LATITUDE,SITE_LONGITUDE,ABBREVIATION,MONTH,DAY,YEAR
52,2018-03-07,0.3,ppm,3,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,7,2018
53,2018-03-08,0.6,ppm,7,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,8,2018
54,2018-03-09,0.6,ppm,7,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,9,2018
55,2018-03-10,0.5,ppm,6,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,10,2018
56,2018-03-11,0.3,ppm,3,NORTHBROOK WATER PLANT,100.0,Carbon monoxide,42.139996,-87.799227,CO,3,11,2018


In [24]:
#select only monitor checks that were 100% complete for the day to avoid having incomplete data
no2_clean_df=NO2_df.loc[NO2_df["PERCENT_COMPLETE"] == 100.0,:]

#create a groupby function by the year and the month to get monthly data
no2_time_group=no2_clean_df.groupby(["YEAR","MONTH"])
no2_monthly_mean=no2_time_group['Daily Max 1-hour NO2 Concentration'].mean()  

no2_clean_df["Date"]=no2_clean_df['Date'].dt.date
% store no2_monthly_mean
no2_clean_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Date,Daily Max 1-hour NO2 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,PERCENT_COMPLETE,POLLUTANT,SITE_LATITUDE,SITE_LONGITUDE,ABBREVIATION,MONTH,DAY,YEAR
48,2018-03-01,44.2,ppb,42,COM ED MAINTENANCE BLDG,100.0,Nitrogen dioxide (NO2),41.7514,-87.713488,NO2,3,1,2018
49,2018-03-02,77.3,ppb,76,COM ED MAINTENANCE BLDG,100.0,Nitrogen dioxide (NO2),41.7514,-87.713488,NO2,3,2,2018
50,2018-03-03,83.8,ppb,82,COM ED MAINTENANCE BLDG,100.0,Nitrogen dioxide (NO2),41.7514,-87.713488,NO2,3,3,2018
51,2018-03-04,32.1,ppb,30,COM ED MAINTENANCE BLDG,100.0,Nitrogen dioxide (NO2),41.7514,-87.713488,NO2,3,4,2018
52,2018-03-05,21.8,ppb,20,COM ED MAINTENANCE BLDG,100.0,Nitrogen dioxide (NO2),41.7514,-87.713488,NO2,3,5,2018


In [21]:
#select only monitor checks that were 100% complete for the day to avoid having incomplete data
PM25_df_clean_df=PM25_df.loc[PM25_df["PERCENT_COMPLETE"] == 100.0,:]

#create a groupby function by the year and the month to get monthly data
PM25_df_time_group=PM25_df_clean_df.groupby(["YEAR","MONTH"])
PM25_df_monthly_mean=PM25_df_time_group['Daily Mean PM2.5 Concentration'].mean()  

PM25_df_clean_df["Date"]=PM25_df_clean_df['Date'].dt.date
% store PM25_df_monthly_mean
PM25_df_clean_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Date,Daily Mean PM2.5 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,PERCENT_COMPLETE,POLLUTANT,SITE_LATITUDE,SITE_LONGITUDE,ABBREVIATION,MONTH,DAY,YEAR
9,2018-03-01,8.5,ug/m3 LC,35,VILLAGE GARAGE,100.0,PM2.5 - Local Conditions,41.670992,-87.732457,PM2.5,3,1,2018
10,2018-03-03,5.6,ug/m3 LC,23,VILLAGE GARAGE,100.0,PM2.5 - Local Conditions,41.670992,-87.732457,PM2.5,3,3,2018
11,2018-03-09,8.7,ug/m3 LC,36,VILLAGE GARAGE,100.0,PM2.5 - Local Conditions,41.670992,-87.732457,PM2.5,3,9,2018
12,2018-03-21,2.0,ug/m3 LC,8,VILLAGE GARAGE,100.0,PM2.5 - Local Conditions,41.670992,-87.732457,PM2.5,3,21,2018
13,2018-03-27,9.1,ug/m3 LC,38,VILLAGE GARAGE,100.0,PM2.5 - Local Conditions,41.670992,-87.732457,PM2.5,3,27,2018
