In [111]:
import pandas as pd

# Load the raw measurements and index them by (station, day).
# The first CSV column is discarded by position
# (presumably an unnamed row-number column — TODO confirm against the file).
pollution = (
    pd.read_csv('./data/pollution_us_2000_2016.csv.tar.gz')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .set_index(['Site Num', 'Date Local'])
)

# Is the Dataset tidy?

In [112]:
# Show every raw row stored for site 1 on 2000-01-03; several rows for the
# same (site, date) key indicate duplicated observations.
# Only the last expression of a cell is rendered, so this lookup is the
# cell's single statement.
pollution.loc[(1, '2000-01-03')]

  from ipykernel import kernelapp as app


Unnamed: 0_level_0,Unnamed: 1_level_0,State Code,County Code,Address,State,County,City,NO2 Units,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
Site Num,Date Local,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2000-01-03,6,73,"80 E. 'J' ST., CHULA VISTA",California,San Diego,Chula Vista,Parts per billion,27.782609,45.0,17,...,Parts per billion,2.173913,3.0,8,4.0,Parts per million,1.252174,2.8,20,
1,2000-01-03,6,73,"80 E. 'J' ST., CHULA VISTA",California,San Diego,Chula Vista,Parts per billion,27.782609,45.0,17,...,Parts per billion,2.173913,3.0,8,4.0,Parts per million,1.095833,1.8,22,20.0
1,2000-01-03,6,73,"80 E. 'J' ST., CHULA VISTA",California,San Diego,Chula Vista,Parts per billion,27.782609,45.0,17,...,Parts per billion,2.171429,3.0,20,,Parts per million,1.252174,2.8,20,
1,2000-01-03,6,73,"80 E. 'J' ST., CHULA VISTA",California,San Diego,Chula Vista,Parts per billion,27.782609,45.0,17,...,Parts per billion,2.171429,3.0,20,,Parts per million,1.095833,1.8,22,20.0


At first sight, it seems tidy.
Each variable has its separate column, each measurement per station per day has its own row, and there is only one type of data object (measurement per station) in the dataset.

HOWEVER, there are lots of duplicates! For example, there are 4 observations for Site 3002 for the date 2000-01-01. We double-checked whether the dataset contained additional information (e.g. one measurement each for morning, noon, evening and night) that was not parsed correctly, such as a datetime being truncated to a date, but that wasn't the case. We will therefore have to deal with these duplicates first before we do further analysis.

# Duplicate Handling
As we already saw, there are a lot of duplicates in this dataset.
To deal with them, we shall use the following strategy:
- A duplicate is an observation from the same station on the same date.
- For every pollutant, there are 4 columns (apart from the measurement scale, which seems to be the same for the duplicate observations):
    - "&lt;pollutant&gt; Mean" is, well, the mean of this day's measurements. To unify those, we will take the mean of any non-NaN measurements of the duplicate rows (the mean of means so to say).
    - "&lt;pollutant&gt; 1st Max Value" is the maximum measured value of that day. To merge this column, we will take the maximum of the duplicate values (to pick the "actual" highest value of that day).
    - "&lt;pollutant&gt; 1st Max Hour" is the hour in which the maximum value was measured. Here we take the value of the observation from which we sourced the maximum "&lt;pollutant&gt; 1st Max Value".
    - "&lt;pollutant&gt; AQI" is the so-called Air Quality Index, which is a non-linear scale and should probably not be thoughtlessly averaged. We will therefore calculate it anew for the merged duplicate observations.

In [114]:
# Merging duplicates needs the max of "<pollutant> 1st Max Value" per group while
# keeping the "<pollutant> 1st Max Hour" that belongs to that max value.
# Both values are coupled into one (value, hour) tuple column, the tuple column is
# max-aggregated, and the result is split back into two separate columns.
# NOTE(review): tuples compare element-wise, so if several duplicate rows share the
# same max value, the largest hour wins — confirm this tie-break is acceptable.
POLLUTANTS = ['NO2', 'O3', 'SO2', 'CO']

for pollutant in POLLUTANTS:
    pollution[pollutant + ' Zipped'] = list(zip(
        pollution[pollutant + ' 1st Max Value'],
        pollution[pollutant + ' 1st Max Hour'],
    ))

# Per pollutant: average the daily means, take the max of the (value, hour) tuples.
# The AQI columns are deliberately not aggregated here.
aggregations = {}
for pollutant in POLLUTANTS:
    aggregations[pollutant + ' Mean'] = 'mean'
    aggregations[pollutant + ' Zipped'] = 'max'

# Group by all the non-pollutant columns and aggregate all but the AQI
pollution_grouped = pollution.groupby(['Site Num', 'Date Local', 'State', 'County', 'City', 'Address'])
pollution_dedup = pollution_grouped.agg(aggregations).reset_index()

# Split the tuple-columns back into two separate columns (max value and its hour).
# One DataFrame is built per pollutant from the list of tuples instead of calling
# `.apply(pd.Series)`, which constructs a Series object for every single row.
# dtype=float keeps both resulting columns float64, as before.
for pollutant in POLLUTANTS:
    value_col = pollutant + ' 1st Max Value'
    hour_col = pollutant + ' 1st Max Hour'
    unzipped = pd.DataFrame(
        pollution_dedup[pollutant + ' Zipped'].tolist(),
        index=pollution_dedup.index,
        columns=[value_col, hour_col],
        dtype=float,
    )
    pollution_dedup[value_col] = unzipped[value_col]
    pollution_dedup[hour_col] = unzipped[hour_col]

Unnamed: 0,Site Num,Date Local,State,County,City,Address,NO2 Mean,NO2 Zipped,O3 Mean,O3 Zipped,...,CO Mean,CO Zipped,NO2 1st Max Value,NO2 1st Max Hour,O3 1st Max Value,O3 1st Max Hour,SO2 1st Max Value,SO2 1st Max Hour,CO 1st Max Value,CO 1st Max Hour
0,1,2000-01-01,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",10.913043,"(20.0, 0)",0.031625,"(0.043, 9)",...,0.603382,"(1.1, 8)",20.0,0.0,0.043,9.0,3.0,4.0,1.1,8.0
1,1,2000-01-02,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",10.869565,"(35.0, 18)",0.026833,"(0.04, 9)",...,0.560145,"(1.4, 18)",35.0,18.0,0.04,9.0,2.0,2.0,1.4,18.0
2,1,2000-01-03,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",27.782609,"(45.0, 17)",0.011333,"(0.022000000000000002, 9)",...,1.174003,"(2.8, 20)",45.0,17.0,0.022,9.0,3.0,20.0,2.8,20.0
3,1,2000-01-04,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",33.869565,"(58.0, 17)",0.009417,"(0.021, 8)",...,1.249003,"(2.6, 20)",58.0,17.0,0.021,8.0,5.0,16.0,2.6,20.0
4,1,2000-01-05,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",34.181818,"(50.0, 18)",0.011875,"(0.027000000000000003, 10)",...,1.588258,"(4.0, 7)",50.0,18.0,0.027,10.0,4.0,7.0,4.0,7.0
5,1,2000-01-06,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",31.318182,"(51.0, 20)",0.011292,"(0.024, 8)",...,1.389584,"(3.4, 8)",51.0,20.0,0.024,8.0,4.0,8.0,3.4,8.0
6,1,2000-01-07,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",30.608696,"(43.0, 8)",0.0115,"(0.025, 9)",...,1.176359,"(2.4, 7)",43.0,8.0,0.025,9.0,12.0,11.0,2.4,7.0
7,1,2000-01-08,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",31.695652,"(48.0, 11)",0.008083,"(0.018000000000000002, 9)",...,1.488497,"(3.3, 19)",48.0,11.0,0.018,9.0,6.0,10.0,3.3,19.0
8,1,2000-01-09,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",33.043478,"(57.0, 18)",0.013417,"(0.03, 10)",...,1.314221,"(2.6, 20)",57.0,18.0,0.03,10.0,5.0,12.0,2.6,20.0
9,1,2000-01-10,California,San Diego,Chula Vista,"80 E. 'J' ST., CHULA VISTA",35.304348,"(50.0, 19)",0.015292,"(0.037000000000000005, 10)",...,1.372192,"(2.4, 19)",50.0,19.0,0.037,10.0,4.0,10.0,2.4,19.0


In [135]:
# Drop the tuple-columns, they are no longer needed
# NOTE(review): reset_index() without drop=True keeps the previous row labels as a
# new "index" column — confirm that column is wanted, otherwise pass drop=True.
pollution_dedup = (
    pollution_dedup
    .drop(columns=['NO2 Zipped', 'O3 Zipped', 'SO2 Zipped', 'CO Zipped'])
    .reset_index()
)

In [138]:
# --- Recalculate the AQI ---
# This currently fails because there are still negative values in the mean-entries

# import aqi
# pollution_dedup.iloc[102281]
# pollution_dedup.head(200000).apply(lambda x: aqi.to_iaqi(aqi.POLLUTANT_SO2_1H, x['SO2 Mean']), axis=1)

index                                                 102281
Site Num                                                   9
Date Local                                        2011-08-06
State                                               Virginia
County                                       Alexandria City
City                                              Alexandria
Address              517 N SAINT ASAPH ST, ALEXANDRIA HEALTH
NO2 Mean                                             3.47917
O3 Mean                                             0.030958
SO2 Mean                                            -1.05417
CO Mean                                              0.30625
NO2 1st Max Value                                          5
NO2 1st Max Hour                                          16
O3 1st Max Value                                        0.04
O3 1st Max Hour                                            9
SO2 1st Max Value                                       -0.4
SO2 1st Max Hour        

In [142]:
# List every raw observation whose daily CO mean is below zero
pollution.loc[pollution['CO Mean'].lt(0)]

Unnamed: 0_level_0,Unnamed: 1_level_0,State Code,County Code,Address,State,County,City,NO2 Units,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,...,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI,NO2 Zipped,O3 Zipped,SO2 Zipped,CO Zipped
Site Num,Date Local,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2009,2011-03-28,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,4.838095,14.1,23,...,1.0,Parts per million,-0.015385,0.000,11,,"(14.1, 23)","(0.04, 9)","(1.1, 0)","(0.0, 11)"
2009,2011-03-28,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,4.838095,14.1,23,...,,Parts per million,-0.015385,0.000,11,,"(14.1, 23)","(0.04, 9)","(1.1, 8)","(0.0, 11)"
2009,2011-03-29,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,6.704762,14.0,6,...,1.0,Parts per million,-0.019048,0.100,4,,"(14.0, 6)","(0.035, 10)","(1.0, 9)","(0.1, 4)"
2009,2011-03-29,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,6.704762,14.0,6,...,1.0,Parts per million,-0.009524,0.100,7,1.0,"(14.0, 6)","(0.035, 10)","(1.0, 9)","(0.1, 7)"
2009,2011-03-29,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,6.704762,14.0,6,...,,Parts per million,-0.019048,0.100,4,,"(14.0, 6)","(0.035, 10)","(0.6, 11)","(0.1, 4)"
2009,2011-03-29,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,6.704762,14.0,6,...,,Parts per million,-0.009524,0.100,7,1.0,"(14.0, 6)","(0.035, 10)","(0.6, 11)","(0.1, 7)"
2009,2011-03-30,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,5.834783,13.0,7,...,0.0,Parts per million,-0.039130,0.000,5,,"(13.0, 7)","(0.034, 10)","(0.5, 15)","(0.0, 5)"
2009,2011-03-30,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,5.834783,13.0,7,...,0.0,Parts per million,-0.041667,0.000,8,0.0,"(13.0, 7)","(0.034, 10)","(0.5, 15)","(0.0, 8)"
2009,2011-03-30,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,5.834783,13.0,7,...,,Parts per million,-0.039130,0.000,5,,"(13.0, 7)","(0.034, 10)","(0.4, 17)","(0.0, 5)"
2009,2011-03-30,6,85,22601 Voss Ave,California,Santa Clara,Cupertino,Parts per billion,5.834783,13.0,7,...,,Parts per million,-0.041667,0.000,8,0.0,"(13.0, 7)","(0.034, 10)","(0.4, 17)","(0.0, 8)"


In [143]:
# --- Missing values analysis ---

# Keep only the rows in which at least one column is NaN
missings = pollution_dedup.loc[lambda frame: frame.isnull().any(axis='columns')]

# Compare the number of incomplete rows with the size of the whole dataset
print("Number of rows with missings: ", len(missings), sep="")
print("Number of rows in the dataset: ", len(pollution_dedup), sep="")

# NaN count per column = number of incomplete rows minus that column's non-NaN entries.
# After merging the duplicate rows, there seem to be no missings left
axis_counts = len(missings) - missings.count(axis='index')
axis_counts

Number of rows with missings: 0
Number of rows in the dataset: 412856


index                0
Site Num             0
Date Local           0
State                0
County               0
City                 0
Address              0
NO2 Mean             0
O3 Mean              0
SO2 Mean             0
CO Mean              0
NO2 1st Max Value    0
NO2 1st Max Hour     0
O3 1st Max Value     0
O3 1st Max Hour      0
SO2 1st Max Value    0
SO2 1st Max Hour     0
CO 1st Max Value     0
CO 1st Max Hour      0
dtype: int64