#### Import some required libraries

In [None]:
import pandas as pd
import zipfile
import numpy as np

#### Read in the source data

In [None]:
# Load the source CSV directly from the zip archive at this relative path.
df0 = pd.read_csv('../data/archive.zip', compression='zip')
# Preview the first rows of the loaded frame.
df0.head()

#### Drop some redundant columns
Also parse the `Date Local` column into datetime values.

In [None]:
# Columns this notebook treats as redundant and removes before aggregation.
redundant_columns = [
    'Unnamed: 0',
    'State Code',
    'County Code',
    'Site Num',
    'NO2 Units',
    'O3 Units',
    'SO2 Units',
    'CO Units',
]
df1 = df0.drop(columns=redundant_columns)
# Convert the 'Date Local' column to datetime values.
df1['Date Local'] = pd.to_datetime(df1['Date Local'])
df1.info()

In [None]:
# Number of groups processed so far; aggregate_by_worst_reading increments it
# and prints it every 10000 groups as a progress indicator.
n = 0

class MeasureAggregator:
    """Track the largest non-NaN value of one column across a series of rows.

    An instance is fed rows one at a time through ``consider`` and remembers
    the highest value seen in ``measure_column``. When ``hour_column`` is
    given, the value of that column from the same row is remembered as well.
    ``update`` writes the result into an output mapping and ``reset`` clears
    the state so the instance can be reused for another set of rows.

    Attributes:
        measure: Name of the column whose maximum is tracked.
        hour_column: Name of the companion column copied from the row that
            holds the maximum, or ``None`` when there is no such column.
        worst_value: Largest non-NaN value seen since the last reset, or
            ``None`` if no such value has been seen.
        worst_hour: ``hour_column`` value taken from the row that supplied
            ``worst_value``, or ``None``.
        worst_row: The row that supplied ``worst_value``, or ``None``.
    """

    def __init__(self, measure_column, hour_column=None):
        """Create an aggregator for ``measure_column``.

        Args:
            measure_column: Key used to read the measured value from a row.
            hour_column: Optional key of a companion value to carry over from
                the row with the largest measured value.
        """
        self.measure = measure_column
        self.hour_column = hour_column
        # Initialise every piece of tracked state in one place so that
        # __init__ and reset can never disagree about which fields exist.
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.worst_value = None
        self.worst_hour = None
        # Cleared together with the value so a row from a previous set of
        # rows is never left behind after a reset.
        self.worst_row = None

    def consider(self, row):
        """Record ``row`` if its measured value is the largest seen so far.

        NaN values are ignored. On a tie the earlier row is kept, because
        only a strictly greater value replaces the current one.

        Args:
            row: Mapping-like object indexable by ``measure`` and, when set,
                by ``hour_column``.
        """
        value = row[self.measure]
        if np.isnan(value):
            return
        if self.worst_value is None or value > self.worst_value:
            self.worst_value = value
            self.worst_row = row
            if self.hour_column is not None:
                self.worst_hour = row[self.hour_column]

    # The parameter name shadows the builtin ``dict``; it is kept unchanged
    # so that existing callers passing it by keyword keep working.
    def update(self, dict):
        """Write the recorded result into ``dict``.

        ``dict[measure]`` receives the largest value seen, or NaN when no
        non-NaN value was seen. When ``hour_column`` is set,
        ``dict[hour_column]`` receives the recorded companion value, which is
        ``None`` when nothing was recorded.

        Args:
            dict: Mutable mapping that receives the output entries.
        """
        dict[self.measure] = (
            np.nan if self.worst_value is None else self.worst_value
        )
        if self.hour_column is not None:
            dict[self.hour_column] = self.worst_hour


# (value column, companion hour column) for every aggregated measure, in
# output order. None means the measure has no companion hour column.
_MEASURE_COLUMNS = [
    ('NO2 Mean', None),
    ('NO2 1st Max Value', 'NO2 1st Max Hour'),
    ('NO2 AQI', None),
    ('O3 Mean', None),
    ('O3 1st Max Value', 'O3 1st Max Hour'),
    ('O3 AQI', None),
    ('SO2 Mean', None),
    ('SO2 1st Max Value', 'SO2 1st Max Hour'),
    ('SO2 AQI', None),
    ('CO Mean', None),
    ('CO 1st Max Value', 'CO 1st Max Hour'),
    ('CO AQI', None),
]

# One aggregator per measure; aggregate_by_worst_reading reuses these
# instances for every group.
measures = [
    MeasureAggregator(value_column, hour_column)
    for value_column, hour_column in _MEASURE_COLUMNS
]


def aggregate_by_worst_reading(group):
    """Collapse one group of rows into a single row of per-measure maxima.

    The identifying fields are copied from the first row of ``group``. Every
    aggregator in the module-level ``measures`` list is reset, shown each row
    of the group, and then asked to write its result into the output.

    Also increments the module-level counter ``n`` and prints it every
    10000 calls as a progress indicator.

    Args:
        group: DataFrame holding the rows of one group.

    Returns:
        A pandas Series with the copied fields followed by the entries
        written by each aggregator.
    """
    global n  # Show progress as this process can be slow
    n += 1
    if n % 10000 == 0:
        print(n)

    # The aggregators are shared between calls, so clear them first.
    for aggregator in measures:
        aggregator.reset()

    # Fields that are not aggregated come from the first row in the group.
    first_row = group.iloc[0]
    out = {
        column: first_row[column]
        for column in ('Address', 'State', 'County', 'City', 'Date Local')
    }

    for _, reading in group.iterrows():
        for aggregator in measures:
            aggregator.consider(reading)

    for aggregator in measures:
        aggregator.update(out)

    return pd.Series(out)


In [None]:
# Reduce the rows for each (Address, Date Local) pair to a single row using
# aggregate_by_worst_reading. That function reads 'Address' and 'Date Local'
# from the group's first row, so the grouping columns must stay in each group.
# NOTE(review): newer pandas releases deprecate include_groups=True — confirm
# against the pandas version this notebook is run with.
df2 = df1.groupby(['Address', 'Date Local']).apply(aggregate_by_worst_reading, include_groups=True)

In [43]:
# Write the aggregated frame as a zip-compressed CSV. index=False leaves out
# the group index; 'Address' and 'Date Local' are also written as columns by
# aggregate_by_worst_reading.
df2.to_csv('../data/cleaned_pollution_data.zip', index=False, compression='zip')