# Data Exploration

In [1]:
import numpy as np
import pandas as pd
import requests
import os
import sys
from zipfile import ZipFile

### Download Test Files

In [2]:
def get_file_from_url(url, timeout=60):
    """Download ``url`` into the current working directory unless it is already there.

    The local file name is the last ``/``-separated segment of ``url``. If a file
    with that name already exists, nothing is downloaded.

    The payload is streamed into a temporary ``<name>.part`` file and only renamed
    to its final name after the transfer succeeded, so a failed or interrupted
    download never leaves behind a file that a later call would mistake for a
    finished one.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    timeout : float, optional
        Timeout in seconds, passed to ``requests.get``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (raised by
        ``Response.raise_for_status``).
    """
    # check for file name from url, otherwise download programmatically
    file_name = url.split('/')[-1]
    if os.path.isfile(file_name):
        print(f'file {file_name} already exists\n')
        return

    print(f"file {file_name} doesn't exist")
    print(f"downloading {file_name}\n")

    partial_name = file_name + '.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an error page as if it were the data.
            response.raise_for_status()
            with open(partial_name, mode='wb') as file:
                # Stream in 1 MiB chunks so the whole archive is never held in memory.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    if chunk:
                        file.write(chunk)
        # Only a complete download gets the final name.
        os.replace(partial_name, file_name)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # the incomplete file.
        if os.path.isfile(partial_name):
            os.remove(partial_name)

In [3]:
# Fetch the two 2016 hourly archives used below (TEMP and parameter 42101).
for archive_url in (
    'https://aqs.epa.gov/aqsweb/airdata/hourly_TEMP_2016.zip',
    'https://aqs.epa.gov/aqsweb/airdata/hourly_42101_2016.zip',
):
    get_file_from_url(archive_url)

file hourly_TEMP_2016.zip already exists

file hourly_42101_2016.zip already exists



Loading Parameters:
+ `Qualifier` is auto-detected as `int`, but loading then fails because the column also contains strings, so it is read as `object`.
+ Dates and Times are loaded as objects so they can be concatenated to a single datetime.

### Temperature

In [4]:
# Columns that must be read as `object` rather than auto-detected:
# the qualifier plus the local/GMT date and time fields.
temp_dtypes = dict.fromkeys(
    ['Qualifier', 'Date Local', 'Time Local', 'Date GMT', 'Time GMT'],
    'object',
)
temp_df = pd.read_csv('hourly_TEMP_2016.zip', dtype=temp_dtypes)

Look at current data schema:

In [5]:
# Summarize the loaded temperature frame: row count, column dtypes and memory usage.
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7141394 entries, 0 to 7141393
Data columns (total 24 columns):
State Code             int64
County Code            int64
Site Num               int64
Parameter Code         int64
POC                    int64
Latitude               float64
Longitude              float64
Datum                  object
Parameter Name         object
Date Local             object
Time Local             object
Date GMT               object
Time GMT               object
Sample Measurement     float64
Units of Measure       object
MDL                    float64
Uncertainty            float64
Qualifier              object
Method Type            object
Method Code            int64
Method Name            object
State Name             object
County Name            object
Date of Last Change    object
dtypes: float64(5), int64(6), object(13)
memory usage: 1.3+ GB


Column groups that could go into their own tables:
+ Method Type and Method Code, Method Name
+ State Name and State Code
+ County Code and County Name
+ Site Num, Latitude, Longitude
+ Parameter Name, Parameter Code

Briefly check the fields to see which have multiple values and which have a single unique value:

In [6]:
# List the distinct values that occur in the units column.
temp_df['Units of Measure'].drop_duplicates()

0    Degrees Fahrenheit
Name: Units of Measure, dtype: object

The rows seem to have the same unit of measure.

In [7]:
# Distinct (site number, latitude, longitude) combinations, ordered by site number;
# show the first ten.
(
    temp_df[['Site Num', 'Latitude', 'Longitude']]
    .drop_duplicates()
    .sort_values('Site Num')
    .head(10)
)

Unnamed: 0,Site Num,Latitude,Longitude
6155548,1,43.466111,-88.621111
1992474,1,33.588545,-84.069608
690117,1,37.64571,-118.96652
1940948,1,33.582044,-82.131249
1932932,1,30.0925,-84.161111
3146392,1,42.22862,-83.2082
196968,1,38.20185,-120.680277
3884438,1,40.515262,-74.806671
3128917,1,46.288877,-85.950227
3291117,1,37.69,-94.035


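The same `Site Num` maps to many different latitude/longitude pairs, so `Site Num` alone does not identify a single location. The coordinates above span the whole country, which suggests site numbers are reused across states and counties rather than one site taking readings in several places.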

In [8]:
# List the distinct qualifier values (missing entries show up as NaN).
temp_df['Qualifier'].drop_duplicates()

0          NaN
247307      SX
258514       1
499046       2
2460823     IM
5761358      3
Name: Qualifier, dtype: object

In [9]:
# Count rows per parameter name to see how many different parameters the file holds.
temp_df['Parameter Name'].value_counts()

Outdoor Temperature    7141394
Name: Parameter Name, dtype: int64

In [10]:
# Distinct (method code, method type, method name) combinations in the data.
temp_df[['Method Code', 'Method Type', 'Method Name']].drop_duplicates()

Unnamed: 0,Method Code,Method Type,Method Name
0,61,Non-FRM,Instrumental - Met One 083D
16334,41,Non-FRM,INSTRUMENTAL - ELEC. OR MACH. AVG. LEVEL 1
42482,40,Non-FRM,INSTRUMENTAL - ELECTRONIC OR MACHINE AVG.
165206,59,Non-FRM,Instrumental - Vaisala HMP 155
545237,22,Non-FRM,INSTRUMENTAL - SPOT READING LEVEL 2
707645,50,Non-FRM,Instrumental - Visual average
742738,63,Non-FRM,Instrumental - Rotronic HC2-S3
867893,42,Non-FRM,INSTRUMENTAL - ELEC. OR MACH. AVG. LEVEL 2
1891699,60,Non-FRM,Instrumental - Vaisala 435C RH/AT Sensor
2012826,20,Non-FRM,INSTRUMENTAL - SPOT READING


In [11]:
# Distinct (state code, state name) pairs, highest codes first; show the top five.
(
    temp_df[['State Code', 'State Name']]
    .drop_duplicates()
    .sort_values('State Code', ascending=False)
    .head()
)

Unnamed: 0,State Code,State Name
7129359,80,Country Of Mexico
7121848,72,Puerto Rico
6300719,56,Wyoming
6133256,55,Wisconsin
6115908,54,West Virginia


### CO

In [12]:
# Columns that must be read as `object` rather than auto-detected:
# the qualifier plus the local/GMT date and time fields.
co_dtypes = dict.fromkeys(
    ['Qualifier', 'Date Local', 'Time Local', 'Date GMT', 'Time GMT'],
    'object',
)
co_df = pd.read_csv('hourly_42101_2016.zip', dtype=co_dtypes)

Look at current data schema:

In [13]:
# Summarize the loaded CO frame: row count, column dtypes and memory usage.
co_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2425038 entries, 0 to 2425037
Data columns (total 24 columns):
State Code             int64
County Code            int64
Site Num               int64
Parameter Code         int64
POC                    int64
Latitude               float64
Longitude              float64
Datum                  object
Parameter Name         object
Date Local             object
Time Local             object
Date GMT               object
Time GMT               object
Sample Measurement     float64
Units of Measure       object
MDL                    float64
Uncertainty            float64
Qualifier              object
Method Type            object
Method Code            int64
Method Name            object
State Name             object
County Name            object
Date of Last Change    object
dtypes: float64(5), int64(6), object(13)
memory usage: 444.0+ MB


As expected, the CO data schema looks the same.
There are fewer values for CO versus Temperature.

In [14]:
# List the distinct values that occur in the units column of the CO data.
co_df['Units of Measure'].drop_duplicates()

0    Parts per million
Name: Units of Measure, dtype: object

Similar to temperature, all rows seem to be the same unit of measure.

In [15]:
# Distinct (site number, latitude, longitude) combinations in the CO data,
# ordered by site number; show the first ten.
(
    co_df[['Site Num', 'Latitude', 'Longitude']]
    .drop_duplicates()
    .sort_values('Site Num')
    .head(10)
)

Unnamed: 0,Site Num,Latitude,Longitude
1019455,1,30.0925,-84.161111
1944960,1,39.92002,-77.30968
2367839,1,43.466111,-88.621111
415367,1,37.97231,-122.520004
1343132,1,42.22862,-83.2082
553211,1,34.89398,-117.024804
520827,2,38.71209,-121.38109
267048,2,37.360684,-118.330783
292141,2,34.1365,-117.92391
2221549,2,40.253611,-111.663056


Similar to temperature, the same `Site Num` maps to several different latitude/longitude pairs, so `Site Num` alone does not identify a single measurement location.

### Combining the Data

After attempting to join/merge the data sources, it became obvious that a bigger bucket for aggregation was needed because of multiple listings, multiple latitude/longitude pairs per site, or more sites for some readings and not others. Therefore the data will be grouped by state and day (local), then statistics of high, low, average, count, and standard deviation will be generated.

Checking the potential list of tables for the schema from before:
+ ~~Method Type and Method Code, Method Name~~ if grouped, the measurement methods aren't all the same
+ State Name and State Code
+ ~~County Code and County Name~~ if grouped by state, the counties won't be the same
+ ~~Site Num, Latitude, Longitude~~ if grouped, the sites and lat,long data won't be the same
+ ~~Parameter Name, Parameter Code~~ this will be a column in the fact table.

Unfortunately, the grouping eliminates most of the potential for splitting the data into many tables. The only real option is to build some summary-statistic tables and then some pivot tables for time-series trends, which was the ultimate goal anyway.

In [16]:
# check column names: list every column label of the temperature frame
# before deciding which ones to keep for the aggregation
temp_df.columns.to_list()

['State Code',
 'County Code',
 'Site Num',
 'Parameter Code',
 'POC',
 'Latitude',
 'Longitude',
 'Datum',
 'Parameter Name',
 'Date Local',
 'Time Local',
 'Date GMT',
 'Time GMT',
 'Sample Measurement',
 'Units of Measure',
 'MDL',
 'Uncertainty',
 'Qualifier',
 'Method Type',
 'Method Code',
 'Method Name',
 'State Name',
 'County Name',
 'Date of Last Change']

Since aggregation is by state and day, all that's needed is 'Date Local', 'State Name', and 'Sample Measurement'.

In [17]:
# drop all columns but 'State Name', 'Date Local', 'Sample Measurement'
# and rename them to one word for easier reference.
# The rename is chained (no `inplace=True`): mutating a frame that was just sliced
# out of another one is the pattern that can raise pandas' SettingWithCopyWarning.
# Note: this overwrites `temp_df`, so re-running the cell requires re-loading the
# CSV first (the original column names no longer exist afterwards).
temp_df = (
    temp_df[['State Name', 'Date Local', 'Sample Measurement']]
    .rename(columns={"State Name": "state",
                     "Date Local": "date",
                     "Sample Measurement": "temperature"})
)

In [18]:
# group by and aggregate: per-state, per-day summary statistics of the readings
# note: the aggregate function `agg` ended up being much faster than the `.describe()` method.
# The aggregations are given by name so the result columns are labelled exactly
# "count", "min", "max", "mean" and "std". Passing `np.min` / `np.max` instead
# produces labels taken from the function names ("amin"/"amax" or "min"/"max",
# depending on the installed numpy/pandas versions) and needs a follow-up rename.
# The group keys become regular columns through a single `reset_index()`.
temp_data_df = (
    temp_df.groupby(['state', 'date'])['temperature']
           .agg(['count', 'min', 'max', 'mean', 'std'])
           .reset_index()
           .astype({"count": "int64"})
)

In [19]:
# Preview the first rows of the per-state, per-day summary table.
temp_data_df.head()

Unnamed: 0,state,date,count,min,max,mean,std
0,Alabama,2016-01-01,48,35.4,43.5,38.925,2.270299
1,Alabama,2016-01-02,48,32.9,46.4,38.339583,4.060539
2,Alabama,2016-01-03,48,32.0,50.5,40.785417,6.023818
3,Alabama,2016-01-04,48,30.2,43.0,36.147917,3.5764
4,Alabama,2016-01-05,48,29.5,47.7,37.666667,5.95327


Now comes the implementation:
+ downloading data
+ loading into pandas
+ generating summary statistics
+ uploading to tables
+ generating pivot tables

In [21]:
# clean up test files
# Files that are already gone are skipped, so the cell can be re-run without
# raising FileNotFoundError.
for archive_name in ('hourly_TEMP_2016.zip', 'hourly_42101_2016.zip'):
    if os.path.isfile(archive_name):
        os.remove(archive_name)