Goal:

Narrow down to only those sites that are valuable from an analysis perspective.

# Site selection

Sites with 4 or more years of complete data are picked. 

In [1]:
from os.path import join, basename, splitext
from glob import glob
from dask import dataframe as dd
from matplotlib import rcParams
import pandas as pd
import dask
from collections import Counter
import pickle


from deep_aqi import ROOT


# Use fully-qualified option names: the bare 'max_columns' / 'max_rows'
# patterns also match the 'styler.render.*' options in newer pandas, which
# makes set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 25)

In [2]:
def site_code_test(table):
    """Check that every SiteCode in ``table`` contains a literal ``"."``.

    When the files were loaded for the first time, integers were assumed to
    have missing values and were transformed to float. This verifies that
    the resulting float-style suffix is present in every SiteCode; an
    inconsistency would make it hard to look for the same sites across many
    files.

    Parameters
    ----------
    table
        Object exposing a string ``SiteCode`` column via attribute access.

    Raises
    ------
    ValueError
        If at least one SiteCode does not contain ".".
    """
    # regex=False is essential: with the default regex matching, '.' matches
    # any character, so the check would pass for every non-empty string.
    if not table.SiteCode.str.contains('.', regex=False).all():
        raise ValueError('Not all SiteCodes have "." contained inside!')

In [3]:
def check_coverage(parameter, minimum_year_count=4):
    """Return the SiteCodes that occur in enough interim files of ``parameter``.

    Every ``*.parquet`` file directly under ``INTERIM_DATA`` whose file name
    contains ``parameter`` is read, its SiteCodes are validated with
    ``site_code_test`` and its distinct SiteCodes are counted once.

    Parameters
    ----------
    parameter : str
        Substring identifying the parameter in the file name
        (e.g. ``'WIND'`` or ``'44201'``).
    minimum_year_count : int, optional
        Minimum number of files a site must appear in to be kept.
        Presumably one file corresponds to one year of data -- verify
        against the interim file layout.

    Returns
    -------
    set
        SiteCodes found in at least ``minimum_year_count`` matching files.

    Raises
    ------
    ValueError
        Propagated from ``site_code_test`` when a file holds a malformed
        SiteCode.
    """
    paths = glob(f'{INTERIM_DATA}/*.parquet', recursive=True)
    # Match on the file name only, so a directory in the path that happens
    # to contain the parameter string cannot pull in unrelated files.
    paths = [path for path in paths if parameter in basename(path)]

    files_per_site = Counter()
    for path in paths:
        df = dd.read_parquet(path)
        site_code_test(df)
        # unique() makes each file contribute at most one count per site.
        files_per_site.update(df.SiteCode.unique().compute())
    return {
        site
        for site, file_count in files_per_site.items()
        if file_count >= minimum_year_count
    }

In [4]:
# Directory whose *.parquet files are globbed by check_coverage and the cell below.
INTERIM_DATA = join(ROOT, 'data', 'interim')

In [5]:
files = glob(f'{INTERIM_DATA}/*.parquet', recursive=True)

# Inventory of the interim files, displayed as a pair of Counters:
#   1. by the last '_'-separated token of the file name (extension removed),
#   2. by the second '_'-separated token of the file name.
(
    Counter(basename(path).split('_')[-1].split('.')[0] for path in files),
    Counter(basename(path).split('_')[1] for path in files),
)

(Counter({'2014': 12,
          '2010': 12,
          '2016': 12,
          '2017': 12,
          '2012': 12,
          '2015': 12,
          '2011': 12,
          '2013': 12}),
 Counter({'WIND': 8,
          '44201': 8,
          'PRESS': 8,
          '88502': 8,
          '81102': 8,
          '42602': 8,
          'TEMP': 8,
          '42401': 8,
          '42101': 8,
          'RH': 8,
          'SPEC': 8,
          '88101': 8}))

In [6]:
# Sites with sufficient coverage, computed per parameter. The calls run in
# the order the names are listed, one check_coverage call per parameter.

# Weather parameters.
wind_sites, temp_sites, press_sites, rhdp_sites = (
    check_coverage(name) for name in ('WIND', 'TEMP', 'PRESS', 'RH_DP')
)

# Ozone, sulfur, carbon and nitro parameters, identified by numeric code.
ozone_sites, sulfur_sites, carbon_sites, nitro_sites = (
    check_coverage(code) for code in ('44201', '42401', '42101', '42602')
)

# PM2.5 FRM, PM10, PM2.5 and SPEC parameters.
pm25frm_sites, pm10_sites, pm25_sites, spec_sites = (
    check_coverage(code) for code in ('88101', '81102', '88502', 'SPEC')
)

In [7]:
# Keep only the sites covered by all four weather parameters.
weather_sites = wind_sites.intersection(temp_sites, press_sites, rhdp_sites)

In [8]:
# Sites in both weather_sites and ozone_sites.
ozone = weather_sites & ozone_sites
len(ozone)

58

In [9]:
# Sites in both weather_sites and sulfur_sites.
sulfur = weather_sites & sulfur_sites
len(sulfur)

44

In [10]:
# Sites in both weather_sites and carbon_sites.
carbon = weather_sites & carbon_sites
len(carbon)

41

In [11]:
# Sites in both weather_sites and nitro_sites.
nitro = weather_sites & nitro_sites
len(nitro)

41

In [12]:
# Sites in both weather_sites and pm25frm_sites.
pm25frm = weather_sites & pm25frm_sites
len(pm25frm)

39

In [13]:
# Sites in both weather_sites and pm10_sites.
pm10 = weather_sites & pm10_sites
len(pm10)

28

In [14]:
# Sites in both weather_sites and pm25_sites.
pm25 = weather_sites & pm25_sites
len(pm25)

22

In [15]:
# Sites in both weather_sites and spec_sites.
spec = weather_sites & spec_sites
len(spec)

3

In [16]:
# A site is available when it qualifies for at least one of the per-parameter sets.
available_sites = ozone.union(
    sulfur, carbon, nitro, pm25frm, pm10, pm25, spec,
)

In [17]:
# Number of distinct sites in the union of all per-parameter selections.
len(available_sites)

100

In [11]:
# with open('available_sites.p', 'wb') as file:
#     pickle.dump(available_sites, file)

In [18]:
# Display the SiteCodes selected for ozone.
ozone

{'Alabama_Jefferson_23.0',
 'California_Fresno_5001.0',
 'California_Kern_6001.0',
 'California_Sacramento_11.0',
 'California_Santa Barbara_2011.0',
 'Colorado_Rio Blanco_5.0',
 'Colorado_Rio Blanco_6.0',
 'District Of Columbia_District of Columbia_43.0',
 'Indiana_Marion_78.0',
 'Iowa_Scott_15.0',
 'Kentucky_Edmonson_501.0',
 'Louisiana_East Baton Rouge_13.0',
 'Louisiana_East Baton Rouge_9.0',
 'Maryland_Baltimore_3001.0',
 'Maryland_Dorchester_4.0',
 'Maryland_Garrett_2.0',
 "Maryland_Prince George's_30.0",
 'Massachusetts_Suffolk_42.0',
 'Michigan_Kent_20.0',
 'Michigan_Wayne_1.0',
 'Missouri_St. Louis City_85.0',
 'Nebraska_Douglas_19.0',
 'Nevada_Clark_2002.0',
 'Nevada_Clark_540.0',
 'Nevada_Clark_75.0',
 'New Hampshire_Hillsborough_5001.0',
 'New Hampshire_Rockingham_18.0',
 'New Jersey_Essex_3.0',
 'New Mexico_Bernalillo_23.0',
 'New York_Chautauqua_6.0',
 'New York_Herkimer_5.0',
 'New York_Monroe_1007.0',
 'North Carolina_Mecklenburg_41.0',
 'North Dakota_Burke_4.0',
 'Nort