In [1]:
import pandas as pd
import altair as alt

In [2]:
# Identifier embedded in the hourly file name
# (presumably an EPA AQS parameter code — confirm against the data source).
data_set = "42602"

# Low-cardinality identifier columns are loaded as categoricals to save memory
# on a multi-million-row file.
dtypes = {
    "State Name": "category",
    "County Name": "category",
    "Site Num": "category"
}

# Load only the columns needed, then build the timestamp index explicitly.
# Combining columns through `parse_dates={"tstamp": [...]}` inside read_csv is
# deprecated in recent pandas releases, so the date and time strings are joined
# and parsed with pd.to_datetime instead. The two source columns are dropped
# afterwards, which matches what the nested parse_dates form used to produce.
df = (
    pd.read_csv(
        f"./data/AirDataEPA/hourly_{data_set}_2018.csv",
        dtype=dtypes,
        usecols=list(dtypes) + ["Sample Measurement", "Date Local", "Time Local"],
    )
    .assign(
        tstamp=lambda raw: pd.to_datetime(
            raw["Date Local"] + " " + raw["Time Local"]
        )
    )
    .drop(columns=["Date Local", "Time Local"])
    .rename(
        columns={
            "State Name": "state",
            "County Name": "county",
            "Site Num": "site",
            "Sample Measurement": "measure",
        }
    )
    .set_index("tstamp")
)

#### Adding a convenience column with county + site

In [4]:
# Build a "<county> <site>" label for every row by joining the two categorical
# columns as plain strings.
# NOTE(review): county names may repeat across states, so this label might not
# identify a single monitor uniquely — confirm whether `state` should be included.
county_text = df['county'].astype('str')
site_text = df['site'].astype('str')
df['county_site'] = county_text + " " + site_text
df.head()

Unnamed: 0_level_0,site,measure,state,county,county_site
tstamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00,23,2.4,Alabama,Jefferson,Jefferson 0023
2018-01-01 01:00:00,23,2.3,Alabama,Jefferson,Jefferson 0023
2018-01-01 02:00:00,23,2.2,Alabama,Jefferson,Jefferson 0023
2018-01-01 03:00:00,23,2.7,Alabama,Jefferson,Jefferson 0023
2018-01-01 04:00:00,23,2.4,Alabama,Jefferson,Jefferson 0023


#### 3.5 million rows

In [14]:
# Show the (rows, columns) size of the full hourly frame.
df.shape

(3531277, 5)

#### Data sets with more than 5000 rows must be kept out of the HTML that Altair generates

See solutions to plotting large data sets: https://altair-viz.github.io/user_guide/faq.html#maxrowserror-how-can-i-plot-large-datasets

Here I'm using the data_server solution: https://pypi.org/project/altair-data-server/

```
pip install altair_data_server
```

In [5]:
# Switch Altair to the 'data_server' transformer (provided by the
# altair_data_server package installed above) so that chart data is kept out of
# the HTML Altair generates for the notebook.
alt.data_transformers.enable('data_server')

DataTransformerRegistry.enable('data_server')

In [17]:
# Mean measurement per county/site label, flattened back to two ordinary columns.
grp = (
    df.groupby('county_site')['measure']
    .mean()
    .reset_index()
)
print(grp.shape)
grp.head()

(457, 2)


Unnamed: 0,county_site,measure
0,Adams 0001,1.12418
1,Adams 3001,15.664392
2,Alameda 0007,8.866343
3,Alameda 0009,9.986318
4,Alameda 0011,12.317494


In [24]:
# Altair reads columns, not the index, so expose `tstamp` as a regular column.
df_reset = df.reset_index()

# Pre-aggregated frame: one row per county/site label holding its mean measurement.
site_means = df.groupby('county_site')['measure'].mean()
grp_reset = site_means.reset_index()

### Almost immediate rendering when the mean is pre-computed in pandas

In [29]:
# Bar chart of the pre-aggregated frame: one bar per county/site label,
# bar length taken from the mean already computed in pandas.
(
    alt.Chart(grp_reset)
    .mark_bar()
    .encode(
        x=alt.X('measure:Q'),
        y=alt.Y('county_site:N'),
    )
)

### Takes about 50 seconds on my desktop machine when the chart computes the mean from the full data set

In [28]:
# Same bar chart, but fed the full un-aggregated frame; the mean is requested
# through the encoding shorthand instead of being computed in pandas first.
(
    alt.Chart(df_reset)
    .mark_bar()
    .encode(
        x=alt.X('mean(measure):Q'),
        y=alt.Y('county_site:N'),
    )
)