In [235]:
import pandas as pd
import altair as alt

In [257]:
# Heathrow observations from the uk-historical-weather archive (one row per month).
url = "https://github.com/alanjones2/uk-historical-weather/raw/main/archive/Heathrow-to-2023.csv"

# Load only the columns used below. 'Date' is listed once (it was previously
# duplicated in `usecols`) and parsed to datetime so that `.dt` accessors and
# Altair time-unit shorthands such as `yearmonth(Date)` work.
weather = pd.read_csv(
    url,
    usecols=['Date', 'Month', 'Year', 'Tmax', 'Tmin', 'Tmean', 'Rain', 'Sun'],
    parse_dates=['Date']
)

In [258]:
weather

Unnamed: 0,Year,Month,Tmax,Tmin,Rain,Sun,Date,Tmean
0,1948,1,8.9,3.3,85.0,,1948-01-01,6.10
1,1948,2,7.9,2.2,26.0,,1948-02-01,5.05
2,1948,3,14.2,3.8,14.0,,1948-03-01,9.00
3,1948,4,15.4,5.1,35.0,,1948-04-01,10.25
4,1948,5,18.1,6.9,57.0,,1948-05-01,12.50
...,...,...,...,...,...,...,...,...
908,2023,9,24.4,14.7,59.4,167.9,2023-09-01,19.55
909,2023,10,18.1,10.2,116.8,112.9,2023-10-01,14.15
910,2023,11,11.8,5.4,78.4,74.9,2023-11-01,8.60
911,2023,12,10.9,5.9,76.4,21.0,2023-12-01,8.40


In [259]:
# Subset used by the introductory charts: rows whose Date is in calendar year 2022.
weather_2022 = weather.loc[lambda df: df['Date'].dt.year == 2022]

## Basic plots

An Altair chart begins by creating an `alt.Chart` object initialized with the data. Technically, all it needs after that is a mark type. But without an encoding, there's not much to see.

In [260]:
# Data plus a mark type only -- no encodings have been attached yet.
alt.Chart(weather_2022).mark_point()

Encoding columns to visual channels (like the x and y axes) creates a chart

In [261]:
# Point marks for the 2022 rows: Month on the x axis, Tmax on the y axis.
alt.Chart(weather_2022).mark_point().encode(x='Month', y='Tmax')

In [262]:
# Same encodings as the point chart, drawn with a line mark instead.
alt.Chart(weather_2022).mark_line().encode(x='Month', y='Tmax')

In [12]:
# Scatter over the full weather frame: Tmin against Tmax.
alt.Chart(weather).mark_point().encode(x='Tmin', y='Tmax')

In [None]:
# Scatter over the full weather frame: Sun against Rain.
alt.Chart(weather).mark_point().encode(x='Sun', y='Rain')

In [268]:
# Scatter over the full weather frame: Sun against Tmean.
alt.Chart(weather).mark_point().encode(x='Sun', y='Tmean')

In [269]:
# Bar per Month for 2022, with bar height taken from Rain.
alt.Chart(weather_2022).mark_bar().encode(x='Month', y='Rain')

## Date time summarization

Date-time truncation specifications can be wrapped around column specifications for neater charts or for aggregation:

In [270]:
# Same bars, but x is the Date column truncated to year+month by Altair.
alt.Chart(weather_2022).mark_bar().encode(x='yearmonth(Date)', y='Rain')

(changing the data type from time to ordinal with `:O` creates a discrete legend)

In [280]:
# Rows with Year > 2014; x is the month of Date, and the year of Date is
# encoded as an ordinal (`:O`) color channel.
(
    alt.Chart(weather.loc[lambda df: df['Year'] > 2014])
    .mark_line()
    .encode(x='month(Date)', y='Tmax', color='year(Date):O')
)

### Combine and layer

Multiple permutations of charts can be created (starting from the same base for shared components), then faceted or layered.

In [14]:
# Single self-contained chart: Tmax over year+month for the 2022 rows.
alt.Chart(weather_2022).mark_line().encode(x='yearmonth(Date)', y='Tmax')

Start with a base chart and add marks and encodings:

In [15]:
# Shared foundation: the 2022 data, a line mark, and the x encoding.
base = alt.Chart(weather_2022).mark_line().encode(x='yearmonth(Date)')

# Tmax builds on the base, re-specifying the mark so the line is orange.
tmax = base.mark_line(color='orange').encode(y='Tmax')

# Tmin builds on the base and keeps its line mark unchanged.
tmin = base.encode(y='Tmin')

In [16]:
tmin

Pair side-by-side with `|`

In [17]:
tmax | tmin

Match the scales with `resolve_scale()`

In [18]:
(tmax | tmin).resolve_scale(y='shared')

Layer charts with `+`

In [19]:
tmax + tmin

In [20]:
# Disabled alternative to `tmax + tmin`: builds the Tmin/Tmax layers from one
# chart via `.repeat(layer=[...])`. Kept commented out; it is not executed.
# (
#     alt.Chart(weather_2022)
#     .mark_line()
#     .encode(
#         x='yearmonth(Date)',
#         y=alt.Y(alt.repeat('layer'), type='quantitative'),
#         color=alt.ColorDatum(alt.repeat('layer'))
#     )
#     .repeat(layer=["Tmin", "Tmax"])    
# )

Specifying a column facets (repeats) the chart for each value in that column

In [None]:
# The `column` channel splits the chart by month of Date; within each panel
# Tmax is drawn against the year of Date.
(
    alt.Chart(weather)
    .mark_line()
    .encode(x='year(Date)', y='Tmax', column='month(Date)')
)

### Aggregate

Aggregation functions can be wrapped around the column specification to summarize data in each group:

In [21]:
# Altair computes the mean of Tmax for each month of Date.
alt.Chart(weather).mark_line().encode(x='month(Date)', y='mean(Tmax)')

In [282]:
# Altair computes the maximum of Rain for each year of Date.
alt.Chart(weather).mark_bar().encode(x='year(Date)', y='max(Rain)')

#### Smoothed lines

The `interpolate` option smooths lines using one of several interpolation methods.

In [26]:
# Baseline for the smoothing comparison: rows with Year > 2018, default line mark.
(
    alt.Chart(weather.loc[lambda df: df['Year'] > 2018])
    .mark_line()
    .encode(x='yearmonth(Date)', y='Tmax')
    .properties(width=500)
)

In [286]:
# Same data and encodings as the baseline, with interpolate='natural' on the line mark.
(
    alt.Chart(weather.loc[lambda df: df['Year'] > 2018])
    .mark_line(interpolate='natural')
    .encode(x='yearmonth(Date)', y='Tmax')
    .properties(width=500)
)

In [287]:
# Same data and encodings as the baseline, with interpolate='monotone' on the line mark.
(
    alt.Chart(weather.loc[lambda df: df['Year'] > 2018])
    .mark_line(interpolate='monotone')
    .encode(x='yearmonth(Date)', y='Tmax')
    .properties(width=500)
)

### Melt to long

Altair needs data in *long* form (also called *tidy* form), where each row is a single observation and columns specify the categories. Use `melt()` to rearrange wide-form data to long. (If aggregating, applying a `groupby()` then a `reset_index()` also yields long-form data.)

In [289]:
# Wide -> long: one row per (Date, variable), where `variable` is 'Tmin' or
# 'Tmax' and the reading goes into `temperature`. `melt` only touches the
# columns named in id_vars/value_vars, so no column pre-selection is needed.
weather_long = weather.melt(
    id_vars=['Date'],
    value_vars=['Tmin', 'Tmax'],
    value_name='temperature',
)

weather_long

Unnamed: 0,Date,variable,temperature
0,1948-01-01,Tmin,3.3
1,1948-02-01,Tmin,2.2
2,1948-03-01,Tmin,3.8
3,1948-04-01,Tmin,5.1
4,1948-05-01,Tmin,6.9
...,...,...,...
1821,2023-09-01,Tmax,24.4
1822,2023-10-01,Tmax,18.1
1823,2023-11-01,Tmax,11.8
1824,2023-12-01,Tmax,10.9


In [292]:
# One line per `variable` value: mean temperature for each (ordinal) month of Date.
(
    alt.Chart(weather_long)
    .mark_line()
    .encode(x='month(Date):O', y='mean(temperature)', color='variable')
)

In [293]:
# Bar version of the previous chart; `xOffset` places the bars for each
# `variable` side by side within a month.
(
    alt.Chart(weather_long)
    .mark_bar()
    .encode(
        x='month(Date):O',
        y='mean(temperature)',
        color='variable',
        xOffset='variable',
    )
)

## Open Streets

In [294]:
# Local CSV (path is relative to the notebook's working directory);
# `report_date` is parsed to datetime on load.
open_streets = pd.read_csv('../Data/Source Data/open_streets.csv', parse_dates=['report_date'])

## Basic plots

In [296]:
# Altair counts the `open_street` values at each report_date.
alt.Chart(open_streets).mark_bar().encode(x='report_date', y='count(open_street)')

In [None]:
# Rows for a single street, selected by exact match on the `open_street` value.
open_street_5_ave_park_slope = open_streets.loc[
    lambda df: df['open_street'].eq('5th Avenue - Park Slope')
]

In [None]:
# length_mi of the single selected street at each report_date.
alt.Chart(open_street_5_ave_park_slope).mark_line().encode(x='report_date', y='length_mi')

In [None]:
# total_weekly_hours of the single selected street at each report_date.
alt.Chart(open_street_5_ave_park_slope).mark_line().encode(x='report_date', y='total_weekly_hours')

In [297]:
# Distinct `open_street` values per report_date, computed in pandas and
# then handed to Altair as a two-column frame.
(
    alt.Chart(
        open_streets
        .groupby('report_date')['open_street']
        .nunique()
        .reset_index(name='count_open_streets')
    )
    .mark_line()
    .encode(x='report_date', y='count_open_streets')
)

In [None]:
# Every row of open_streets as a point: hours against length, colored by borough.
(
    alt.Chart(open_streets)
    .mark_point()
    .encode(x='total_weekly_hours', y='length_mi', color='borough')
)

In [301]:
# Mean total_weekly_hours for each category, aggregated in pandas before charting.
(
    alt.Chart(
        open_streets
        .groupby('category')['total_weekly_hours']
        .mean()
        .reset_index(name='avg_total_weekly_hours')
    )
    .mark_bar()
    .encode(x='category', y='avg_total_weekly_hours')
)

In [302]:
# Mean total_weekly_hours for each sponsor_type, aggregated in pandas before charting.
(
    alt.Chart(
        open_streets
        .groupby('sponsor_type')['total_weekly_hours']
        .mean()
        .reset_index(name='avg_total_weekly_hours')
    )
    .mark_bar()
    .encode(x='sponsor_type', y='avg_total_weekly_hours')
)

Add additional parameters to an encoding (like sorting values along the x-axis) by wrapping the specification in an `alt.X()` and adding additional arguments, like `sort=` 

In [303]:
# Same aggregation as the previous cell; wrapping x in alt.X() allows
# sort='-y', which orders the bars by descending y value.
(
    alt.Chart(
        open_streets
        .groupby('sponsor_type')['total_weekly_hours']
        .mean()
        .reset_index(name='avg_total_weekly_hours')
    )
    .mark_bar()
    .encode(
        x=alt.X('sponsor_type', sort='-y'),
        y='avg_total_weekly_hours',
    )
)

or change the axis text with `title=`, or turn on gridlines with `axis=alt.Axis(grid=True)`

There are many, many options that can be passed to [`alt.Scale()`](https://altair-viz.github.io/user_guide/generated/core/altair.Scale.html) and [`alt.Axis()`](https://altair-viz.github.io/user_guide/generated/core/altair.Axis.html) to customize the view.

In [309]:
# Sorted bars again, now with gridlines enabled on the x axis and a custom
# title on the y axis.
(
    alt.Chart(
        open_streets
        .groupby('sponsor_type')['total_weekly_hours']
        .mean()
        .reset_index(name='avg_total_weekly_hours')
    )
    .mark_bar()
    .encode(
        x=alt.X('sponsor_type', sort='-y', axis=alt.Axis(grid=True)),
        y=alt.Y('avg_total_weekly_hours', title='Weekly hours'),
    )
)

In [310]:
# Mean total_weekly_hours per (report_date, borough); one line per borough.
(
    alt.Chart(
        open_streets
        .groupby(['report_date', 'borough'])['total_weekly_hours']
        .mean()
        .reset_index(name='avg_total_weekly_hours')
    )
    .mark_line()
    .encode(x='report_date', y='avg_total_weekly_hours', color='borough')
)

In [313]:
# Sum of length_mi per (report_date, borough), drawn as bars colored by
# borough over an ordinal year+month axis.
(
    alt.Chart(
        open_streets
        .groupby(['report_date', 'borough'])['length_mi']
        .sum()
        .reset_index(name='total_miles')
    )
    .mark_bar()
    .encode(
        x='yearmonth(report_date):O',
        y='total_miles',
        color='borough',
        # strokeDash='borough'
    )
)

In [314]:
# pd.Grouper buckets report_date with freq='YE'; the mean length_mi per
# (bucket, borough) is then drawn as bars offset by borough.
(
    alt.Chart(
        open_streets
        .groupby([pd.Grouper(key='report_date', freq='YE'), 'borough'])['length_mi']
        .mean()
        .reset_index(name='avg_miles')
    )
    .mark_bar()
    .encode(
        x='year(report_date)',
        y='avg_miles',
        color='borough',
        xOffset='borough',
    )
)

In [315]:
# Distinct open_street values per (sponsor_type, borough), as bars colored by borough.
(
    alt.Chart(
        open_streets
        .groupby(['sponsor_type', 'borough'])['open_street']
        .nunique()
        .reset_index(name='count_open_streets')
    )
    .mark_bar()
    .encode(
        x='sponsor_type',
        y='count_open_streets',
        color='borough',
        # xOffset='borough',
        # column='borough'
    )
)

In [20]:
open_streets

Unnamed: 0,report_date,open_street,sponsor,borough,category,total_weekly_hours
0,2024-05-06,Beaumont Avenue,P.S. 32 The Belmont School,Bronx,Full Closure: Schools,9.75
1,2024-05-06,Decatur Avenue,Decatur Block Association,Bronx,Full Closure,19.50
2,2024-05-06,East 150th Street,PS/MS 5 Port Morris School of Community Leader...,Bronx,Full Closure: Schools,40.00
3,2024-05-06,East 212th Street,Uptown & Boogie Healthy Project,Bronx,Full Closure,6.00
4,2024-05-06,Evelyn Place,I AM MY COMMUNITY INC,Bronx,Full Closure,91.00
...,...,...,...,...,...,...
10769,2024-06-04,Deisius Street,31R005,Staten Island,Full Closure: Schools,37.50
10770,2024-06-04,Minthorne Street,Staten Island Chamber of Commerce Foundation I...,Staten Island,Full Closure,21.00
10771,2024-06-04,Suffolk Avenue,Council of Jewish Organizations of Staten Island,Staten Island,Full Closure,31.00
10772,2024-06-04,Suffolk Avenue,Council of Jewish Organizations of Staten Island,Staten Island,Full Closure,31.00


### Interactive charts



In [316]:
# One point per open_street: the per-column maximum of hours, length and
# borough within each street's rows (each column's max is taken independently).
(
    alt.Chart(
        open_streets
        .groupby('open_street', as_index=False)
        [['total_weekly_hours', 'length_mi', 'borough']]
        .max()
    )
    .mark_point()
    .encode(x='total_weekly_hours', y='length_mi', color='borough')
)

In [100]:
# Same per-street scatter with `sponsor` carried along, a tooltip listing
# the fields for each point, and `.interactive()` applied to the chart.
(
    alt.Chart(
        open_streets
        .groupby('open_street', as_index=False)
        [['total_weekly_hours', 'length_mi', 'borough', 'sponsor']]
        .max()
    )
    .mark_point()
    .encode(
        x='total_weekly_hours',
        y='length_mi',
        color='borough',
        tooltip=['open_street', 'sponsor', 'borough', 'length_mi', 'total_weekly_hours'],
    )
    .interactive()
)

### Pivot to wide and copy to send to Datawrapper

Use `groupby()` then `unstack()` to pivot to wide. (`pivot()` will work also, if you are not grouping and aggregating)

In [105]:
# Mean length_mi per (report_date bucketed with freq='YE', borough), pivoted so
# each borough becomes a column, then copied to the system clipboard.
(
    open_streets
    .groupby([pd.Grouper(key='report_date', freq='YE'), 'borough'])['length_mi']
    .mean()
    .rename('avg_miles')
    .unstack('borough')
    .to_clipboard()
)

In [221]:
# Long-form table: distinct open_street values per (report_date, borough).
count_open_streets_by_quarter_by_borough = (
    open_streets
    .groupby(['report_date', 'borough'])['open_street']
    .nunique()
    .reset_index(name='count_open_streets')
)

count_open_streets_by_quarter_by_borough

Unnamed: 0,report_date,borough,count_open_streets
0,2020-07-02,Bronx,18
1,2020-07-02,Brooklyn,68
2,2020-07-02,Manhattan,49
3,2020-07-02,Queens,37
4,2020-07-02,Staten Island,4
...,...,...,...
85,2024-10-16,Bronx,22
86,2024-10-16,Brooklyn,45
87,2024-10-16,Manhattan,63
88,2024-10-16,Queens,32


In [232]:
# One line per borough from the pre-aggregated counts.
(
    alt.Chart(count_open_streets_by_quarter_by_borough)
    .mark_line()
    .encode(x='report_date', y='count_open_streets', color='borough')
)

Change this to a bar chart by changing just one word: `mark_line` to `mark_bar`

In [226]:
# Identical encodings to the line chart; only the mark is switched to bars.
(
    alt.Chart(count_open_streets_by_quarter_by_borough)
    .mark_bar()
    .encode(x='report_date', y='count_open_streets', color='borough')
)

We can make the bars look a little nicer by summarizing them with `yearmonth()` and changing their data type to ordinal with `:O`

In [228]:
# Bars again, with report_date truncated to year+month and treated as ordinal.
(
    alt.Chart(count_open_streets_by_quarter_by_borough)
    .mark_bar()
    .encode(x='yearmonth(report_date):O', y='count_open_streets', color='borough')
)

Making the chart interactive and adding a tooltip makes it possible to better investigate the outliers:

In [233]:
# Every row of open_streets as a point, with a tooltip listing the fields
# for each point and `.interactive()` applied to the chart.
(
    alt.Chart(open_streets)
    .mark_point()
    .encode(
        x='total_weekly_hours',
        y='length_mi',
        color='borough',
        tooltip=['open_street', 'sponsor', 'borough', 'length_mi', 'total_weekly_hours'],
    )
    .interactive()
)

## Summarize and aggregate within Altair

Option 1: group and summarize the data before piping to altair:

In [154]:
# Option 1: aggregate in pandas (mean total_weekly_hours per report_date and
# borough), then chart the resulting long-form frame.
# Reads from `open_streets`: no `open_streets_quarterly` frame is created
# anywhere in this notebook, so that name raises NameError on a fresh kernel.
(
    open_streets
    .groupby(['report_date','borough'])
    ['total_weekly_hours']
    .mean()
    .rename('avg_total_weekly_hours')
    .reset_index()
    .pipe(alt.Chart)
    .mark_line()
    .encode(
        x='report_date',
        y='avg_total_weekly_hours',
        color='borough',
    )
)

Option 2: use summarizer wrappers around variable names:

In [None]:
# Option 2: pass the raw rows and let Altair aggregate via the `mean(...)`
# wrapper, grouped by year+month of report_date and by borough.
# Reads from `open_streets`: no `open_streets_quarterly` frame is created
# anywhere in this notebook, so that name raises NameError on a fresh kernel.
(
    alt.Chart(open_streets)
    .mark_line()
    .encode(
        x='yearmonth(report_date)',
        y='mean(total_weekly_hours)',
        color='borough'
    )
)

Option 1:

In [None]:
# Option 1: sum length_mi per report_date and borough in pandas, then chart.
# Reads from `open_streets`: no `open_streets_quarterly` frame is created
# anywhere in this notebook, so that name raises NameError on a fresh kernel.
(
    open_streets
    .groupby(['report_date','borough'])
    ['length_mi']
    .sum()
    .rename('total_length')
    .reset_index()
    .pipe(alt.Chart)
    .mark_line()
    .encode(
        x='report_date',
        y='total_length',
        color='borough',
    )
)

Option 2:

In [None]:
# Option 2: pass the raw rows and let Altair sum length_mi per report_date
# and borough via the `sum(...)` wrapper.
# Reads from `open_streets`: no `open_streets_quarterly` frame is created
# anywhere in this notebook, so that name raises NameError on a fresh kernel.
(
    alt.Chart(open_streets)
    .mark_line()
    .encode(
        x='report_date',
        y='sum(length_mi)',
        color='borough',
    )
)

### Stacked vs offset bar plot

In [161]:
# Stacked form: a single bar position per (ordinal) year, with the summed
# length_mi split by borough through the color channel only.
# Reads from `open_streets`: no `open_streets_quarterly` frame is created
# anywhere in this notebook, so that name raises NameError on a fresh kernel.
(
    alt.Chart(open_streets)
    .mark_bar()
    .encode(
        x='year(report_date):O',
        y='sum(length_mi)',
        color='borough'
    )
)

In [158]:
# Offset form: `xOffset='borough'` places each borough's bar side by side
# within a year instead of stacking; the height is the mean length_mi.
# Reads from `open_streets`: no `open_streets_quarterly` frame is created
# anywhere in this notebook, so that name raises NameError on a fresh kernel.
(
    alt.Chart(open_streets)
    .mark_bar()
    .encode(
        x='year(report_date):O',
        y='mean(length_mi)',
        color='borough',
        xOffset='borough'
    )
)