#### Importing the Libraries

In [739]:
import altair as alt
import pandas as pd

#### Loading the Data

In [740]:
# Turn off Altair's maximum-row check so the whole table can be passed to the charts
alt.data_transformers.disable_max_rows()

csv_path = 'filtered_inaturalist.csv'
df = pd.read_csv(csv_path)

# Optional narrowing of the data; keep these three lines commented to use all 142k rows
# df = df[df['Class'] == 'Magnoliopsida'] # 55k rows
# df = df[df['Order'] == 'Asterales'] # 10k rows
# df = df[df['Genus'] == 'Acer'] # 1300 rows

# # Alternative: reference the CSV by path instead of embedding the rows.
# # Viewed more easily within the notebook, but cannot be exported
# df = 'filtered_inaturalist.csv'
# alt.data_transformers.enable('csv')

#### Loading Mass Ave Corridor Data

In [741]:
# Points along the Mass Ave Corridor, from one end to the other, as (latitude, longitude).
# Two further points are deliberately left out:
# (42.331998, -71.071242) and (42.320507, -71.061502)
corridor_points = [
    (42.342770, -71.084953),
    (42.340637, -71.081745),
    (42.339316, -71.080351),
    (42.334992, -71.075119),
]
mass_ave_corridor = pd.DataFrame(corridor_points, columns = ['Latitude', 'Longitude'])

# Center of Mass Ave Corridor as [latitude, longitude] (mean of the points above)
center = [mass_ave_corridor[axis].mean() for axis in ('Latitude', 'Longitude')]

#### Setting the Dimensions

In [742]:
# Extent of the observations before any cropping
longitudes, latitudes = df['Longitude'], df['Latitude']
xmin, xmax = longitudes.min(), longitudes.max()
ymin, ymax = latitudes.min(), latitudes.max()

print(xmin, xmax, ymin, ymax)

# Crop to a window around the corridor center, never extending past the data itself.
# The window spans latRange in latitude and twice that in longitude.
latRange = 0.02
center_lat, center_lon = center
xmin = max(xmin, center_lon - latRange)
xmax = min(xmax, center_lon + latRange)
ymin = max(ymin, center_lat - latRange / 2)
ymax = min(ymax, center_lat + latRange / 2)

# Keep only the observations that fall inside the window
inside_lat = df['Latitude'].between(ymin, ymax)
inside_lon = df['Longitude'].between(xmin, xmax)
df = df[inside_lat & inside_lon]

# Width-to-height ratio of the window, in degrees
ratio = (xmax - xmin) / (ymax - ymin)

height = 200
geoscalar = 1.2 # Amount to scale the size of the geographical distribution
width = height * ratio # Multiply by ratio for equirectangular maps

-71.18998434 -70.87663262 42.23185 42.39328


#### Creating the Chart Parameters

In [743]:
# Interval selection (drag a box) used on the scatterplot
brush = alt.selection_interval()

# Taxonomic ranks, broadest first; these are also column names in the data
taxa = ['Kingdom', 'Phylum', 'Subphylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

# Dropdown: which rank the charts measure (defaults to Genus)
measure_taxon_menu = alt.binding_select(options = taxa, name = 'Measure Taxon: ')
measure_taxon_prm = alt.param(value = 'Genus', bind = measure_taxon_menu)

# Dropdown: which rank the filter applies to (defaults to Class; Species is not offered)
filter_taxon_menu = alt.binding_select(options = taxa[:-1], name = 'Filter Taxon: ')
filter_taxon_prm = alt.param(value = 'Class', bind = filter_taxon_menu)

# Search box: text matched against values at the filter rank (defaults to Magnoliopsida)
filter_value_box = alt.binding(input = 'search', placeholder = 'Search', name = 'Filter Value: ')
filter_value_prm = alt.param(value = 'Magnoliopsida', bind = filter_value_box)

# Slider for the histogram ceiling; the histogram code that uses it is currently commented out.
# 0 to 1 represents 0% to 100% of the (logarithmic) x-axis length.
hist_range_slider = alt.binding_range(min = 0, max = 1, name = 'Histogram Ceiling: ')
hist_range_prm = alt.param(value = 1, bind = hist_range_slider)

# Slider: rank of the first bar shown in the bar chart
bar_rank_slider = alt.binding_range(min = 1, max = 90, step = 1, name = 'Bar Chart Rank: ')
bar_slider_prm = alt.param(value = 1, bind = bar_rank_slider)

#### Creating the Expression for Pluralizing Taxa

In [744]:
# Irregular plurals, keyed by the lowercase ending they replace.
# Any name that matches none of these simply gets "s" appended.
plurals = {
    'phylum': 'phyla',
    'class': 'classes',
    'genus': 'genera',
    'species': 'species'
}

# Build a nested Vega if(...) expression template, one level per irregular ending.
# '{}' marks where the name being pluralized is substituted later.
plexpr = '{} + "s"'
for singular, plural in plurals.items():
    ending = f'/{singular}$/'
    plexpr = (
        'if(test(' + ending + ', {}), '
        'replace({}, ' + ending + ', "' + plural + '"), '
        + plexpr + ')'
    )

# Every placeholder is lowercased so the endings match regardless of case
plexpr = 'lower({})'.join(plexpr.split('{}'))

def pluralTaxon(param = measure_taxon_prm, proper = True):
    """Return a Vega expression string that pluralizes the value of `param`.

    Parameters:
        param: Altair parameter whose name is substituted into the template.
        proper: When True, the expression also upper-cases the first letter
            of the pluralized word.

    Returns:
        The module-level `plexpr` template with every '{}' placeholder
        replaced by `param.name`.
    """
    # Work on a local copy. Rebinding the module-level template would wrap it
    # again on every call, so the expression would keep growing and a later
    # proper=False call would still come back capitalized.
    expr = plexpr
    if proper:
        expr = f'upper(slice({expr}, 0, 1)) + slice({expr}, 1)'
    return expr.replace('{}', param.name)

#### Creating the Base Chart

In [745]:
# Calculated fields: each row's value in the column named by the matching dropdown
taxon_lookups = {
    'measure_taxon': f'datum[{measure_taxon_prm.name}]',
    'filter_taxon': f'datum[{filter_taxon_prm.name}]',
}

# Case-insensitive regular-expression test of the search text against the filter taxon value
matches_filter = alt.expr.test(
    alt.expr.regexp(filter_value_prm, 'i'),
    alt.datum.filter_taxon
)

# Base chart shared by every view: data, dimensions, measure/filter parameters and the filter
base = (
    alt.Chart(df)
    .add_params(measure_taxon_prm, filter_taxon_prm, filter_value_prm)
    .properties(height = height, width = height)
    .transform_calculate(**taxon_lookups)
    .transform_filter(matches_filter)
)

# Color encoding reused throughout the visualization: one color per measured taxon, no legend
color = alt.Color(
    'measure_taxon:N',
    scale = alt.Scale(scheme = 'category20'),
    legend = None
)

#### Creating the Scatterplot (Map)

In [746]:
# Title tracks the measure-taxon dropdown through a Vega expression
scatter_title = alt.Title(
    alt.expr(f'"Geographical Distribution of " + {pluralTaxon()}'),
    subtitle = 'Mass Ave Corridor is highlighted in black'
)

# Longitude/latitude are drawn on plain quantitative x/y scales rather than a
# geographic projection (the original author notes that linking fails otherwise)
lon_axis = alt.X('Longitude:Q', axis = None, scale = alt.Scale(domain = [xmin, xmax]))
lat_axis = alt.Y('Latitude:Q', axis = None, scale = alt.Scale(domain = [ymin, ymax]))

scatter = (
    base.add_params(brush)
    .mark_point(
        filled = True,
        size = height / 40, # You can change the point size depending on the spread
        opacity = 0.9
    )
    .encode(
        x = lon_axis,
        y = lat_axis,
        # Points outside the brush are greyed out
        color = alt.condition(brush, color, alt.value('darkgray'))
    )
    .properties( # Same size as the map layers so the width matches an equirectangular map
        height = height * geoscalar,
        width = width * geoscalar,
        title = scatter_title
    )
)

#### Creating the Base Chart for Measuring Taxon Size

In [747]:
# Text label for a taxon's share of the total. Shares that round to 0.0% are
# shown as "< 0.1%". The whole formatted string is compared: a substring
# replace of "0.0%" would also rewrite "10.0%" into "1< 0.1%".
percent_label = 'format(datum.percentage, ".1%")'

# Base chart for measuring taxon size: one row per measured taxon within the
# brushed area, with its count, share of the total, label text and rank
countbase = base.transform_filter(
    brush & 'isValid(datum.measure_taxon)'
).transform_aggregate(
    count = 'count()', groupby = ['measure_taxon']
).transform_joinaggregate(
    sum = 'sum(count)'
).transform_calculate( # Converting count to percentage share
    percentage = 'datum.count / datum.sum'
).transform_calculate( # Tooltip for percentage
    percent_text = f'{percent_label} == "0.0%" ? "< 0.1%" : {percent_label}'
).transform_window( # Ranking the bars in order to limit them
    row_number = 'row_number(count)',
    sort = [alt.SortField('count', order = 'descending')]
).encode(
    tooltip = [
        alt.Tooltip('measure_taxon:N', title = 'Taxon Name'),
        alt.Tooltip('count:Q', title = 'Population Size'),
        alt.Tooltip('percent_text:N', title = 'Percentage of Total')
    ]
)

#### Creating the Bar Chart

In [748]:
# Bar chart of the ten taxa whose rank starts at the slider value
rank = alt.datum.row_number
first_rank = bar_slider_prm.name

barbase = countbase.add_params(
    bar_slider_prm
).transform_filter( # Limiting the number of bars; logical && rather than bitwise &
    f'{rank} >= {first_rank} && {rank} < {first_rank} + 10'
).encode(
    x = alt.X(
        'percentage:Q',
        title = 'Percentage of Total',
        axis = alt.Axis(
            # Displays whole-percent ticks only. A tolerance is used because
            # value * 100 is rarely an exact integer in floating point
            # (e.g. 0.07 * 100), and the numeric value is formatted rather
            # than the label text.
            labelExpr = 'abs(datum.value * 100 - round(datum.value * 100)) > 0.000001 '
                        '? null : format(datum.value, ".0%")',
            grid = False
        )
    ),
    y = alt.Y(
        'measure_taxon:N',
        axis = alt.Axis(labelLimit = 72),
        sort = '-x', # Sorting bars in descending order
        title = None
    ),
    text = 'percent_text:N'
).properties(
    title = alt.Title(
        alt.expr(f'"Most Popular " + {pluralTaxon()} + " (Showing " + ' +\
                 f'{first_rank} + "-" + ({first_rank} + 9) + ")"')
    )
)

# Colored bars with the percentage text drawn just past the end of each bar
bars = barbase.mark_bar().encode(
    color = color
) + barbase.mark_text(align = 'left', dx = 2)

#### Creating the Histogram

In [749]:
# Histogram of taxon population sizes on a base-10 logarithmic axis.
# Axis labels are formatted from the numeric tick value (datum.value), not the
# label text (datum.label): label text such as "1,000" does not parse as a number.
hist = countbase.transform_calculate(
    logct = 'log(datum.count) / log(10)'
# Histogram ceiling slider, currently left out of the implementation:
# ).transform_joinaggregate(
#     logmax = 'max(logct)'
# ).transform_calculate(
#     logperc = 'datum.logct / datum.logmax'
# ).add_params(
#     hist_range_prm
# ).transform_filter(
#     f'{alt.datum.logperc} <= {hist_range_prm.name}'
).encode(
    x = alt.X( # Logarithmic axis makes most sense for population size
        'logct:Q',
        title = 'Population Size (Logarithmic)',
        bin = alt.Bin(maxbins = 16), # Modify bin size as looks best
        axis = alt.Axis( # Displaying integer ticks as exponents
            labelExpr = 'datum.value % 1 ? null : "10^" + format(datum.value, ".0f")'
        )
    ),
    y = alt.Y(
        'count():Q',
        title = 'Number of Taxa',
        axis = alt.Axis( # Displays integer axis ticks only
            labelExpr = 'datum.value % 1 ? null : format(datum.value, ".0f")'
        )
    ),
    color = color
).properties(
    title = alt.Title(alt.expr(f'"Distribution of " + {measure_taxon_prm.name} + " Size"')),
).mark_bar()

#### Setting Map Bounds

In [750]:
# Rectangular feature covering the map window; the map layers are fitted to it.
# Modelled after the Altair documentation:
# Source: https://altair-viz.github.io/user_guide/marks/geoshape.html
window_corners = [
    [xmax, ymax],
    [xmax, ymin],
    [xmin, ymin],
    [xmin, ymax],
]
# Close the ring by ending on the corner it started from
window_ring = window_corners + [[xmax, ymax]]

window_polygon = alt.Polygon(type = 'Polygon', coordinates = [window_ring])

geobounds = alt.Feature(
    type = 'Feature',
    properties = {},
    geometry = alt.Geometry(window_polygon)
)

#### Creating the Map Background(s)

In [751]:
# The following code is taken from the Altair Tutorial done in class on 5/23/23

# Boston neighborhoods, loaded from a remote file as the 'Boston_Neighborhoods' feature
boston_url = 'https://raw.githubusercontent.com/lsouth/DS4200/main/Boston_Neighborhoods.json'
boston = alt.topo_feature(url = boston_url, feature = 'Boston_Neighborhoods')

# Massachusetts towns, loaded from a local file as the 'Towns' feature
# NOTE(review): the extension says GeoJSON but the file is read with topo_feature — confirm the format
massachusetts = alt.topo_feature(url = 'MassTowns.geojson', feature = 'Towns')

# Geoshape layer that serves as a background for the scatterplot
def markgeoshape(feature):
    """Build a light-grey geoshape layer for `feature`.

    The layer uses an equirectangular projection fitted to `geobounds`, is
    clipped to the view, and is sized height * geoscalar by width * geoscalar.

    Parameters:
        feature: Data passed straight to alt.Chart.

    Returns:
        The configured Altair chart.
    """
    shape_style = dict(
        fill = '#DFDFDF',
        stroke = 'white',
        strokeWidth = 2,
        clip = True
    )
    layer = alt.Chart(feature).mark_geoshape(**shape_style)
    layer = layer.project(type = 'equirectangular', fit = geobounds)
    return layer.properties(
        height = height * geoscalar,
        width = width * geoscalar
    )

# Map of Massachusetts (no tooltip)
mass_map = markgeoshape(massachusetts)

# Map of Boston, with the 'properties.Name' field of each shape as its tooltip
boston_map = markgeoshape(boston).encode(tooltip = 'properties.Name:N')

# Line denoting Mass Ave Corridor, drawn on the same x/y domains as the scatterplot
corridor_lon = alt.X('Longitude:Q', axis = None, scale = alt.Scale(domain = [xmin, xmax]))
corridor_lat = alt.Y('Latitude:Q', axis = None, scale = alt.Scale(domain = [ymin, ymax]))
mac_line = (
    alt.Chart(mass_ave_corridor)
    .mark_line(clip = True)
    .encode(x = corridor_lon, y = corridor_lat, color = alt.value('#444444'))
)

#### Exporting the Visualization

In [752]:
# Layer order, bottom to top: Massachusetts, Boston, corridor line, scatterplot
background_maps = mass_map + boston_map
foreground = mac_line + scatter
scatterMap = background_maps + foreground

# Map on top, bar chart and histogram side by side underneath
bottom_row = bars | hist
chart = alt.vconcat(scatterMap, bottom_row).configure(background = '#E8FCFF')

# Export the visualization as an HTML file
chart.save('inatvis_demo.html')