#### Importing the Libraries

In [363]:
import altair as alt
import pandas as pd

#### Loading the Data

In [364]:
# Read the filtered iNaturalist observations into a DataFrame
df = pd.read_csv('filtered_inaturalist.csv')

# Lift Altair's row limit so the full DataFrame can be passed to the charts
alt.data_transformers.disable_max_rows()

# Optional narrowing of the data set. Keep the next three lines commented
# out to work with all 142k rows.
# df = df[df['Class'] == 'Magnoliopsida'] # 55k rows
# df = df[df['Order'] == 'Sapindales'] # 3k rows
# df = df[df['Genus'] == 'Acer'] # 1300 rows

# Alternative: hand Altair the CSV path instead of the DataFrame. This is
# viewed more easily within the notebook, but cannot be exported.
# df = 'filtered_inaturalist.csv'
# alt.data_transformers.enable('csv')

#### Setting the Dimensions

In [365]:
# Bounding box of the observations
xmin, xmax = df['Longitude'].min(), df['Longitude'].max()
ymin, ymax = df['Latitude'].min(), df['Latitude'].max()

# Base chart height, and the factor by which the geographical
# distribution is enlarged relative to it
height = 200
geoscalar = 1.2

# Longitude span divided by latitude span. Scaling the height by this
# ratio gives the width needed for an equirectangular map.
ratio = (xmax - xmin) / (ymax - ymin)
width = height * ratio

#### Creating the Chart Parameters

In [366]:
# Taxon levels that can be measured or filtered on, broadest first
taxa = [
    'Kingdom', 'Phylum', 'Subphylum', 'Class',
    'Order', 'Genus', 'Species',
]

# Interval selection: the box the user drags across the scatterplot
brush = alt.selection_interval()

# Dropdown listing every taxon level; the chosen level is the one measured
measure_taxon_menu = alt.binding_select(
    name = 'Measure Taxon: ',
    options = taxa
)

# Point selection on the folded 'measure_taxon' field, starting at 'Genus'
measure_taxon_prm = alt.selection_point(
    bind = measure_taxon_menu,
    fields = ['measure_taxon'],
    value = 'Genus'
)

# Dropdown listing every taxon level except the last one ('Species');
# the chosen level is the one the search filter is applied to
filter_taxon_menu = alt.binding_select(
    name = 'Filter Taxon: ',
    options = taxa[:-1]
)

# Point selection on the folded 'filter_taxon' field, starting at 'Class'
filter_taxon_prm = alt.selection_point(
    bind = filter_taxon_menu,
    fields = ['filter_taxon'],
    value = 'Class'
)

# Free-text search input whose contents are matched against the values
# at the selected filter taxon level
filter_value_box = alt.binding(
    name = 'Filter Value: ',
    input = 'search',
    placeholder = 'Search'
)

# Parameter holding the search text, starting at 'Magnoliopsida'
filter_value_prm = alt.param(
    bind = filter_value_box,
    value = 'Magnoliopsida'
)

# Slider for the histogram's maximum value. The histogram code that would
# use it is currently commented out, so this parameter is not attached to
# any chart. The 0-1 range represents 0% to 100% of the (logarithmic)
# x-axis length.
hist_range_slider = alt.binding_range(
    name = 'Histogram Ceiling: ',
    min = 0,
    max = 1
)

hist_range_prm = alt.param(
    bind = hist_range_slider,
    value = 1 # Default value: the full axis
)

#### Creating the Base Chart

In [367]:
# Case-insensitive regular expression built from the search box text and
# tested against each row's value at the chosen filter taxon
filter_matches = alt.expr.test(
    alt.expr.regexp(filter_value_prm, 'i'),
    alt.datum.filter_value
)

# Foundation shared by every chart: data, default dimensions, and the
# measure/filter pipeline
base = (
    alt.Chart(df)
    .properties(height = height, width = height)
    # Fold the taxon columns into a measure taxon name/value pair...
    .transform_fold(taxa, as_ = ['measure_taxon', 'measure_value'])
    # ...and fold them again (minus the last level) into a filter pair
    .transform_fold(taxa[:-1], as_ = ['filter_taxon', 'filter_value'])
    # Keep only the rows for the taxa picked in the two dropdowns
    .transform_filter(measure_taxon_prm & filter_taxon_prm)
    # Keep only the rows whose filter value matches the search text
    .transform_filter(filter_matches)
)

#### Creating the Scatterplot (Map)

In [368]:
# Longitude and latitude are encoded as plain quantitative x/y channels,
# pinned to the data's bounding box. Author's note: linking fails otherwise.
lon_channel = alt.X('Longitude:Q', axis = None, scale = alt.Scale(domain = [xmin, xmax]))
lat_channel = alt.Y('Latitude:Q', axis = None, scale = alt.Scale(domain = [ymin, ymax]))

# Points inside the brush are coloured by measured taxon; the rest are gray
point_color = alt.condition(brush, 'measure_value:N', alt.value('darkgray'), legend = None)

scatter = (
    base
    # All of the interactive parameters are attached to this chart only
    .add_params(brush, filter_taxon_prm, filter_value_prm, measure_taxon_prm)
    .mark_point(
        filled = True,
        opacity = 0.8,
        size = height / 150 # Adjust the point size to suit the spread
    )
    .encode(x = lon_channel, y = lat_channel, color = point_color)
    # Width follows the equirectangular ratio so the points line up with a map
    .properties(
        title = 'Geographical Distribution',
        height = height * geoscalar,
        width = width * geoscalar
    )
)

#### Creating the Base Chart for Measuring Taxon Size

In [369]:
# Number of observations per measured taxon value within the brushed region
countbase = base.transform_filter( # Keeping only the points inside the selection box
    brush
).transform_filter( # Dropping rows whose taxon value is missing or empty
    # Logical && (rather than bitwise &) so the test is a true boolean AND
    "isValid(datum.measure_value) && datum.measure_value != ''"
).transform_aggregate( # One row per taxon value, with its observation count
    count = 'count()', groupby = ['measure_value']
)

#### Creating the Bar Chart

In [370]:
# Horizontal bar chart of the ten most frequent taxa in the brushed region,
# expressed as a share of all selected observations
bars = countbase.transform_joinaggregate( # Total count across every taxon
    sum = 'sum(count)'
).transform_calculate( # Converting count to percentage share
    percentage = 'datum.count / datum.sum'
).transform_window( # Numbering the bars in order to limit them
    # row_number never repeats, so tied counts cannot let more than ten
    # bars through; the secondary sort makes the tie order deterministic
    rank = 'row_number()',
    sort = [
        alt.SortField('count', order = 'descending'),
        alt.SortField('measure_value', order = 'ascending')
    ]
).transform_filter( # Limiting the number of bars
    alt.datum.rank <= 10
).encode(
    x = alt.X(
        'percentage:Q',
        title = 'Percentage Share',
        axis = alt.Axis(
            # Labels whole-percent ticks only. The comparison uses a small
            # tolerance because e.g. 0.07 * 100 is 7.000000000000001, which
            # a plain "% 1" test would wrongly treat as fractional. The label
            # is formatted from the numeric tick value, not the label string.
            labelExpr = "abs(datum.value * 100 - round(datum.value * 100)) > 0.000001 ? null : format(datum.value, '.0%')"
        )
    ),
    y = alt.Y(
        'measure_value:N',
        sort = '-x', # Sorting bars in descending order
        title = None
    ),
    text = alt.Text('percentage:Q', format = '.1%'),
    tooltip = [alt.Tooltip('count:Q', title = 'Population Size')]
).properties(
    title = 'Top 10 Taxa By Regional Frequency'
)

# Layering the coloured bars with their percentage labels
bars = bars.mark_bar().encode(
    color = alt.Color('measure_value:N', legend = None)
) + bars.mark_text(align = 'left', dx = 2)

#### Creating the Histogram

In [371]:
# Histogram of taxon sizes: how many taxa fall into each (base-10 logarithmic)
# bin of observation count within the brushed region
hist = countbase.transform_calculate(
    logct = 'log(datum.count) / log(10)' # Base-10 logarithm of the count
# Disabled experiment: slider-controlled ceiling on the histogram's x-axis
# ).transform_joinaggregate(
#     logmax = 'max(logct)'
# ).transform_calculate(
#     logperc = 'datum.logct / datum.logmax'
# ).add_params(
#     hist_range_prm
# ).transform_filter(
#     # Stack Overflow said this was impossible
#     # Who's laughing now? Not me I have no sanity left this took 5 hours
#     f'{alt.datum.logperc} <= {hist_range_prm.name}'
).encode(
    x = alt.X( # Logarithmic axis makes most sense for population size
        'logct:Q',
        title = 'Population Size',
        bin = alt.Bin(maxbins = 16), # Modify bin size as looks best
        axis = alt.Axis(
            # Displaying integer ticks as exponents. Formatted from the
            # numeric tick value rather than the already-formatted label.
            labelExpr = "datum.value % 1 ? null : '10^' + format(datum.value, '.0f')"
        )
    ),
    y = alt.Y(
        'count():Q',
        title = 'Number of Taxa',
        axis = alt.Axis(
            # Displays integer axis ticks only. Uses datum.value because the
            # default label text carries thousands separators ("1,000"),
            # which format() would turn into NaN.
            labelExpr = "datum.value % 1 ? null : format(datum.value, '.0f')"
        )
    ),
    # color = alt.Color('measure_value:N', legend = None),
    # tooltip = [
    #     alt.Tooltip('measure_value:N', title = 'Taxon Name'),
    #     alt.Tooltip('count:Q', title = 'Population Size')
    # ]
).properties(
    title = 'Distribution of Taxon Size'
).mark_bar()

#### Creating the Map Background

In [372]:
# Closed rectangular ring around the data's bounding box; the first corner
# is repeated at the end to close the polygon
bounds_ring = [
    [xmax, ymax],
    [xmax, ymin],
    [xmin, ymin],
    [xmin, ymax],
    [xmax, ymax]
]

# GeoJSON feature the projection is fitted to, modelled after the Altair docs.
# Source: https://altair-viz.github.io/user_guide/marks/geoshape.html
geobounds = alt.Feature(
    type = 'Feature',
    properties = {},
    geometry = alt.Geometry(
        alt.Polygon(type = 'Polygon', coordinates = [bounds_ring])
    )
)

# Neighborhood outlines, loaded as in the Altair Tutorial done in class on 5/23/23
boston_url = 'https://raw.githubusercontent.com/lsouth/DS4200/main/Boston_Neighborhoods.json'
boston = alt.topo_feature(boston_url, feature = 'Boston_Neighborhoods')

# Geoshape of Boston to serve as a background for the scatterplot. It uses
# the same dimensions as the scatterplot and is fitted to the bounding box.
boston_map = (
    alt.Chart(boston)
    .mark_geoshape(fill = 'lightgray', stroke = 'white', clip = True)
    .encode(tooltip = 'properties.Name:N')
    .project(type = 'equirectangular', fit = geobounds)
    .properties(height = height * geoscalar, width = width * geoscalar)
)

# Drawing the scatterplot on top of the map.
# NOTE: `scatter` is rebuilt from itself here, so re-running this cell
# without re-running the scatterplot cell layers the map again.
scatter = alt.layer(boston_map, scatter)

#### Exporting the Visualization

In [373]:
# Final layout: the map on top, the bar chart and histogram side by side below
bottom_row = alt.hconcat(bars, hist)
chart = alt.vconcat(scatter, bottom_row)

# Writing the interactive visualization to a standalone HTML file
chart.save('inatvis.html')