In [None]:
# Export from a gcloud shell.
# The export does not include all columns in the table because a dump of all columns
# yielded a malformed CSV, probably because of this known issue with exporting NULL values:
# https://cloud.google.com/sql/docs/mysql/known-issues#import-export

# gcloud sql export csv --escape="5C" software-usage-stats gs://logging-natcap/model_log_table-2021-12-10.csv --query="SELECT 'model_name', 'invest_release', 'invest_interface', 'system_full_platform_string', 'time' UNION SELECT model_name, invest_release, invest_interface, system_full_platform_string, time FROM model_log_table" --database=invest_model_usage

# From a local shell:
# gsutil cp gs://logging-natcap/model_log_table-2021-12-10.csv .

In [1]:
# Enable IPython's autoreload extension in mode 2 -- presumably so that edits
# to the imported load_and_clean module are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas
import altair

from load_and_clean import load_and_clean_csv

# Let dataframe reprs show up to 100 rows before pandas truncates them.
pandas.options.display.max_rows = 100

In [3]:
# CSV export of the usage-log table to analyse (see the export commands in the
# first cell); the date in the file name is presumably the export date.
table_path = 'model_log_table-2022-10-05.csv'

In [4]:
# Load the exported log table. Judging by the output below, the helper also
# drops rows for models that are not of interest and prints the remaining
# per-model run counts. Later cells rely on the `model`, `datetime` and
# `invest_interface` columns of the returned frame.
df = load_and_clean_csv(table_path)

dropping 22972 rows for models we dont care about
remaining model counts:
sdr                                 56209
annual_water_yield                  41955
carbon                              40006
habitat_quality                     31506
ndr                                 28102
seasonal_water_yield                19959
recmodel_client                     18102
hra                                  9732
fisheries                            8557
pollination                          8384
delineateit                          8234
urban_cooling_model                  8028
coastal_vulnerability                6868
scenario_generator                   6472
coastal_blue_carbon                  5922
urban_flood_risk_mitigation          3881
scenario_gen_proximity               3866
scenic_quality                       2575
routedem                             2055
forest_carbon_edge_effect            2007
crop_production_percentile           1919
wind_energy                          1490
cr

# Workbench stats

In [5]:
# Only runs logged on or after the Workbench beta release date (UTC) are
# relevant for the Workbench comparison.
beta_release_date = pandas.to_datetime('2021-11-19', utc=True)
post_wb_runs = df[df['datetime'] >= beta_release_date]
print(f'{post_wb_runs.shape[0]} invest runs since workbench release')
# Match the interface label as a literal substring (regex=False, so '.' is not
# a wildcard) and count rows with a missing interface as non-matches
# (na=False), which keeps NaN out of the boolean mask used for indexing.
wb_runs = post_wb_runs[
    post_wb_runs['invest_interface'].str.contains(
        'Workbench 0.1.0-beta', regex=False, na=False)]
print(f'including {wb_runs.shape[0]} Workbench 0.1.0-beta runs')
wb_runs

43385 invest runs since workbench release
including 1639 Workbench 0.1.0-beta runs


Unnamed: 0,model_name,invest_release,invest_interface,system_full_platform_string,time,model,datetime
302398,natcap.invest.pollination,3.9.2.post480+gc06f40d7,Workbench 0.1.0-beta,Windows-10-10.0.19043-SP0,2021-11-19T19:13:22.531Z,pollination,2021-11-19 19:13:22.531000+00:00
302399,natcap.invest.delineateit.delineateit,3.9.2.post480+gc06f40d7,Workbench 0.1.0-beta,macOS-10.13.6-x86_64-i386-64bit,2021-11-19T19:24:50.201Z,delineateit,2021-11-19 19:24:50.201000+00:00
302501,natcap.invest.urban_cooling_model,3.9.2.post480+gc06f40d7,Workbench 0.1.0-beta,Windows-10-10.0.19043-SP0,2021-11-20T22:08:38.257Z,urban_cooling_model,2021-11-20 22:08:38.257000+00:00
303115,natcap.invest.carbon,3.9.2.post480+gc06f40d7,Workbench 0.1.0-beta,Windows-10-10.0.19042-SP0,2021-11-25T04:57:48.557Z,carbon,2021-11-25 04:57:48.557000+00:00
303680,natcap.invest.habitat_quality,3.9.2.post480+gc06f40d7,Workbench 0.1.0-beta,Windows-10-10.0.19042-SP0,2021-11-29T13:16:16.251Z,habitat_quality,2021-11-29 13:16:16.251000+00:00
...,...,...,...,...,...,...,...
346230,natcap.invest.annual_water_yield,3.12.0,Workbench 0.1.0-beta,Windows-10-10.0.22000-SP0,2022-10-05T08:52:27.291Z,annual_water_yield,2022-10-05 08:52:27.291000+00:00
346231,natcap.invest.carbon,3.12.0,Workbench 0.1.0-beta,Windows-10-10.0.22621-SP0,2022-10-05T08:53:05.843Z,carbon,2022-10-05 08:53:05.843000+00:00
346234,natcap.invest.sdr.sdr,3.12.0,Workbench 0.1.0-beta,Windows-10-10.0.22000-SP0,2022-10-05T09:25:52.376Z,sdr,2022-10-05 09:25:52.376000+00:00
346238,natcap.invest.sdr.sdr,3.12.0,Workbench 0.1.0-beta,Windows-10-10.0.22000-SP0,2022-10-05T09:45:06.114Z,sdr,2022-10-05 09:45:06.114000+00:00


# Monthly counts by model

In [6]:
# Count runs per model per time bucket. 'M' buckets by calendar month and
# labels each bucket with its month-end timestamp.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' -- confirm against the installed version before changing it.
frequency = 'M'
data = (
    df
    .groupby([
        pandas.Grouper(key='datetime', freq=frequency),
        pandas.Grouper(key='model'),
    ])
    .size()
    .reset_index(name='counts')
)
data.head()

Unnamed: 0,datetime,model,counts
0,2015-09-30 00:00:00+00:00,scenario_gen_proximity,1
1,2015-09-30 00:00:00+00:00,sdr,2
2,2015-10-31 00:00:00+00:00,forest_carbon_edge_effect,24
3,2015-10-31 00:00:00+00:00,globio,42
4,2015-10-31 00:00:00+00:00,hra,11


In [7]:
# One row per month, one column per model. A month/model pair with no runs has
# no row in `data` and therefore pivots to NaN; plot those as 0.
wide = data.pivot(index='datetime', columns='model', values='counts').fillna(0)
months_with_counts = len(wide)

# Rebuild the index as a gap-free monthly range, so that a month in which no
# model at all was run still appears (as a row of zeros).
# The first and last months are assumed to be incomplete, so the offsets below
# start the range after the first month and end it before the last one.
begin = wide.index.min() + pandas.offsets.MonthBegin()
end = wide.index.max() - pandas.offsets.MonthEnd()
date_range = pandas.date_range(start=begin, end=end, freq=frequency)
wide = wide.reindex(index=date_range, fill_value=0)
print(f'complete data from {begin} to {end}')

complete data from 2015-10-01 00:00:00+00:00 to 2022-09-30 00:00:00+00:00


In [8]:
# Reshape for altair, which plots columns but not the index: move the month
# index into an ordinary column (named 'index'), then melt to long form with
# one row per (month, model) and the count in 'value'.
# This cell reassigns `wide`, so re-run the previous cell before re-running it.
wide = wide.reset_index()
long = wide.melt(id_vars='index')

In [9]:
def plot_model_counts_over_time(model_list, title=None):
    """Build an interactive line chart of monthly run counts for some models.

    Reads the notebook-level ``long`` dataframe and plots only the rows whose
    ``model`` value is in ``model_list``, one line per model. The legend is
    bound to a selection: selected models are drawn at full opacity, the
    others at 0.2.

    Args:
        model_list: model names (values of ``long['model']``) to plot.
        title: chart title; defaults to None.

    Returns:
        The configured altair chart.
    """
    # Lift altair's row limit for the data embedded in the chart.
    altair.data_transformers.disable_max_rows()
    legend_pick = altair.selection_multi(fields=['model'], bind='legend')
    subset = long[long['model'].isin(model_list)]

    month_axis = altair.X(
        'index:T', axis=altair.Axis(format='%Y-%m'), title=None)
    count_axis = altair.Y('value:Q', title='runs per month')
    line_color = altair.Color(
        'model', scale=altair.Scale(scheme='category10'))
    line_opacity = altair.condition(
        legend_pick, altair.value(1), altair.value(0.2))

    chart = altair.Chart(subset).mark_line().encode(
        month_axis,
        count_axis,
        color=line_color,
        opacity=line_opacity,
        size=altair.value(1))
    chart = chart.properties(width=800, height=300, title=title)
    chart = chart.add_selection(legend_pick)
    chart = chart.configure_axis(
        grid=False, labelFontSize=12, titleFontSize=14, titlePadding=15)
    return chart.configure_legend(labelFontSize=12, title=None)

In [10]:
# Total runs per month across all models. Sum only the numeric 'value' column:
# `long` also carries the string 'model' column, which must not be aggregated
# (pandas >= 2.0 no longer drops non-numeric columns from groupby sums and
# would concatenate the model names instead).
all_models_counts = long.groupby('index')['value'].sum().reset_index()
altair.Chart(all_models_counts).mark_line().encode(
    altair.X('index:T', axis=altair.Axis(format='%Y-%m'), title=None),
    altair.Y('value:Q', title='runs per month'),
    size=altair.value(1)
).properties(
    width=800,
    height=300,
    title='all models'
).configure_axis(
    grid=False
)

## These plots are interactive - click a model in the legend to highlight its series

In [None]:
# Models with the highest run counts.
high_use_models = [
    'sdr',
    # The counts printed by load_and_clean_csv list this model as
    # 'annual_water_yield' (the second-highest count), so it must be named
    # here to be plotted. The older 'hydropower_water_yield' label is kept as
    # well; a name that is absent from the data is simply filtered out.
    'annual_water_yield',
    'hydropower_water_yield',
    'carbon',
    'habitat_quality',
    'ndr',
    'seasonal_water_yield',
    'recmodel_client',
    'fisheries',
]
plot_model_counts_over_time(high_use_models, 'high-use models')

In [None]:
# Models with moderate run counts.
mid_use_models = [
    'pollination',
    'hra',
    'scenario_generator',
    'coastal_vulnerability',
    'urban_cooling_model',
    'coastal_blue_carbon',
]
plot_model_counts_over_time(mid_use_models, title='mid-use models')

In [None]:
# Models with moderately low run counts.
mid_low_use_models = [
    'delineateit',
    'scenario_gen_proximity',
    'urban_flood_risk_mitigation',
    'scenic_quality',
    'routedem',
    'crop_production_percentile',
    'wind_energy',
    'crop_production_regression',
]
plot_model_counts_over_time(
    model_list=mid_low_use_models,
    title='mid-low-use models')

In [None]:
# Rarely run models; per the chart title, some are already deprecated.
low_use_models = [
    'overlap_analysis',
    'wave_energy',
    'finfish_aquaculture',
    'fisheries_hst',
    'forest_carbon_edge_effect',
    'globio',
    'marine_water_quality_biophysical',
    'timber',
]
plot_model_counts_over_time(
    model_list=low_use_models,
    title='low-use models (including some that are already deprecated)')

In [None]:
# Spot-check the monthly counts of a single model.
long.loc[long['model'] == 'wave_energy']