In [1]:
# The export from a gcloud shell.
# It does not include all columns in the table because a dump of all columns
# yielded a malformed CSV, probably because of stray quotes or commas within some entries.

# gcloud sql export csv software-usage-stats gs://logging-natcap/model_log_table-2021-09-14.csv --query="SELECT 'model_name', 'invest_release', 'system_full_platform_string', 'time' UNION SELECT model_name, invest_release, system_full_platform_string, time FROM model_log_table" --database=invest_model_usage

In [2]:
import pandas
import altair

# Let pandas display up to 100 rows before it truncates printed output.
pandas.set_option('display.max_rows', 100)

In [3]:
# Read the exported usage log into a DataFrame and preview the first rows.
# The path is relative, so the CSV is looked up in the working directory.
table_path = 'model_log_table-2021-09-14.csv'
df = pandas.read_csv(table_path)
df.head()

Unnamed: 0,model_name,invest_release,system_full_platform_string,time
0,natcap.invest.carbon,3.4.0rc1.post8+n2663c4de63f0,Linux-4.9.0-3-amd64-x86_64-with-debian-9.0,2017-10-30T20:12:13.657Z
1,natcap.invest.carbon,3.4.0rc1.post11+n69791f0c0b15,Linux-4.9.0-3-amd64-x86_64-with-debian-9.0,2017-11-01T17:49:42.512Z
2,natcap.invest.pollination,3.4.0rc1.post12+n08f8f5f39d6b,Linux-4.9.0-3-amd64-x86_64-with-debian-9.0,2017-11-01T20:30:28.764Z
3,test_ui_inputs,3.4.0rc1.post13+n594dee3c80e2,Windows-2008ServerR2-6.1.7601-SP1,2017-11-01T22:43:11.536Z
4,natcap.invest.hydropower.hydropower_water_yield,3.3.3,Windows-8-6.2.9200,2017-11-02T23:32:08.345Z


In [4]:
# To review the candidates, inspect df.model_name.unique() before filtering
# and df['model_name'].value_counts() afterwards.

# `model_name` values excluded from the analysis. Some are clearly bad data
# and others are not invest models. Preprocessors are excluded too: they are
# not useful as standalone models, and each of their runs should also be
# represented by a count of the actual model.
drop_models = [
    'mock.mock',
    'opal.core.gui',
    'OPAL.opal.sediment_sm.gui',
    'OPAL.opal.core.gui',
    'OPAL.opal.carbon_sm.gui',
    'natcap.invest.forage',
    'rangeland_production.forage',
    'execute',
    'testing',
    'natcap.invest.nearshore_wave_and_erosion.nearshore_wave_and_erosion',
    'natcap.invest.xbeach_storm_impact',
    '__main__',
    'natcap.invest.xbeach_storm_surge',
    'model_name',
    'test_ui_inputs',
    'mesh_models.mesh_scenario_generator',
    'N,N',
    'natcap.invest.coastal_blue_carbon.preprocessor',
    'natcap.invest.blue_carbon.blue_carbon_preprocessor',
    'natcap.invest.habitat_risk_assessment.hra_preprocessor',
    'natcap.invest.crop_production.crop_production',
    'natcap.invest.carbon.carbon_combined',
    'natcap.invest.habitat_suitability',
]

# Remember the size before filtering so the number of removed rows can be reported.
total_rows = len(df)
is_dropped = df['model_name'].isin(drop_models)
df = df[~is_dropped]
rows_dropped = total_rows - len(df)
print(f'dropping {rows_dropped} rows for models we dont care about')

dropping 22263 rows for models we dont care about


In [5]:
# Keep only the last dotted component of the module path as a short,
# readable label (e.g. 'natcap.invest.carbon' -> 'carbon').
df['model'] = df['model_name'].map(lambda dotted: dotted.rsplit('.', 1)[-1])

# Alias -> label to report under, for models whose names seem to have
# changed over time.
names_map = {
    'recreation': 'recmodel_client',
    'blue_carbon': 'coastal_blue_carbon',
    'coastal_blue_carbon2': 'coastal_blue_carbon',
    'nutrient': 'ndr',
    'delineateit2': 'delineateit',
    'overlap_analysis_mz': 'overlap_analysis',
    'urban_heat_island_mitigation': 'urban_cooling_model',
}


def reassign_name(name):
    """Return the mapped label for ``name``, or ``name`` itself if unmapped."""
    return names_map.get(name, name)

# Fold renamed models onto a single label, then tally the runs per model.
df['model'] = df['model'].map(reassign_name)
df['model'].value_counts()

sdr                                 50205
hydropower_water_yield              41123
carbon                              31239
habitat_quality                     26428
ndr                                 23988
seasonal_water_yield                15555
recmodel_client                     15555
fisheries                            8525
pollination                          7344
hra                                  6778
scenario_generator                   6321
coastal_vulnerability                6040
urban_cooling_model                  5297
coastal_blue_carbon                  4911
delineateit                          3048
scenario_gen_proximity               3017
urban_flood_risk_mitigation          2089
scenic_quality                       2001
forest_carbon_edge_effect            1812
routedem                             1739
crop_production_percentile           1550
wind_energy                          1344
root                                 1144
crop_production_regression        

In [6]:
# Parse the timestamp strings into timezone-aware (UTC) datetimes, then
# discard any row left without a timestamp and report how many were removed.
total_rows = len(df)
df['datetime'] = pandas.to_datetime(df['time'], utc=True)
df = df.dropna(subset=['datetime'])
rows_missing = total_rows - len(df)
print(f'dropping {rows_missing} rows with missing data')

dropping 0 rows with missing data


In [7]:
# Count runs per model within each time bucket. With 'M' the buckets are
# calendar months, labelled by the last day of the month.
frequency = 'M'
period_and_model = [
    pandas.Grouper(key='datetime', freq=frequency),
    pandas.Grouper(key='model'),
]
data = df.groupby(period_and_model).size().reset_index(name='counts')
data.head()

Unnamed: 0,datetime,model,counts
0,2015-09-30 00:00:00+00:00,scenario_gen_proximity,1
1,2015-09-30 00:00:00+00:00,sdr,2
2,2015-10-31 00:00:00+00:00,forest_carbon_edge_effect,24
3,2015-10-31 00:00:00+00:00,globio,42
4,2015-10-31 00:00:00+00:00,hra,11


In [8]:
# Reshape to one row per month and one column per model. A month/model pair
# with no recorded runs becomes 0 so that plotted lines drop to zero.
wide = data.pivot(index='datetime', columns='model', values='counts').fillna(0)
months_with_counts = len(wide)

# Rebuild the index as an unbroken monthly range so that months in which no
# model ran at all are present (as 0). The first and last months are assumed
# to be incomplete, so the offsets below trim them off.
begin = wide.index.min() + pandas.offsets.MonthBegin()
end = wide.index.max() - pandas.offsets.MonthEnd()
date_range = pandas.date_range(begin, end, freq=frequency)
wide = wide.reindex(date_range, fill_value=0)
print(f'complete data from {begin} to {end}')

complete data from 2015-10-01 00:00:00+00:00 to 2021-08-31 00:00:00+00:00


In [9]:
# Altair cannot plot an index, so move the dates into an ordinary column
# (which reset_index names 'index'), then reshape to long form: one row per
# date and model, with the count in 'value'.
wide = wide.reset_index()
long = wide.melt(id_vars='index')

In [17]:
def plot_model_counts_over_time(model_list, title=None):
    """Return an interactive line chart of monthly run counts per model.

    Reads the notebook-level long-format frame ``long`` (columns ``index``,
    ``model`` and ``value``) and keeps only the rows whose model is in
    ``model_list``. Clicking a legend entry keeps that series opaque and
    fades the others.

    Args:
        model_list: collection of model names to draw, one line each.
        title: optional chart title; left off the chart when None.

    Returns:
        The configured Altair chart.
    """
    # Side effect: this lifts Altair's row limit for later charts as well.
    altair.data_transformers.disable_max_rows()

    # Altair 5 deprecates selection_multi in favour of selection_point;
    # use the new name when available and fall back for Altair 4.
    legend_click = {'fields': ['model'], 'bind': 'legend'}
    if hasattr(altair, 'selection_point'):
        selection = altair.selection_point(**legend_click)
    else:
        selection = altair.selection_multi(**legend_click)

    to_plot = long[long['model'].isin(model_list)]
    chart = altair.Chart(to_plot).mark_line().encode(
        altair.X('index:T', axis=altair.Axis(format='%Y-%m'), title=None),
        altair.Y('value:Q', title='runs per month'),
        color=altair.Color('model', scale=altair.Scale(scheme='category10')),
        opacity=altair.condition(selection, altair.value(1), altair.value(0.2)),
        size=altair.value(1)
    )

    # Only forward a title when one was given, so the default None never
    # reaches the chart properties.
    chart_properties = {'width': 800, 'height': 300}
    if title is not None:
        chart_properties['title'] = title
    chart = chart.properties(**chart_properties)

    # Likewise, add_params is the Altair 5 replacement for add_selection.
    if hasattr(chart, 'add_params'):
        chart = chart.add_params(selection)
    else:
        chart = chart.add_selection(selection)

    return chart.configure_axis(grid=False)

In [18]:
# Total runs per month across every model. Only the numeric 'value' column is
# summed: summing the whole frame would also aggregate the string 'model'
# column (pandas >= 2.0 concatenates the names instead of dropping the
# column), and Altair would then serialise that column into the chart.
all_models_counts = long.groupby('index')['value'].sum().reset_index()
altair.Chart(all_models_counts).mark_line().encode(
    altair.X('index:T', axis=altair.Axis(format='%Y-%m'), title=None),
    altair.Y('value:Q', title='runs per month'),
    size=altair.value(1)
).properties(
    width=800,
    height=300,
    title='all models'
).configure_axis(
    grid=False
)

## These plots are interactive - click a series in the legend

In [22]:
# The eight models with the most runs in the value_counts() tally.
high_use_models = [
    'sdr',
    'hydropower_water_yield',
    'carbon',
    'habitat_quality',
    'ndr',
    'seasonal_water_yield',
    'recmodel_client',
    'fisheries',
]
plot_model_counts_over_time(high_use_models, 'high-use models')

In [19]:
# The next six models by run count, after the high-use group.
mid_use_models = [
    'pollination',
    'hra',
    'scenario_generator',
    'coastal_vulnerability',
    'urban_cooling_model',
    'coastal_blue_carbon',
]
plot_model_counts_over_time(mid_use_models, 'mid-use models')

In [20]:
# Models plotted together as the mid-low-use group.
# NOTE(review): in the run counts printed earlier, forest_carbon_edge_effect
# (1812) and root (1144) fall inside this group's range but are not listed
# here; forest_carbon_edge_effect is plotted with the low-use group and root
# is in no group. Confirm that this is intentional.
mid_low_use_models = [
    'delineateit',
    'scenario_gen_proximity',
    'urban_flood_risk_mitigation',
    'scenic_quality',
    'routedem',
    'crop_production_percentile',
    'wind_energy',
    'crop_production_regression',
]
plot_model_counts_over_time(mid_low_use_models, 'mid-low-use models')

In [21]:
# Models plotted together as the low-use group; per the chart title, some of
# them are already deprecated.
low_use_models = [
    'overlap_analysis',
    'wave_energy',
    'finfish_aquaculture',
    'fisheries_hst',
    'forest_carbon_edge_effect',
    'globio',
    'marine_water_quality_biophysical',
    'timber',
]
plot_model_counts_over_time(low_use_models, 'low-use models (including some that are already deprecated)')