In [1]:
%%capture
pip install plotly pandas statsmodels kaleido

In [2]:
# read CSV data

import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import os.path
import pickle

# Root directory of the experiment results; read_dataframe() and the feature
# inspection helpers below build all their input paths from it.
output_directory = '../output-clean'
# Directory that show() writes PDF figures to; a falsy value disables the export.
figures_directory = '../../paper-icse-2024-linux/figures'
# Figure height passed to show() by most plots in this notebook.
default_height = 300

def read_dataframe(stage, dtype=None, usecols=None, file=None):
    """Read the CSV file ``{output_directory}/{stage}/{file}.csv`` into a data frame.

    Parameters:
        stage: name of the sub-directory below ``output_directory``.
        dtype: optional column-to-dtype mapping forwarded to ``pd.read_csv``.
        usecols: optional column selection forwarded to ``pd.read_csv``.
        file: base name of the CSV file (without extension); defaults to ``'output'``.

    Returns:
        The loaded data frame. If it has a ``committer_date_unix`` column, a
        ``committer_date`` column with the corresponding timestamps is added.
    """
    if not file:
        file = 'output'
    df = pd.read_csv(f'{output_directory}/{stage}/{file}.csv', dtype=dtype, usecols=usecols)
    if 'committer_date_unix' in df:
        # convert the whole column at once instead of calling to_datetime per row
        df['committer_date'] = pd.to_datetime(df['committer_date_unix'], unit='s')
    return df

def replace_values(df):
    """Replace the raw extractor identifiers in ``df`` by their display names, in place."""
    display_names = (
        ('kconfigreader', 'KConfigReader'),
        ('kmax', 'KClause'),
    )
    for raw_name, display_name in display_names:
        df.replace(raw_name, display_name, inplace=True)

# architectures per revision, in chronological order
df_architectures = read_dataframe('read-linux-architectures').sort_values(by='committer_date')
df_architectures['year'] = df_architectures['committer_date'].map(lambda d: int(d.year))

# configuration options and their types; rows whose Kconfig file path contains '/um/' are dropped
df_configs = read_dataframe('read-linux-configs') \
    .loc[lambda configs: ~configs['kconfig-file'].str.contains('/um/')]

df_config_types = read_dataframe('read-linux-configs', file='output.types') \
    .loc[lambda types: ~types['kconfig-file'].str.contains('/um/')]

df_kconfig = read_dataframe('kconfig')
df_kconfig['year'] = df_kconfig['committer_date'].map(lambda d: int(d.year))

# results of the transformation stages
df_uvl = read_dataframe('model_to_uvl_featureide')
df_xml = read_dataframe('model_to_xml_featureide')
df_smt = read_dataframe('model_to_smt_z3')
df_dimacs = read_dataframe('dimacs')
df_backbone_dimacs = read_dataframe('backbone-dimacs')

# model counts are kept as strings; their length serves as the log10 magnitude
df_solve = read_dataframe('solve_model-count', dtype={'model-count': 'string'})
df_solve['model-count'] = df_solve['model-count'].replace('1', '')
df_solve['model-count-log10'] = df_solve['model-count'].fillna('').map(len).replace(0, np.nan)
df_solve['year'] = df_solve['committer_date'].map(lambda d: int(d.year))

# use the display names of the extractors everywhere
for df in (df_kconfig, df_uvl, df_xml, df_smt, df_dimacs, df_backbone_dimacs, df_solve):
    replace_values(df)

In [3]:
# helper functions for drawing plots

def estimate_group(group):
    """Print a LaTeX table row that spans six columns and contains the heading ``group``."""
    # backslashes are escaped explicitly to avoid invalid escape sequences in the literal
    print('\\multicolumn{6}{l}{' + group + '} \\\\')

def estimate_trend(fig, message='', idx=0, date1=None, date2=None):
    """Print a LaTeX table row that summarizes one OLS trendline of ``fig``.

    Parameters:
        fig: a plotly express figure that was created with a trendline.
        message: label printed in the first column of the row.
        idx: row of ``px.get_trendline_results(fig)`` to evaluate.
        date1: first date at which the fitted line is evaluated;
            defaults to 20 years before the time of the call.
        date2: second date at which the fitted line is evaluated;
            defaults to the time of the call.

    The row contains the slope per day, per month (30.437 days), and per year
    (365.25 days), followed by the fitted values at ``date1`` and ``date2``.
    The slope is treated as a change per second, matching the use of
    ``Timestamp.timestamp()`` for the evaluation points.
    """
    # resolve the defaults at call time; as default arguments they would be
    # frozen at the moment the function was defined
    now = pd.Timestamp.now()
    if date1 is None:
        date1 = now - pd.Timedelta(days=365.25*20)
    if date2 is None:
        date2 = now
    fit = px.get_trendline_results(fig).iloc[idx]['px_fit_results']
    intercept = fit.params[0]
    slope = fit.params[1]
    seconds_per_day = pd.to_timedelta(1, unit='D').total_seconds()
    daily = slope * seconds_per_day
    monthly = slope * seconds_per_day * 30.437
    yearly = slope * seconds_per_day * 365.25
    on_date1 = intercept + slope * date1.timestamp()
    on_date2 = intercept + slope * date2.timestamp()
    print('\\hspace*{1mm} ' + f'{message} & {round(daily):,} & {round(monthly):,} & {round(yearly):,} & {round(on_date1):,} & {round(on_date2):,} \\\\ ')

def committer_date_x_axis(fig, df=df_kconfig, append_revision=True):
    """Label the date x axis of ``fig`` with one tick per year.

    Parameters:
        fig: figure whose x axes are updated.
        df: data frame with ``committer_date`` and ``revision`` columns that
            the ticks are derived from.
        append_revision: whether to print the first revision of each year
            (without its first character) below the year.

    The first year found in ``df`` gets no tick.
    """
    # use the passed data frame (previously df_kconfig was hard-coded here)
    axis = df[['committer_date', 'revision']].drop_duplicates()
    axis['year'] = axis['committer_date'].apply(lambda d: str(d.year))
    # keep only the earliest revision of every year
    axis = axis.sort_values(by='committer_date').groupby('year').nth(0).reset_index()
    years = axis['year'][1:]
    if append_revision:
        ticktext = axis['year'].str.cat('<br><sup>' + axis['revision'].str[1:] + '</sup>')[1:]
    else:
        # skip the first year here as well, so that labels and positions line up
        ticktext = years
    fig.update_xaxes(ticktext=ticktext, tickvals=years)

def revision_x_axis(fig, df=df_kconfig):
    """Label the revision x axis of ``fig`` with one tick per year.

    Parameters:
        fig: figure whose x axes are updated.
        df: data frame with ``committer_date`` and ``revision`` columns; the
            earliest revision of every year becomes a tick labeled with that year.
    """
    # use the passed data frame (previously df_kconfig was hard-coded here)
    axis = df[['committer_date', 'revision']].drop_duplicates()
    axis['year'] = axis['committer_date'].apply(lambda d: str(d.year))
    axis = axis.sort_values(by='committer_date').groupby('year').nth(0).reset_index()
    fig.update_xaxes(
        ticktext=axis['year'],
        tickvals=axis['revision']
    )

def log10_y_axis(fig):
    """Render the y tick labels of ``fig`` as powers of ten (tick value as superscript)."""
    exponent_markup = {'tickprefix': "10<sup>", 'ticksuffix': "</sup>"}
    fig.update_yaxes(**exponent_markup)

def percentage_y_axis(fig):
    """Format the y tick labels of ``fig`` as percentages without decimals."""
    y_axis = fig.layout.yaxis
    y_axis.tickformat = ',.0%'

def committer_date_labels(dict={}):
    """Return axis labels for date-based plots; entries of ``dict`` are added and take precedence."""
    labels = {'committer_date': 'Year / First Release in Year'}
    labels.update(dict)
    return labels

def revision_labels(dict={}):
    """Return axis labels for revision-based plots; entries of ``dict`` are added and take precedence."""
    labels = {'revision': 'Year'}
    labels.update(dict)
    return labels

def style_legend(fig, position='topleft'):
    """Anchor the legend of ``fig`` in the given corner; any other ``position`` hides the legend."""
    corners = {
        'topleft': dict(yanchor='top', y=0.98, xanchor='left', x=0.01),
        'topright': dict(yanchor='top', y=0.98, xanchor='right', x=0.98),
        'bottomright': dict(yanchor='bottom', y=0.01, xanchor='right', x=0.98),
        'bottomleft': dict(yanchor='bottom', y=0.01, xanchor='left', x=0.01),
    }
    if position in corners:
        fig.update_layout(legend=corners[position])
    else:
        fig.update_layout(showlegend=False)

def style_box(fig, legend_position='topleft'):
    """Apply the box plot look (transparent fill, thin lines, small markers) and place the legend."""
    trace_styles = (
        {'fillcolor': 'rgba(0,0,0,0)'},
        {'line_width': 1},
        {'marker_size': 2},
    )
    for trace_style in trace_styles:
        fig.update_traces(**trace_style)
    style_legend(fig, legend_position)

def style_scatter(fig, marker_size=4, legend_position='topleft'):
    """Set the marker size of all traces (skipped for a falsy ``marker_size``) and place the legend."""
    resize_markers = bool(marker_size)
    if resize_markers:
        fig.update_traces(marker_size=marker_size)
    style_legend(fig, legend_position)

def plot_failures(fig, df, x, y, y_value, align='bottom', xref='x', font_size=10, textangle=270):
    """Annotate ``fig`` with the number and share of missing ``y`` values per ``x`` group.

    For every distinct value of ``x`` in ``df`` (missing values form their own
    group), one annotation is added at ``(x, y_value)``. Its text has the form
    ``'<failures> (<share>%)'`` and is empty if the group has no missing values.
    ``align`` selects whether the text is anchored at the bottom or at the top.
    """
    failures_column = f'{y}_failures'
    attempts_column = f'{y}_attempts'
    text_column = f'{y}_text'

    # attempts = rows per group, failures = rows per group without a value
    grouped = df.groupby(x, dropna=False)[y]
    attempts = grouped.size().reset_index().rename(columns={y: attempts_column})
    failures = (grouped.size() - grouped.count()).reset_index().rename(columns={y: failures_column})
    failures = pd.merge(failures, attempts)

    share = failures[failures_column] / failures[attempts_column]
    failures[text_column] = failures[failures_column].astype(str) + ' (' + share.apply(lambda v: f'{v * 100:.1f}%') + ')'

    at_bottom = align == 'bottom'
    placement = dict(
        align='left' if at_bottom else 'right',
        yanchor='bottom' if at_bottom else 'top',
        yshift=5 if at_bottom else -5,
    )
    for row in range(len(failures)):
        has_failures = not (failures.at[row, failures_column] == 0)
        fig.add_annotation(
            x=failures.at[row, x],
            y=y_value,
            text=failures.at[row, text_column] if has_failures else "",
            showarrow=False,
            font_size=font_size,
            textangle=textangle,
            font_color='gray',
            xref=xref,
            **placement
        )

def show(fig, name=None, width=1000, height=500, margin=None):
    """Size ``fig``, optionally export it as a PDF, and display it.

    Without an explicit ``margin``, all margins are set to zero. The figure is
    written to ``{figures_directory}/{name}.pdf`` only if both
    ``figures_directory`` and ``name`` are set.
    """
    fig.update_layout(width=width, height=height)
    fig.update_layout(margin=margin if margin else dict(l=0, r=0, t=0, b=0))
    should_export = figures_directory and name
    if should_export:
        fig.write_image(f'{figures_directory}/{name}.pdf')
    fig.show()

In [7]:
# differentiate kinds of features
# takes a while to run, so here's a loading mechanism:
# if True, the results are read from the pickled file written by a previous run
# instead of being recomputed (see the end of this cell)
load_feature_data = False

# feature names collected while inspecting the kmax models (filled by the
# inspect_* functions below, or restored from the pickled file)
potential_misses_grep = set()
potential_misses_kmax = set()

def jaccard(a, b):
    """Return the Jaccard index ``|a & b| / |a | b|`` of the sets ``a`` and ``b``.

    If both sets are empty, the index is undefined and ``np.nan`` is returned
    (instead of raising a ``ZeroDivisionError``).
    """
    union = set.union(a, b)
    if not union:
        return np.nan
    return len(set.intersection(a, b)) / len(union)

def add_features(descriptor, source, features):
    """Store the size of ``features`` under the key ``#<source>`` in ``descriptor``.

    Collections with at most one element are recorded as ``np.nan``.
    """
    count = len(features)
    descriptor[f'#{source}'] = np.nan if count <= 1 else count

def get_variables(variable_map):
    """Return the distinct values of ``variable_map``, or an empty set if there is at most one."""
    names = set(variable_map.values())
    return names if len(names) > 1 else set()

def inspect_architecture_features_for_model(extractor, revision, architecture, grep_features):
    """Classify the variables of one extracted model (extractor, revision, architecture).

    Reads up to three files below ``output_directory``:
    - the extractor's ``.features`` file (required),
    - the ``.unconstrained.features`` file (optional),
    - the ``.backbone.dimacs`` file (optional).

    Parameters:
        extractor: extractor name as used in the directory layout (e.g. 'kmax').
        revision: revision name as used in the file names.
        architecture: architecture name as used in the file names.
        grep_features: set of config names known for this revision.

    Returns:
        A tuple ``(descriptor, variables)``: ``descriptor`` is a dict with the
        identifying keys, several Jaccard indices (NaN unless the DIMACS file
        exists and yields feature variables), and ``#<kind>`` counts of every
        set computed here; ``variables`` is the union of all feature variables
        and the unconstrained feature variables.

    Side effect: for extractor 'kmax', names that are feature variables but not
    in ``grep_features`` (and do not contain '__CONFIG_') are added to the
    global ``potential_misses_grep``.
    """
    global potential_misses_grep, potential_misses_kmax
    
    # names listed by the extractor, with a leading CONFIG_ removed
    features_filename = f'{output_directory}/kconfig/{extractor}/linux/{revision}[{architecture}].features'
    with open(features_filename, 'r') as f:
        extractor_features = set([re.sub('^CONFIG_', '', f.strip()) for f in f.readlines()])
    
    # names from the unconstrained features file, if that file exists
    unconstrained_features_filename = f'{output_directory}/unconstrained-features/{extractor}/linux/{revision}[{architecture}].unconstrained.features'
    unconstrained_feature_variables = set()
    if os.path.isfile(unconstrained_features_filename):
        with open(unconstrained_features_filename, 'r') as f:
            unconstrained_feature_variables = set([re.sub('^CONFIG_', '', f.strip()) for f in f.readlines()])

    # all sets stay empty and all indices stay NaN if there is no usable DIMACS file
    dimacs_filename = f'{output_directory}/backbone-dimacs/{extractor}/linux/{revision}[{architecture}].backbone.dimacs'
    variables = set()
    natural_variables = set()
    all_feature_variables = set()
    core_feature_variables = set()
    dead_feature_variables = set()
    undead_feature_variables = set()
    feature_variables = set()
    features = set()
    # NOTE(review): 'ext_vs_dimacs' only occurs in this NaN default and is never computed below - confirm whether it is still needed
    jaccard_index = {'ext_vs_dimacs': np.nan, 'extractor_features_jaccard': np.nan, 'variables_jaccard': np.nan, 'natural_variables_jaccard': np.nan, \
                     'all_feature_variables_jaccard': np.nan, 'undead_feature_variables_jaccard': np.nan, 'feature_variables_jaccard': np.nan}
    if os.path.isfile(dimacs_filename):
        with open(dimacs_filename, 'r') as f:
            lines = f.readlines()
            # index -> name maps, each one a subset of the previous one
            variable_map = {}
            natural_variable_map = {}
            feature_variable_map = {}
            # first pass: collect the variable names from comment lines of the form 'c <index> <name>'
            # (the loop variable f shadows the file handle, whose content has already been read)
            for f in lines:
                if f.startswith('c '):
                    result = re.search('^c ([^ ]+) ([^ ]+)$', f)
                    if result:
                        index = int(result.group(1).strip())
                        name = result.group(2).strip()
                        variable_map[index] = name
                        # names containing "k!" are not counted as natural variables
                        # (presumably auxiliary variables introduced by a transformation - TODO confirm)
                        if "k!" not in name:
                            natural_variable_map[index] = name
                            if name != 'True' \
                                and name != '<unsupported>' \
                                and name != 'PREDICATE_Compare' \
                                and not name.startswith('__VISIBILITY__CONFIG_') \
                                and not name.endswith('_MODULE'):
                                feature_variable_map[index] = name
            variables = get_variables(variable_map)
            natural_variables = get_variables(natural_variable_map)
            all_feature_variables = get_variables(feature_variable_map)
            if len(all_feature_variables) > 0:
                # second pass: lines of the form '<literal> 0' (single-literal clauses)
                # mark a feature variable as core (positive literal) or dead (negative literal)
                for f in lines:
                    result = re.search('^([^ ]+) 0$', f)
                    if result:
                        literal = int(result.group(1))
                        index = abs(literal)
                        if index in feature_variable_map:
                            if literal > 0:
                                core_feature_variables.add(feature_variable_map[index])
                            else:
                                dead_feature_variables.add(feature_variable_map[index])
                # undead = everything that is not dead (core feature variables included)
                undead_feature_variables = all_feature_variables.difference(dead_feature_variables)
                feature_variables = undead_feature_variables.union(unconstrained_feature_variables)
                # features = feature variables that are also known configs
                features = feature_variables.intersection(grep_features)
                # similarity of every intermediate set to the final feature set
                jaccard_index = {'extractor_features_jaccard': jaccard(extractor_features, features), \
                                 'variables_jaccard': jaccard(variables, features), \
                                 'natural_variables_jaccard': jaccard(natural_variables, features), \
                                 'all_feature_variables_jaccard': jaccard(all_feature_variables, features), \
                                 'undead_feature_variables_jaccard': jaccard(undead_feature_variables, features), \
                                 'feature_variables_jaccard': jaccard(feature_variables, features)}
    descriptor = {'extractor': extractor, 'revision': revision, 'architecture': architecture} | jaccard_index
    # record the size of every set (add_features stores NaN for sets with at most one element)
    add_features(descriptor, 'grep_features', grep_features)
    add_features(descriptor, 'extractor_features', extractor_features)
    add_features(descriptor, 'unconstrained_feature_variables', unconstrained_feature_variables)
    add_features(descriptor, 'variables', variables)
    add_features(descriptor, 'natural_variables', natural_variables)
    add_features(descriptor, 'all_feature_variables', all_feature_variables)
    add_features(descriptor, 'core_feature_variables', core_feature_variables)
    add_features(descriptor, 'dead_feature_variables', dead_feature_variables)
    add_features(descriptor, 'undead_feature_variables', undead_feature_variables)
    add_features(descriptor, 'feature_variables', feature_variables)
    add_features(descriptor, 'features', features)
    add_features(descriptor, 'core_features', features.intersection(core_feature_variables))
    add_features(descriptor, 'unconstrained_variable_features', features.intersection(unconstrained_feature_variables))
    add_features(descriptor, 'constrained_variable_features', features.difference(core_feature_variables).difference(unconstrained_feature_variables))
    if extractor == 'kmax':
        # feature variables found by kmax that are not among the known configs
        potential_misses_grep.update([f for f in feature_variables.difference(features) if '__CONFIG_' not in f])
    return descriptor, all_feature_variables.union(unconstrained_feature_variables)

def inspect_architecture_features_for_revision(extractor, revision):
    """Inspect the models of all architectures that ``extractor`` produced for ``revision``.

    The architectures are taken from the bracketed part of the ``.features``
    file names. Returns the list of descriptors, one per architecture, in
    sorted order of the architecture names.

    Side effect: for extractor 'kmax', configs of this revision that occur in
    none of the inspected models are added to the global ``potential_misses_kmax``.
    """
    grep_features = set(df_configs[df_configs['revision'] == revision]['config'])
    feature_files = glob.glob(f'{output_directory}/kconfig/{extractor}/linux/{revision}[*.features')
    # raw string, so that the escaped brackets reach the regex engine unchanged
    architectures = sorted({re.search(r'\[(.*)\]', f).group(1) for f in feature_files})
    data = []
    total_feature_variables = set()
    for architecture in architectures:
        descriptor, feature_variables = inspect_architecture_features_for_model(extractor, revision, architecture, grep_features)
        data.append(descriptor)
        if extractor == 'kmax':
            total_feature_variables.update(feature_variables)
    if extractor == 'kmax':
        potential_misses_kmax.update(grep_features.difference(total_feature_variables))
    return data

def inspect_architecture_features(extractor):
    """Inspect all revisions for which ``extractor`` produced ``.features`` files.

    Prints the extractor name and every tenth revision as a progress indicator.
    Returns the concatenated descriptor lists of all revisions, in sorted order
    of the revision names.
    """
    print(f'{extractor} ', end='')
    feature_files = glob.glob(f'{output_directory}/kconfig/{extractor}/linux/*.features')
    # raw string, so that the escaped bracket reaches the regex engine unchanged
    revisions = sorted({re.search(r'linux/(.*)\[', f).group(1) for f in feature_files})
    data = []
    for i, revision in enumerate(revisions, start=1):
        if i % 10 == 0:
            print(revision + ' . ', end='')
        data += inspect_architecture_features_for_revision(extractor, revision)
    print()
    return data

if load_feature_data:
    # restore the results of a previous run
    # NOTE(review): pickle.load executes arbitrary code from the file - only load a file written by this notebook
    with open(f'{output_directory}/linux-features.dat', 'rb') as f:
        [features_by_kind_per_architecture, potential_misses_grep, potential_misses_kmax] = pickle.load(f)
else:
    # inspect the models of both extractors and store the results for later runs
    features_by_kind_per_architecture = inspect_architecture_features('kconfigreader')
    features_by_kind_per_architecture += inspect_architecture_features('kmax')
    features_by_kind_per_architecture = pd.DataFrame(features_by_kind_per_architecture)
    with open(f'{output_directory}/linux-features.dat', 'wb') as f:
        pickle.dump([features_by_kind_per_architecture, potential_misses_grep, potential_misses_kmax], f)

# switch to the display names of the extractors (the data is pickled before this replacement)
replace_values(features_by_kind_per_architecture)
# attach architecture and kconfig information; both merges join on all columns the frames have in common
df_features = pd.merge(df_architectures, features_by_kind_per_architecture).sort_values(by='committer_date')
df_features = pd.merge(df_kconfig, df_features).sort_values(by='committer_date')

def compare_with_grep(message, list):
    """Print how many names ``list`` contains, followed by their Kconfig files and types.

    Only names that occur in ``df_configs`` and in ``df_config_types`` show up
    in the printed table.
    """
    print(f'{message}: ' + str(len(list)))
    candidates = pd.DataFrame(list, columns=['config'])
    located = pd.merge(df_configs[['config','kconfig-file']], candidates, how='inner').drop_duplicates()
    typed = located.merge(df_config_types[['config', 'type']]).drop_duplicates()
    print(typed)

# these are the features NOT found by grep, but found by kmax (this allows us to check whether the grep regex matches too much)
# the only matches are mistakes in kconfig files: IA64_SGI_UV (which has a trailing `) and SND_SOC_UX500_MACH_MOP500 (which has a leading +)
# NOTE(review): the recorded output of this cell reports 3 potential misses and lists ARCH - confirm that the statement above is still accurate
compare_with_grep('#potential misses (grep)', potential_misses_grep)
print()

# these are the features found by grep, but NOT found by kmax, either constrained or unconstrained (this allows us to check whether kmax matches enough)
# as there are some extraction failures for kmax, we expect some misses
compare_with_grep('#potential misses (kmax)', potential_misses_kmax)

kconfigreader v2.5.54 . 
kmax v2.5.54 . 
#potential misses (grep): 3
  config  kconfig-file    type
0   ARCH  init/Kconfig  string

#potential misses (kmax): 84
                config                   kconfig-file      type
0         ACER_PICA_61              arch/mips/Kconfig      bool
136       ACER_PICA_61       arch/mips/Kconfig-shared      bool
272       ACER_PICA_61         arch/mips/jazz/Kconfig      bool
408        ALGOR_P4032              arch/mips/Kconfig      bool
437         BAGET_MIPS              arch/mips/Kconfig      bool
...                ...                            ...       ...
14233  WATCHDOG_CP1XXX  drivers/char/watchdog/Kconfig  tristate
14369  WATCHDOG_CP1XXX       drivers/watchdog/Kconfig  tristate
14505     WATCHDOG_RIO           arch/sparc64/Kconfig  tristate
14641     WATCHDOG_RIO  drivers/char/watchdog/Kconfig  tristate
14777     WATCHDOG_RIO       drivers/watchdog/Kconfig  tristate

[165 rows x 3 columns]


In [113]:
# feature types

# share of every config type among all (config, type) rows; rows without a type count as unknown
df_config_types_summary = df_configs.merge(df_config_types, how='outer').drop(columns=['system', 'revision', 'kconfig-file'])
# the loop variable is not called `type`, which would rebind the builtin for the rest of the notebook
for config_type in ['bool', 'hex', 'int', 'string', 'tristate']:
    ratio = len(df_config_types_summary[df_config_types_summary['type'] == config_type]) / len(df_config_types_summary)
    print(f'{config_type}: {round(ratio * 100, 3):,}%')
ratio = len(df_config_types_summary[df_config_types_summary['type'].isna()]) / len(df_config_types_summary)
print(f'unknown: {round(ratio * 100, 3):,}%')

bool: 50.057%
hex: 1.143%
int: 4.234%
string: 0.548%
tristate: 42.419%
unknown: 1.599%


In [26]:
# source lines of code

def sloc(trendline=None):
    """Return a scatter plot of the source lines of code over time, optionally with a trendline."""
    axis_titles = {
        'source_lines_of_code': 'Number of Source Lines of Code',
        'committer_date': 'Year',
    }
    return px.scatter(
        df_kconfig,
        x='committer_date',
        y='source_lines_of_code',
        trendline=trendline,
        labels=axis_titles,
        hover_data=['revision'],
    )

# the figure with the OLS trendline is only used to print the growth estimates
fig = sloc('ols')
estimate_trend(fig, 'SLOC')

# the exported figure is drawn without the trendline
fig = sloc()
style_scatter(fig)
show(fig, 'sloc', width=500, height=default_height)

\hspace*{1mm} SLOC & 2,670 & 81,282 & 975,399 & 2,697,428 & 22,205,415 \\ 


In [19]:
# processor architectures

# start with an empty figure (no rows) that only carries the axis labels
fig = px.line(
    pd.DataFrame(columns=df_architectures.columns),
    x='committer_date',
    y='architecture',
    labels=committer_date_labels({'architecture': 'Processor Architecture'}),
    hover_data=['revision']
)

# one trace of small black markers per architecture, added in reverse order of first appearance
for architecture in df_architectures['architecture'].unique()[::-1]:
    df = df_architectures[df_architectures['architecture'] == architecture]
    # todo: min, max, sum?
    # solve = df_solve[~df_solve['model-count-log10'].isna()] \
    #     .groupby(['committer_date', 'architecture']) \
    #     .agg({'backbone.dimacs-analyzer-time': 'min'}) \
    #     .reset_index()
    # df = pd.merge(df, solve[['committer_date', 'architecture', 'backbone.dimacs-analyzer-time']])
    fig.add_trace(go.Scatter(
        x=df['committer_date'],
        y=df['architecture'],
        mode='markers',
        line_color='rgba(0,0,0,1)',
        marker_size=2, # todo: (df['backbone.dimacs-analyzer-time'] / 1000000000).apply(lambda s: max(1, np.log10(s))) * 3,
        showlegend=False
    ))

# label every architecture to the left of its earliest date
# (min() is taken per column, so the other columns do not necessarily belong to the same row)
df_architectures_first_version = df_architectures.groupby('architecture').min().reset_index()
for row in range(len(df_architectures_first_version)):
    fig.add_annotation(
        x=df_architectures_first_version.at[row, 'committer_date'],
        y=df_architectures_first_version.at[row, 'architecture'],
        text=df_architectures_first_version.at[row, 'architecture'],
        showarrow=False, yshift=0, xshift=-5, font_size=10, xanchor='right', font_color='black'
    )

def add_information(fig, criterion, name, symbol, color):
    """Add a marker trace to ``fig`` for every (date, architecture) pair that occurs in ``criterion``."""
    affected = pd.merge(df_architectures, criterion[['committer_date', 'architecture']])
    marker_style = dict(
        marker_size=5,
        marker_color=color,
        marker_line_color=color,
        marker_symbol=symbol,
        marker_line_width=1,
    )
    overlay = go.Scatter(
        x=affected['committer_date'],
        y=affected['architecture'],
        name=name,
        mode='markers',
        **marker_style
    )
    fig.add_trace(overlay)
# overlay the models for which a step did not yield a result (missing values in the respective column)
# NOTE(review): df_features is created in the feature inspection cell, which therefore has to run before this one
add_information(fig, df_features[df_features['#extractor_features'].isna()], 'Extraction Failure', 'square-open', '#1f77b4')
add_information(fig, df_features[df_features['extractor_features_jaccard'].isna() & (df_features['extractor'] == 'KConfigReader')], 'Unsatisfiable (KConfigReader)', 'line-ew', '#2ca02c')
add_information(fig, df_features[df_features['extractor_features_jaccard'].isna() & (df_features['extractor'] == 'KClause')], 'Unsatisfiable (KClause)', 'line-ns', '#2ca02c')
add_information(fig, df_solve[df_solve['model-count-log10'].isna() & (df_solve['extractor'] == 'KConfigReader')], '#SAT Timeout (KConfigReader)', 'line-nw', '#ff7f0e')
add_information(fig, df_solve[df_solve['model-count-log10'].isna() & (df_solve['extractor'] == 'KClause')], '#SAT Timeout (KClause)', 'line-ne', '#ff7f0e')
#todo: do not show model count if extraction failed
#todo: maybe add a STATE column which is 'extracted', 'unsat', 'un#sat' etc. and map it onto a symbol
#todo: distinguish d4 and sharpsat
#todo: better symbols?

# the architecture names are drawn as annotations, so the y tick labels are hidden
committer_date_x_axis(fig)
fig.update_yaxes(showticklabels=False)
style_scatter(fig, marker_size=None, legend_position='bottomleft')
show(fig, 'architectures')

In [131]:
# Jaccard similarity of configs and features
# configs = extractor features
# features = dimacs features

def jaccard_box(column):
    """Return a box plot of the Jaccard index stored in ``column``, per year and extractor."""
    return px.box(
        df_features,
        x='year',
        y=column,
        color='extractor',
        labels={'extractor_features_jaccard': 'Jaccard Similarity of Configs and Features', 'extractor': 'Extractor', 'year': 'Year'}
    )

# figure intended for export; its styling and export are currently disabled
fig = jaccard_box('extractor_features_jaccard')
#percentage_y_axis(fig)
#style_box(fig, legend_position='bottomright')
#plot_failures(fig, df_features, 'year', 'extractor_features_jaccard', 0.99, align='top')
#show(fig, 'configs-vs-features', height=default_height, width=500)

# one exploratory box plot per computed Jaccard index (shown, but not exported)
for jaccard_column in [
    'ext_vs_dimacs',
    'extractor_features_jaccard',
    'variables_jaccard',
    'natural_variables_jaccard',
    'feature_variables_jaccard',
    'undead_feature_variables_jaccard',
]:
    show(jaccard_box(jaccard_column))

In [69]:
# share of dead features

# box plot of the share of dead features per year and extractor
# NOTE(review): the feature inspection cell visible in this notebook does not create the columns
# '#dead_dimacs_features' and '#dimacs_features' (it creates e.g. '#dead_feature_variables' and
# '#all_feature_variables') - confirm the intended columns before re-running this cell
fig = px.box(
    df_features.assign(share_of_dead_features=df_features['#dead_dimacs_features'] / df_features['#dimacs_features']),
    x='year',
    y='share_of_dead_features',
    color='extractor',
    labels={'share_of_dead_features': 'Share of Dead Features', 'extractor': 'Extractor', 'year': 'Year'}
)
percentage_y_axis(fig)
style_box(fig)
# annotate the number of models without a value below the boxes
plot_failures(fig, df_features, 'year', '#dead_dimacs_features', -0.05)
show(fig, 'share-of-dead-features', height=default_height, width=500)

In [70]:
# features

def estimate_features(df, y, name):
    """Print the trend estimates of column ``y`` over time as LaTeX rows, one row per extractor."""
    fig = px.scatter(df, x='committer_date', y=y, trendline='ols', color='extractor')
    estimate_group(name)
    # the first trendline is printed as \kcr, the second one as \kcl
    for idx, macro in enumerate(('\\kcr', '\\kcl')):
        estimate_trend(fig, macro, idx)

def plot_features(source):
    """Print trend estimates and draw all feature plots for one kind of feature count.

    ``source`` selects the columns of ``df_features`` that are evaluated:
    ``#{source}_features``, ``#{source}_total_features``,
    ``#{source}_common_features``, and ``#{source}_owned_features``.
    NOTE(review): the feature inspection cell visible in this notebook does not
    create columns with these names - confirm where they come from.

    Draws (and exports via ``show``) a stacked bar chart of the features by
    kind, a scatter plot of the total number of features, a scatter plot of
    the features per architecture, and a box plot of the features per revision.
    """
    features_column = f'#{source}_features'
    total_column = f'#{source}_total_features'
    common_column = f'#{source}_common_features'
    owned_column = f'#{source}_owned_features'
    shared_column = f'#{source}_shared_features'

    # per extractor and revision: shared = total - common - owned
    features_by_kind = df_features.groupby(['extractor', 'revision']) \
        .agg({total_column: 'min', common_column: 'min', owned_column: 'sum'}).reset_index()
    features_by_kind[shared_column] = features_by_kind[total_column] \
        - features_by_kind[common_column] \
        - features_by_kind[owned_column]
    features_by_kind = pd.merge(df_kconfig[['committer_date', 'revision']].drop_duplicates(), features_by_kind)
    features_by_kind = features_by_kind.sort_values(by='committer_date')

    # the total is aggregated on its own column; previously a malformed column
    # name was passed as the first positional argument of min()
    estimate_features(
        df_features.groupby(['extractor', 'revision', 'committer_date'])[total_column].min().reset_index(),
        total_column, 'total number')
    estimate_features(df_features, features_column, 'any architecture')
    estimate_features(df_features[df_features['architecture'] == 'arm'], features_column, 'arm architecture')
    estimate_features(df_features[df_features['architecture'].isin(['i386', 'x86'])], features_column, 'x86 architecture')

    fig = px.bar(
        features_by_kind,
        x='revision',
        y=[common_column, shared_column, owned_column],
        labels=revision_labels({'value': 'Number of Features', 'variable': 'Feature Kind', 'extractor': 'Extractor'}),
        facet_col='extractor'
    )
    revision_x_axis(fig)
    style_legend(fig)
    show(fig, f'features_by_kind_{source}', height=default_height, margin=dict(l=0, r=0, t=20, b=0))

    fig = px.scatter(
        features_by_kind,
        x='committer_date',
        y=total_column,
        color='extractor',
        labels={total_column: 'Total Number of Features', 'extractor': 'Extractor', 'committer_date': 'Year'}
    )
    style_scatter(fig)
    show(fig, f'total_features_{source}', height=default_height, width=500)

    fig = px.scatter(
        df_features,
        x='committer_date',
        y=features_column,
        color='architecture',
        labels={features_column: 'Number of Features', 'extractor': 'Extractor', 'committer_date': 'Year'},
        hover_data=['revision', 'architecture'],
        facet_col='extractor'
    )
    style_scatter(fig, legend_position=None)
    show(fig, f'features_by_architecture_{source}', height=default_height, margin=dict(l=0, r=0, t=20, b=0))

    fig = px.box(
        df_features,
        x='revision',
        y=features_column,
        color='extractor',
        labels=revision_labels({features_column: 'Number of Features', 'extractor': 'Extractor'}),
        boxmode='overlay'
    )
    revision_x_axis(fig)
    style_box(fig)
    show(fig, f'features_{source}', height=default_height)

# estimates and plots based on the '#extractor_*' columns
print('configs')
plot_features('extractor')
# estimates and plots based on the '#dimacs_no_dead_*' columns
print('undead features')
plot_features('dimacs_no_dead')

configs
\multicolumn{6}{l}{total number} \\
\hspace*{1mm} \kcr & 2 & 75 & 906 & 3,723 & 21,833 \\ 
\hspace*{1mm} \kcl & 4 & 109 & 1,311 & 4,513 & 30,723 \\ 
\multicolumn{6}{l}{any architecture} \\
\hspace*{1mm} \kcr & 3 & 99 & 1,183 & 2,445 & 26,104 \\ 
\hspace*{1mm} \kcl & 2 & 64 & 771 & 1,868 & 17,291 \\ 
\multicolumn{6}{l}{arm architecture} \\
\hspace*{1mm} \kcr & 3 & 103 & 1,231 & 3,078 & 27,689 \\ 
\hspace*{1mm} \kcl & 2 & 68 & 822 & 2,278 & 18,711 \\ 
\multicolumn{6}{l}{x86 architecture} \\
\hspace*{1mm} \kcr & 3 & 98 & 1,173 & 3,095 & 26,546 \\ 
\hspace*{1mm} \kcl & 2 & 64 & 766 & 2,290 & 17,610 \\ 


undead features
\multicolumn{6}{l}{total number} \\
\hspace*{1mm} \kcr & 2 & 72 & 860 & 3,406 & 20,607 \\ 
\hspace*{1mm} \kcl & 4 & 116 & 1,398 & 5,533 & 33,488 \\ 
\multicolumn{6}{l}{any architecture} \\
\hspace*{1mm} \kcr & 2 & 74 & 887 & 1,601 & 19,338 \\ 
\hspace*{1mm} \kcl & 1 & 43 & 511 & 899 & 11,111 \\ 
\multicolumn{6}{l}{arm architecture} \\
\hspace*{1mm} \kcr & 3 & 93 & 1,112 & 2,770 & 25,002 \\ 
\hspace*{1mm} \kcl & 2 & 56 & 667 & 1,663 & 15,009 \\ 
\multicolumn{6}{l}{x86 architecture} \\
\hspace*{1mm} \kcr & 3 & 85 & 1,015 & 2,738 & 23,047 \\ 
\hspace*{1mm} \kcl & 2 & 50 & 596 & 1,565 & 13,490 \\ 


In [71]:
# model count

def is_accurate(series):
    """Return whether ``series`` contains at most one distinct value, ignoring ``pd.NA``."""
    distinct_values = set(series)
    distinct_values.discard(pd.NA)
    return len(distinct_values) < 2

# a model count is inaccurate if the rows for the same model report different counts
df_solve_inaccuracies = df_solve.groupby(['extractor', 'revision', 'architecture']).agg({'model-count': is_accurate})
df_solve_inaccuracies = df_solve_inaccuracies.dropna()
print('number of inaccurate model counts: ' + str(len(df_solve_inaccuracies[~df_solve_inaccuracies['model-count']])))

def solver_successes(solver):
    """Return the models counted by ``solver`` as a set of 'extractor,revision,architecture' strings."""
    has_count = df_solve['model-count'].notna()
    by_solver = df_solve['backbone.dimacs-analyzer'] == solver
    counted = df_solve[has_count & by_solver]
    return set(counted['extractor'] + ',' + counted['revision'] + ',' + counted['architecture'])

# models for which each of the two solvers returned a model count
d4_successes = solver_successes('model-counting-competition-2022/d4.sh')
sharpsat_successes = solver_successes('model-counting-competition-2022/SharpSAT-td+Arjun/SharpSAT-td+Arjun.sh')

# compare which models were counted by only one of the solvers and which by both
print('number of model counts only found by d4: ' + str(len(d4_successes.difference(sharpsat_successes))))
print('number of model counts only found by sharpsat: ' + str(len(sharpsat_successes.difference(d4_successes))))
print('number of model counts found by both: ' + str(len(d4_successes.intersection(sharpsat_successes))))

def big_sum(series):
    """Return the number of decimal digits of the sum of all counts in ``series``.

    Missing and empty values are skipped; the remaining values are converted
    with ``int``. Returns ``None`` if the sum is not positive.
    """
    total = 0
    for value in series:
        if pd.isna(value) or not value:
            continue
        total += int(value)
    if total > 0:
        return len(str(total))
    return None

# only the years up to 2013 are considered from here on
df_solve_slice = df_solve[df_solve['year'] <= 2013]
# failures = number of rows without a model count per extractor and revision
df_solve_group = df_solve_slice.groupby(['extractor', 'revision'], dropna=False)
df_solve_failures = (df_solve_group['model-count-log10'].size() - df_solve_group['model-count-log10'].count()).reset_index()
# 'is-upper-bound' is True if there was no failure for the extractor and revision
df_solve_failures['is-upper-bound'] = df_solve_failures['model-count-log10'] == 0
df_solve_failures = df_solve_failures.rename(columns={'model-count-log10': 'failures'})
# total per extractor and date: 'model-count' becomes the number of digits of the summed counts (see big_sum)
df_solve_total = pd.merge(df_solve_slice, df_solve_failures)
df_solve_total = df_solve_total.groupby(['extractor', 'committer_date']).agg({'model-count': big_sum, 'is-upper-bound': 'min', 'failures': 'min'}).reset_index()

def estimate_configurations(df, y, name, swapped=False):
    """Print the trend estimates of column ``y`` over time as LaTeX rows, one row per extractor.

    With ``swapped``, the second trendline is printed as \\kcr and the first
    one as \\kcl instead of the other way around.
    """
    fig = px.scatter(df, x='committer_date', y=y, trendline='ols', color='extractor')
    estimate_group(name)
    kcr_idx, kcl_idx = (1, 0) if swapped else (0, 1)
    estimate_trend(fig, '\\kcr', kcr_idx)
    estimate_trend(fig, '\\kcl', kcl_idx)

# trend estimates: totals (only dates without failures), all models, and selected architectures
estimate_configurations(df_solve_total[df_solve_total['is-upper-bound'] == True], 'model-count', 'total number', True)
estimate_configurations(df_solve, 'model-count-log10', 'any architecture')
estimate_configurations(df_solve[df_solve['architecture'] == 'arm'],'model-count-log10', 'arm architecture')
estimate_configurations(df_solve[(df_solve['architecture'] == 'i386') | (df_solve['architecture'] == 'x86')], 'model-count-log10', 'x86 architecture')

# model count per model over time, colored by architecture, one facet per extractor
fig = px.scatter(
    df_solve_slice,
    x='committer_date',
    y='model-count-log10',
    color='architecture',
    labels={'model-count-log10': 'Number of Configurations (log<sub>10</sub>)', 'committer_date': 'Year', 'extractor': 'Extractor'},
    hover_data=['revision', 'architecture'],
    facet_col='extractor'
)
log10_y_axis(fig)
style_scatter(fig, legend_position=None)
# plot_failures(fig, df_solve_slice[df_solve_slice['extractor'] == 'kconfigreader'], 'committer_date', 'model-count-log10', 0, align='bottom', xref='x', font_size=8)
# plot_failures(fig, df_solve_slice[df_solve_slice['extractor'] == 'kmax'], 'committer_date', 'model-count-log10', 0, align='bottom', xref='x2', font_size=8)
show(fig, 'model-count', height=default_height, margin=dict(l=0, r=0, t=20, b=0))

# distribution of the model counts per revision (shown, but not exported)
fig = px.box(
    df_solve_slice.sort_values(by='committer_date'),
    x='revision',
    y='model-count-log10',
    color='extractor',
    labels=revision_labels({'model-count-log10': 'Number of Configurations (log<sub>10</sub>)', 'extractor': 'Extractor'}),
    hover_data=['revision', 'architecture']
)
revision_x_axis(fig)
log10_y_axis(fig)
style_box(fig)
# plot_failures(fig, df_solve_slice, 'revision', 'model-count-log10', 0, align='bottom')
show(fig, height=default_height)

# total model count per extractor over time; the symbol shows whether there were failures
# NOTE(review): replace() is applied to the whole frame, not only to 'is-upper-bound' - confirm that no other column is affected
fig = px.scatter(
    df_solve_total.replace(True, 'Exact').replace(False, 'Lower Bound'),
    x='committer_date',
    y='model-count',
    color='extractor',
    symbol='is-upper-bound',
    symbol_sequence=['circle', 'triangle-up-open'],
    # size=df_solve_total['failures'].apply(lambda f: f if f > 0 else 1),
    labels=revision_labels({'model-count': 'Total Number of Configurations (log<sub>10</sub>)', 'extractor': 'Extractor', 'is-upper-bound': 'Kind of Bound', 'committer_date': 'Year'})
)
log10_y_axis(fig)
style_scatter(fig, legend_position='topright') #, marker_size=None)
fig.update_traces(marker_line_color='rgba(0,0,0,0)')
show(fig, 'model-count-total', height=default_height, width=500)

number of inaccurate model counts: 0
number of model counts only found by d4: 18
number of model counts only found by sharpsat: 205
number of model counts found by both: 2043
\multicolumn{6}{l}{total number} \\
\hspace*{1mm} \kcr & 0 & 7 & 83 & 683 & 2,352 \\ 
\hspace*{1mm} \kcl & 0 & 5 & 58 & 444 & 1,598 \\ 
\multicolumn{6}{l}{any architecture} \\
\hspace*{1mm} \kcr & 0 & 2 & 19 & 512 & 886 \\ 
\hspace*{1mm} \kcl & 0 & 2 & 26 & 343 & 866 \\ 
\multicolumn{6}{l}{arm architecture} \\
\hspace*{1mm} \kcr & 0 & 6 & 68 & 632 & 1,988 \\ 
\hspace*{1mm} \kcl & 0 & 4 & 51 & 410 & 1,426 \\ 
\multicolumn{6}{l}{x86 architecture} \\
\hspace*{1mm} \kcr & 0 & 7 & 87 & 684 & 2,432 \\ 
\hspace*{1mm} \kcl & 0 & 5 & 62 & 444 & 1,692 \\ 


In [72]:
# model count time

# only rows with a model count are considered
# (this rebinds df_solve_slice, which the previous cell uses for the rows up to 2013)
df_solve_slice = df_solve[~df_solve['model-count-log10'].isna()]
fig = px.scatter(
    df_solve_slice,
    x=df_solve_slice['committer_date'],
    # presumably the analyzer time is measured in nanoseconds, so that this yields seconds - TODO confirm
    y=df_solve_slice['backbone.dimacs-analyzer-time'] / 1000000000,
    color='architecture',
    labels={'extractor': 'Extractor', 'y': 'Time for Counting (log<sub>10</sub> s)', 'committer_date': 'Year'},
    facet_col='extractor',
    #facet_row='backbone.dimacs-analyzer',
    log_y=True
)
style_scatter(fig, legend_position=None)
show(fig, 'model-count-time', height=default_height, margin=dict(l=0, r=0, t=20, b=0))

In [29]:
# number of distinct config rows per revision, plotted over the revision's date
df_configs_sum = df_configs.drop_duplicates().assign(**{'#configs': 1})
# the aggregation is named as a string; passing the builtin sum is deprecated in recent pandas versions
df_configs_sum = df_configs_sum.groupby(['revision']).agg({'#configs': 'sum'}).reset_index()
df_configs_sum = df_configs_sum.merge(df_architectures[['revision', 'committer_date']].drop_duplicates())

px.scatter(df_configs_sum, x='committer_date', y='#configs')