In [1]:
%%capture
pip install plotly pandas statsmodels kaleido scipy nbformat jinja2

In [2]:
# read CSV data

import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import numpy as np
import os.path
import pickle
import scipy
from statistics import mean, stdev
from math import sqrt, log10
from packaging.version import Version

output_directory = '../output-linux-2024-11-08'
figures_directory = '../../paper-tosem-2024-linux/tosem-2024/figures'
default_height = 270

pio.templates['colorblind'] = go.layout.Template(layout_colorway=['#648FFF', '#FE6100', '#785EF0', '#DC267F', '#FFB000'])
pio.templates.default = 'plotly_white+colorblind'

def read_dataframe(stage, dtype={}, usecols=None, file=None):
    if not file:
        file = 'output'
    df = pd.read_csv(f'{output_directory}/{stage}/{file}.csv', dtype=dtype, usecols=usecols)
    if 'committer_date_unix' in df:
        df['committer_date'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))
    return df

def replace_values(df):
    df.replace('kconfigreader', 'KConfigReader', inplace=True)
    df.replace('kmax', 'KClause', inplace=True)

def big_log10(str):
    return log10(int(str)) if not pd.isna(str) and str != '' else pd.NA

def process_model_count(df_solve):
    df_solve['model-count'] = df_solve['model-count'].replace('1', '')
    df_solve['model-count-log10'] = df_solve['model-count'].fillna('').apply(big_log10).replace(0, np.nan)
    df_solve['year'] = df_solve['committer_date'].apply(lambda d: int(d.year))

def peek_dataframe(df, column, message, type='str', filter=['revision', 'architecture', 'extractor']):
    success = df[~df[column].str.contains('NA') if type == 'str' else ~df[column].isna()][filter]
    failure = df[df[column].str.contains('NA') if type == 'str' else df[column].isna()][filter]
    print(f'{message}: {len(success)} successes, {len(failure)} failures')

df_architectures = read_dataframe(f'read-linux-architectures')
df_architectures = df_architectures.sort_values(by='committer_date')
df_architectures['year'] = df_architectures['committer_date'].apply(lambda d: int(d.year))

df_configs = read_dataframe(f'read-linux-configs')
df_configs = df_configs[~df_configs['kconfig-file'].str.contains('/um/')]

df_config_types = read_dataframe(f'read-linux-configs', file='output.types')
df_config_types = df_config_types[~df_config_types['kconfig-file'].str.contains('/um/')]
df_config_types = df_config_types.merge(df_architectures[['revision', 'committer_date']].drop_duplicates())

df_kconfig = read_dataframe('kconfig')
df_kconfig['year'] = df_kconfig['committer_date'].apply(lambda d: int(d.year))

df_uvl = read_dataframe('model_to_uvl_featureide')
df_smt = read_dataframe('model_to_smt_z3')
df_dimacs = read_dataframe('dimacs')
df_backbone_dimacs = read_dataframe('backbone-dimacs')

df_solve = read_dataframe('solve_model-count', {'model-count': 'string'})
process_model_count(df_solve)

if os.path.isfile(f'{output_directory}/model-count-with-6h-timeout.csv'):
    df_solve_6h = pd.read_csv(f'{output_directory}/model-count-with-6h-timeout.csv', dtype={'model-count': 'string'})
    df_solve_6h = df_backbone_dimacs.merge(df_solve_6h)
    process_model_count(df_solve_6h)
    df_solve = pd.merge(df_solve, df_solve_6h[['revision','architecture', 'extractor', 'backbone.dimacs-analyzer']], indicator=True, how='outer') \
        .query('_merge=="left_only"') \
        .drop('_merge', axis=1)
    df_solve = pd.concat([df_solve, df_solve_6h])
else:
    df_solve_6h = None

for df in [df_kconfig, df_uvl, df_smt, df_dimacs, df_backbone_dimacs, df_solve]:
    replace_values(df)

peek_dataframe(df_kconfig, 'model-file', 'model extraction')
peek_dataframe(df_uvl, 'uvl-file', 'UVL transformation', 'na', ['model-file'])
peek_dataframe(df_smt, 'smt-file', 'SMT transformation', 'na', ['model-file'])
peek_dataframe(df_dimacs, 'dimacs-file', 'CNF transformation')
peek_dataframe(df_backbone_dimacs, 'backbone.dimacs-file', 'backbone transformation', 'na')
peek_dataframe(df_solve, 'model-count-log10', 'model counting', 'na')

model extraction: 6572 successes, 46 failures
UVL transformation: 6572 successes, 0 failures
SMT transformation: 6572 successes, 0 failures
CNF transformation: 6572 successes, 0 failures
backbone transformation: 6496 successes, 76 failures
model counting: 4260 successes, 8732 failures


In [3]:
# helper functions for drawing plots

def estimate_group(group):
    print('\\hspace{2mm} ' + group + ' \\\\')

def estimate_trend(fig, color=None, color_value=None, xs=[], key=lambda x: x.timestamp()):
    results = px.get_trendline_results(fig)
    if color is not None and color_value is not None:
        idx = [i for i, r in enumerate(results.iloc) if r[color] == color_value][0]
    else:
        idx = 0
    intercept = results.iloc[idx]['px_fit_results'].params[0]
    slope = results.iloc[idx]['px_fit_results'].params[1]
    daily = slope * pd.to_timedelta(1, unit='D').total_seconds()
    weekly = slope * pd.to_timedelta(7, unit='D').total_seconds()
    monthly = slope * pd.to_timedelta(1, unit='D').total_seconds() * 30.437
    yearly = slope * pd.to_timedelta(1, unit='D').total_seconds() * 365.25
    return daily, weekly, monthly, yearly, [intercept + slope * key(x) for x in xs]

def committer_date_x_axis(fig, df=df_kconfig, append_revision=True, step=1):
    axis = df_kconfig[['committer_date', 'revision']].drop_duplicates()
    axis['year'] = axis['committer_date'].apply(lambda d: str(d.year))
    axis = axis.sort_values(by='committer_date').groupby('year').nth(0).reset_index()
    fig.update_xaxes(
        ticktext=axis['year'].str.cat('<br><sup>' + axis['revision'].str[1:] + '</sup>')[1::step] if append_revision else axis['year'][::step],
        tickvals=axis['year'][1::step]
    )

def revision_x_axis(fig, df=df_kconfig):
    axis = df_kconfig[['committer_date', 'revision']].drop_duplicates()
    axis['year'] = axis['committer_date'].apply(lambda d: str(d.year))
    axis = axis.sort_values(by='committer_date').groupby('year').nth(0).reset_index()
    fig.update_xaxes(
        ticktext=axis['year'],
        tickvals=axis['revision']
    )

def log10_y_axis(fig):
    fig.update_yaxes(tickprefix = "10<sup>", ticksuffix = "</sup>")

def percentage_y_axis(fig):
    fig.layout.yaxis.tickformat = ',.0%'

def format_percentage(value):
    return str(round(value * 100, 2)) + '%'

def committer_date_labels(dict={}):
    return {'committer_date': 'Year<br><sup>First Release in Year</sup>'} | dict

def revision_labels(dict={}):
    return {'revision': 'Year'} | dict

def style_legend(fig, position='topleft', xshift=0, yshift=0):
    if position == 'topleft':
        fig.update_layout(legend=dict(yanchor='top', y=0.98 + yshift, xanchor='left', x=0.01 + xshift))
    elif position == 'topright':
        fig.update_layout(legend=dict(yanchor='top', y=0.98 + yshift, xanchor='right', x=0.98 + xshift))
    elif position == 'bottomright':
        fig.update_layout(legend=dict(yanchor='bottom', y=0.01 + yshift, xanchor='right', x=0.98 + xshift))
    elif position == 'bottomleft':
        fig.update_layout(legend=dict(yanchor='bottom', y=0.01 + yshift, xanchor='left', x=0.01 + xshift))
    else:
        fig.update_layout(showlegend=False)

def style_box(fig, legend_position='topleft', xshift=0, yshift=0):
    fig.update_traces(fillcolor='rgba(0,0,0,0)')
    fig.update_traces(line_width=1)
    fig.update_traces(marker_size=2)
    fig.update_layout(font_family="Linux Biolinum")
    style_legend(fig, legend_position, xshift, yshift)

def style_scatter(fig, marker_size=4, legend_position='topleft', xshift=0, yshift=0):
    if marker_size:
        fig.update_traces(marker_size=marker_size)
    style_legend(fig, legend_position, xshift, yshift)
    fig.update_layout(font_family="Linux Biolinum")

def plot_failures(fig, df, x, y, y_value, align='bottom', xref='x', font_size=10, textangle=270):
    group = df.groupby(x, dropna=False)
    failures = (group[y].size() - group[y].count()).reset_index().rename(columns={y: f'{y}_failures'})
    attempts = group[y].size().reset_index().rename(columns={y: f'{y}_attempts'})
    failures = pd.merge(failures, attempts)
    failures[f'{y}_text'] = failures[f'{y}_failures'].astype(str) + ' (' + (failures[f'{y}_failures'] / failures[f'{y}_attempts']).apply(lambda v: "{0:.1f}%".format(v * 100)) + ')'
    for row in range(len(failures)):
        text = failures.at[row, f'{y}_text']
        text = "" if failures.at[row, f'{y}_failures'] == 0 else text
        fig.add_annotation(
            x=failures.at[row, x],
            y=y_value,
            text=text,
            showarrow=False,
            font_size=font_size,
            textangle=textangle,
            align='left' if align == 'bottom' else 'right',
            yanchor='bottom' if align == 'bottom' else 'top',
            yshift=5 if align == 'bottom' else -5,
            font_color='gray',
            xref=xref
        )

def cohens_d(d1, d2):
    # uses pooled standard deviation
    n1, n2 = len(d1), len(d2)
    s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
    u1, u2 = np.mean(d1), np.mean(d2)
    return (u1 - u2) / s

def wilcoxon_test(df, column_a, column_b):
    # if the same values are returned for many inputs, refer to https://stats.stackexchange.com/q/232927
    a = df[column_a][~df[column_a].isna()]
    b = df[column_b][~df[column_b].isna()]
    d = a - b
    results = scipy.stats.wilcoxon(d, method='approx')
    p = results.pvalue
    # adapted from https://stats.stackexchange.com/q/133077
    r = np.abs(results.zstatistic / np.sqrt(len(d) * 2))
    return p, r

def style_p_values(fig, brackets, scale=0, _format=dict(interline=0.07, text_height=1.07, color='gray')):
    # adapted from https://stackoverflow.com/q/67505252
    for entry in brackets:
        first_column, second_column, y, results = entry
        y_range = [1.01+y*_format['interline'], 1.02+y*_format['interline']]
        p, r = results
        if p >= 0.05:
            symbol = 'ns'
        elif p >= 0.01: 
            symbol = '*'
        elif p >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        first_column = first_column - scale
        second_column = second_column + scale
        fig.add_shape(type="line",
            xref="x", yref="y domain",
            x0=first_column, y0=y_range[0],
            x1=first_column, y1=y_range[1],
            line=dict(color=_format['color'], width=2,)
        )
        fig.add_shape(type="line",
            xref="x", yref="y domain",
            x0=first_column, y0=y_range[1], 
            x1=second_column, y1=y_range[1],
            line=dict(color=_format['color'], width=2,)
        )
        fig.add_shape(type="line",
            xref="x", yref="y domain",
            x0=second_column, y0=y_range[0], 
            x1=second_column, y1=y_range[1],
            line=dict(color=_format['color'], width=2,)
        )
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(first_column + second_column)/2,
            y=y_range[1]*_format['text_height'],
            showarrow=False,
            text=symbol + ' <sup>(' + str(round(r, 2)) + ')</sup>',
            textangle=0,
            xref="x",
            yref="y domain"
        ))
    return fig

def bracket_for(i, j, xshift, y, results):
    return [i + xshift, j + xshift, y, results]

def filter_extractor(df, extractor):
    return df[df['extractor'] == extractor]

def annotate_value(fig, x, y, subplot, prefix, ax, ay, xanchor, df, fn=lambda prefix, y: prefix + ': ' + format(round(y), ',') if y > 0 else prefix):
    if isinstance(x, str):
        x = df[x].iat[0]
    if isinstance(y, str):
        y = df[y].iat[0]
    fig.add_annotation(
        xref='x' + str(subplot),
        yref='y' + str(subplot),
        x=x,
        y=y,
        ax=ax,
        ay=ay,
        xanchor=xanchor,
        text=fn(prefix, y)
    )

def show(fig, name=None, width=1000, height=500, margin=None):
    fig.update_layout(width=width, height=height)
    if margin:
        fig.update_layout(margin=margin)
    else:
        fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    if figures_directory and os.path.isdir(figures_directory) and name:
        fig.write_image(f'{figures_directory}/{name}.pdf')
    fig.show()

In [4]:
# differentiate kinds of features

potential_misses_grep = set()
potential_misses_kmax = set()
extractor_comparison = {}
df_configs_configurable = df_configs.copy()
df_configs_configurable['configurable'] = False

def jaccard(a, b):
    return len(set.intersection(a, b)) / len(set.union(a, b))

def add_features(descriptor, source, features, min=2):
    descriptor[f'#{source}'] = len(features) if features is not None and len(features) >= min else np.nan

def get_variables(variable_map):
    variables = set(variable_map.values())
    if len(variables) <= 1:
        variables = set()
    return variables

def read_unconstrained_feature_variables(extractor, revision, architecture):
    unconstrained_features_filename = f'{output_directory}/unconstrained-features/{extractor}/linux/{revision}[{architecture}].unconstrained.features'
    unconstrained_feature_variables = set()
    if os.path.isfile(unconstrained_features_filename):
        with open(unconstrained_features_filename, 'r') as f:
            unconstrained_feature_variables = set([re.sub('^CONFIG_', '', f.strip()) for f in f.readlines()])
    return unconstrained_feature_variables

def inspect_architecture_features_for_model(extractor, revision, architecture, config_features, features_for_last_revision):
    global potential_misses_grep, potential_misses_kmax
    
    features_filename = f'{output_directory}/kconfig/{extractor}/linux/{revision}[{architecture}].features'
    with open(features_filename, 'r') as f:
        extracted_features = set([re.sub('^CONFIG_', '', f.strip()) for f in f.readlines()])
    
    unconstrained_feature_variables = read_unconstrained_feature_variables(extractor, revision, architecture)

    dimacs_filename = f'{output_directory}/backbone-dimacs/{extractor}/linux/{revision}[{architecture}].backbone.dimacs'
    all_variables = set()
    variables = set()
    feature_variables = set()
    core_feature_variables = set()
    dead_feature_variables = set()
    undead_feature_variables = set()
    all_feature_variables = set()
    features = set()
    core_features = set()
    unconstrained_features = set()
    constrained_features = set()
    added_features = None
    removed_features = None
    infos = {'extracted_features_jaccard': np.nan, \
                     'all_variables_jaccard': np.nan, \
                     'variables_jaccard': np.nan, \
                     'feature_variables_jaccard': np.nan, \
                     'undead_feature_variables_jaccard': np.nan, \
                     'all_feature_variables_jaccard': np.nan, \
                     'features_jaccard': np.nan, \
                     'unconstrained_bools': np.nan, \
                     'unconstrained_tristates': np.nan}
    
    if os.path.isfile(dimacs_filename):
        with open(dimacs_filename, 'r') as f:
            lines = f.readlines()
            all_variable_map = {}
            variable_map = {}
            feature_variable_map = {}
            for f in lines:
                if f.startswith('c '):
                    result = re.search('^c ([^ ]+) ([^ ]+)$', f)
                    if result:
                        index = int(result.group(1).strip())
                        name = result.group(2).strip()
                        all_variable_map[index] = name
                        if "k!" not in name:
                            variable_map[index] = name
                            if name != 'True' \
                                and name != '<unsupported>' \
                                and name != 'PREDICATE_Compare' \
                                and not name.startswith('__VISIBILITY__CONFIG_') \
                                and not name.endswith('_MODULE'):
                                feature_variable_map[index] = name
            all_variables = get_variables(all_variable_map)
            variables = get_variables(variable_map)
            feature_variables = get_variables(feature_variable_map)

            backbone_features_filename = f'{output_directory}/backbone-features/{extractor}/linux/{revision}[{architecture}].backbone.features'
            if os.path.isfile(backbone_features_filename):
                with open(backbone_features_filename, 'r') as f:
                    lines = f.readlines()
                    if len(lines) > 1:
                        core_feature_variables = set([line[1:].strip() for line in lines if line.startswith('+')]).intersection(feature_variables)
                        dead_feature_variables = set([line[1:].strip() for line in lines if line.startswith('-')]).intersection(feature_variables)

            if len(feature_variables) > 0:
                undead_feature_variables = feature_variables.difference(dead_feature_variables)
                all_feature_variables = undead_feature_variables.union(unconstrained_feature_variables)
                features = all_feature_variables.intersection(config_features)
                if f'{revision}###{architecture}' not in extractor_comparison:
                    extractor_comparison[f'{revision}###{architecture}'] = features
                else:
                    extractor_comparison[f'{revision}###{architecture}'] = jaccard(extractor_comparison[f'{revision}###{architecture}'], features)
                core_features = features.intersection(core_feature_variables)
                unconstrained_features = features.intersection(unconstrained_feature_variables)
                unconstrained_features_by_type = pd.DataFrame(list(unconstrained_features), columns=['config']) \
                    .merge(df_config_types[(df_config_types['revision'] == revision)])
                unconstrained_bools = unconstrained_features_by_type[unconstrained_features_by_type['type'] == 'bool']['config'].drop_duplicates()
                unconstrained_tristates = unconstrained_features_by_type[unconstrained_features_by_type['type'] == 'tristate']['config'].drop_duplicates()
                constrained_features = features.difference(core_feature_variables).difference(unconstrained_feature_variables)
                if architecture in features_for_last_revision and len(features_for_last_revision[architecture]) > 0:
                    added_features = features.difference(features_for_last_revision[architecture])
                    removed_features = features_for_last_revision[architecture].difference(features)
                infos = { \
                            'extracted_features_jaccard': jaccard(extracted_features, features), \
                            'all_variables_jaccard': jaccard(all_variables, features), \
                            'variables_jaccard': jaccard(variables, features), \
                            'feature_variables_jaccard': jaccard(feature_variables, features), \
                            'undead_feature_variables_jaccard': jaccard(undead_feature_variables, features), \
                            'all_feature_variables_jaccard': jaccard(all_feature_variables, features), \
                            'features_jaccard': 1, \
                            'unconstrained_bools': len(unconstrained_bools), \
                            'unconstrained_tristates': len(unconstrained_tristates) \
                        }
    descriptor = {'extractor': extractor, 'revision': revision, 'architecture': architecture} | infos
    add_features(descriptor, 'config_features', config_features) # F_config
    add_features(descriptor, 'extracted_features', extracted_features) # F_extracted
    add_features(descriptor, 'unconstrained_feature_variables', unconstrained_feature_variables, min=1) # F_unconstrained
    add_features(descriptor, 'all_variables', all_variables) # V_all
    add_features(descriptor, 'variables', variables) # V_phi
    add_features(descriptor, 'feature_variables', feature_variables) # FV_phi
    add_features(descriptor, 'core_feature_variables', core_feature_variables, min=1) # FV_core
    add_features(descriptor, 'dead_feature_variables', dead_feature_variables, min=1) # FV_dead
    add_features(descriptor, 'constrained_feature_variables', undead_feature_variables.difference(core_feature_variables)) # FV_constrained
    add_features(descriptor, 'undead_feature_variables', undead_feature_variables) # FV_undead
    add_features(descriptor, 'all_feature_variables', all_feature_variables) # FV
    add_features(descriptor, 'ALL_feature_variables', feature_variables.union(unconstrained_feature_variables)) # FV_all
    add_features(descriptor, 'features', features) # F
    add_features(descriptor, 'core_features', core_features, min=1)
    add_features(descriptor, 'unconstrained_features', unconstrained_features, min=1)
    add_features(descriptor, 'constrained_features', constrained_features)
    add_features(descriptor, 'added_features', added_features, min=0)
    add_features(descriptor, 'removed_features', removed_features, min=0)
    if extractor == 'kmax':
        potential_misses_grep.update([f for f in all_feature_variables.difference(features) if '__CONFIG_' not in f])
    return descriptor, feature_variables.union(unconstrained_feature_variables), features

def inspect_architecture_features_for_revision(extractor, revision, features_for_last_revision):
    config_features = set(df_configs[df_configs['revision'] == revision]['config'])
    architectures = [re.search('\[(.*)\]', f).group(1) for f in glob.glob(f'{output_directory}/kconfig/{extractor}/linux/{revision}[*.features')]
    architectures = list(set(architectures))
    architectures.sort()
    data = []
    total_features = set()
    total_feature_variables = set()
    features_for_current_revision = {}
    for architecture in architectures:
        descriptor, feature_variables, features = inspect_architecture_features_for_model(extractor, revision, architecture, config_features, features_for_last_revision)
        data.append(descriptor)
        total_features.update(features)
        features_for_current_revision[architecture] = features
        if extractor == 'kmax':
            total_feature_variables.update(feature_variables)
    for descriptor in data:
        add_features(descriptor, 'total_features', total_features)
        total_added_features = None
        total_removed_features = None
        if 'TOTAL' in features_for_last_revision and len(features_for_last_revision['TOTAL']) > 0:
            total_added_features = total_features.difference(features_for_last_revision['TOTAL'])
            total_removed_features = features_for_last_revision['TOTAL'].difference(total_features)
        add_features(descriptor, 'total_added_features', total_added_features, min=0)
        add_features(descriptor, 'total_removed_features', total_removed_features, min=0)
    features_for_current_revision['TOTAL'] = total_features
    df_configs_configurable.loc[(df_configs_configurable['revision'] == revision) & (df_configs_configurable['config'].isin(total_features)), 'configurable'] = True
    if extractor == 'kmax':
        potential_misses_kmax.update([f for f in config_features.difference(total_feature_variables)])
    return data, features_for_current_revision

def inspect_architecture_features(extractor):
    print(f'{extractor} ', end='')
    revisions = [re.search('/linux/(.*)\[', f).group(1) for f in glob.glob(f'{output_directory}/kconfig/{extractor}/linux/*.features')]
    revisions = list(set(revisions))
    revisions.sort(key=Version)
    data = []
    features_for_last_revision = {}
    i = 0
    for revision in revisions:
        i += 1
        if i % 10 == 0:
            print(revision + ' . ', end='')
        new_data, features_for_last_revision = inspect_architecture_features_for_revision(extractor, revision, features_for_last_revision)
        data += new_data
    print()
    return data

if os.path.isfile(f'{output_directory}/linux-features.dat'):
    with open(f'{output_directory}/linux-features.dat', 'rb') as f:
        [features_by_kind_per_architecture, df_extractor_comparison, potential_misses_grep, potential_misses_kmax, df_configs_configurable] = pickle.load(f)
else:
    features_by_kind_per_architecture = inspect_architecture_features('kconfigreader')
    features_by_kind_per_architecture += inspect_architecture_features('kmax')
    features_by_kind_per_architecture = pd.DataFrame(features_by_kind_per_architecture)
    df_extractor_comparison = []
    for key, value in extractor_comparison.items():
        [revision, architecture] = key.split('###')
        if type(value) is set:
            value = pd.NA
        df_extractor_comparison.append({'revision': revision, 'architecture': architecture, 'extractor_jaccard': value})
    df_extractor_comparison = pd.DataFrame(df_extractor_comparison)
    with open(f'{output_directory}/linux-features.dat', 'wb') as f:
        pickle.dump([features_by_kind_per_architecture, df_extractor_comparison, potential_misses_grep, potential_misses_kmax, df_configs_configurable], f)

replace_values(features_by_kind_per_architecture)
df_features = pd.merge(df_architectures, features_by_kind_per_architecture, how='outer').sort_values(by='committer_date')
df_features = pd.merge(df_kconfig, df_features, how='outer').sort_values(by='committer_date')

def compare_with_grep(message, list):
    print(f'{message}: ' + str(len(list)))
    print(pd.merge(df_configs[['config','kconfig-file']], pd.DataFrame(list, columns=['config']), how='inner') \
        .drop_duplicates().merge(df_config_types[['config', 'type']]).drop_duplicates())

def report_potential_misses(potential_misses_grep, potential_misses_kmax):
    # these are the features NOT found by grep, but found by kmax (this allows us to check whether the grep regex matches too much)
    # the only matches are enviroment variables (e.g., ARCH) and mistakes in kconfig files: IA64_SGI_UV (which has a trailing `) and SND_SOC_UX500_MACH_MOP500 (which has a leading +)
    compare_with_grep('#potential misses (grep)', potential_misses_grep)
    print()

    # these are the features found by grep, but NOT found by kmax, either constrained or unconstrained (this allows us to check whether kmax matches enough)
    # as there are some extraction failures for kmax, we expect some misses; also, we do not extract the um architecture; and finally, there are some test kconfig files that are never included
    # in the following, we try to filter out these effects (this is not perfect though)
    potential_misses_kmax_with_type = (pd.merge(df_configs[['config','kconfig-file', 'revision']], pd.DataFrame(potential_misses_kmax, columns=['config']), how='inner') \
            .drop_duplicates().merge(df_config_types[['config', 'type']]).drop_duplicates())
    misses_due_to_tests = set(potential_misses_kmax_with_type[ \
            potential_misses_kmax_with_type['kconfig-file'].str.startswith('Documentation/') | \
            potential_misses_kmax_with_type['kconfig-file'].str.startswith('scripts/')]['config'].unique())
    missing_kmax_models = df_features[(df_features['extractor'] == 'KClause') & df_features['#extracted_features'].isna()]
    missing_kmax_models = missing_kmax_models[['revision', 'architecture']].drop_duplicates()
    potential_misses_kmax_with_type['architecture'] = potential_misses_kmax_with_type['kconfig-file'].apply(lambda s: re.sub(r'^arch/(.*?)/.*$', r'\1', s))
    potential_misses_due_to_missing_kmax_models = set(potential_misses_kmax_with_type.merge(missing_kmax_models[['revision', 'architecture']].drop_duplicates()) \
                                                    .drop(columns=['kconfig-file', 'revision', 'architecture', 'type'])['config'].unique())
    potential_misses_kmax = potential_misses_kmax.difference(misses_due_to_tests).difference(potential_misses_due_to_missing_kmax_models)
    # the remaining matches are due to our way of using kmax extractor, where we ignore lines with new kconfig constructs like $(success,...)
    compare_with_grep('#potential misses (kmax)', potential_misses_kmax)

report_potential_misses(potential_misses_grep, potential_misses_kmax)

#potential misses (grep): 7
                        config             kconfig-file      type
0                         ARCH             init/Kconfig    string
53               KERNELVERSION             init/Kconfig    string
106                IA64_SGI_UV        arch/ia64/Kconfig      bool
187  SND_SOC_UX500_MACH_MOP500  sound/soc/ux500/Kconfig  tristate

#potential misses (kmax): 21
                                config                        kconfig-file  \
0                      MIPS_FPE_MODULE                 arch/mips64/Kconfig   
29                      BLK_DEV_FD1772         drivers/acorn/block/Kconfig   
83                         BLK_DEV_MFM         drivers/acorn/block/Kconfig   
137             BLK_DEV_MFM_AUTODETECT         drivers/acorn/block/Kconfig   
191                      VIRTEX_II_PRO      arch/ppc/platforms/4xx/Kconfig   
207                      VIRTEX_II_PRO  arch/powerpc/platforms/4xx/Kconfig   
223                          DRAM_BASE              arch/arm/Kconf

In [5]:
# source lines of code

def sloc(trendline=None):
    return px.scatter(
        df_kconfig,
        x='committer_date',
        y='source_lines_of_code',
        trendline=trendline,
        labels={'source_lines_of_code': 'Number of Source Lines of Code', 'committer_date': 'Year'},
        hover_data=['revision']
    )

fig = sloc('ols')
print(estimate_trend(fig))

fig = sloc()
style_scatter(fig)
show(fig, 'sloc', width=500, height=default_height)

(2800.169271362282, 19601.184899535972, 85228.75211245377, 1022761.8263650734, [])


In [6]:
# statistics in section 6.2

print(len(df_architectures))
print(len(df_architectures['revision'].drop_duplicates()))
print(len(df_architectures['architecture'].drop_duplicates()))
print(df_architectures.groupby('revision').agg({'architecture': len})['architecture'].median())
print(len(df_features[df_features['#extracted_features'].isna()&(df_features['extractor']=='KConfigReader')]))
print(len(df_features[df_features['#extracted_features'].isna()&(df_features['extractor']=='KClause')]))
print(len(df_features[~df_features['#extracted_features'].isna()&df_features['extracted_features_jaccard'].isna()&(df_features['extractor'] == 'KConfigReader')]))
print(len(df_features[~df_features['#extracted_features'].isna()&df_features['extracted_features_jaccard'].isna()&(df_features['extractor'] == 'KClause')]))
print(wilcoxon_test(df_features, 'extracted_features_jaccard', 'features_jaccard'))
print(wilcoxon_test(df_features, 'all_variables_jaccard', 'features_jaccard'))
print(wilcoxon_test(df_features, 'variables_jaccard', 'features_jaccard'))
print(wilcoxon_test(df_features, 'feature_variables_jaccard', 'features_jaccard'))
print(wilcoxon_test(df_features, 'undead_feature_variables_jaccard', 'features_jaccard'))
print(wilcoxon_test(df_features, 'all_feature_variables_jaccard', 'features_jaccard'))

3309
144
45
22.0
23
23
93
0
(0.0, 0.6123960626475493)
(0.0, 0.6123960625884561)
(0.0, 0.612396062637419)
(0.0, 0.6123960626678098)
(0.0, 0.6123960630443176)
(0.0, 0.6123960638598033)


In [7]:
# Jaccard similarity to features (RQ2)

print('extractor comparison:')
print('min=' + str(df_extractor_comparison['extractor_jaccard'].min()))
print('median=' + str(df_extractor_comparison['extractor_jaccard'].median()))
print('max=' + str(df_extractor_comparison['extractor_jaccard'].max()))

df_features_long = pd.melt(
    df_features,
    id_vars=['extractor'],
    value_vars=['extracted_features_jaccard', 'all_variables_jaccard', 'variables_jaccard', \
                'feature_variables_jaccard', 'undead_feature_variables_jaccard', 'all_feature_variables_jaccard', \
                'features_jaccard']
)
df_features_long.replace({'variable': 'extracted_features_jaccard'}, 'F<sub>extracted</sub>', inplace=True)
df_features_long.replace({'variable': 'all_variables_jaccard'}, 'V<sub>all</sub>', inplace=True)
df_features_long.replace({'variable': 'variables_jaccard'}, 'V', inplace=True)
df_features_long.replace({'variable': 'feature_variables_jaccard'}, 'FV', inplace=True)
df_features_long.replace({'variable': 'undead_feature_variables_jaccard'}, 'FV<sub>undead</sub>', inplace=True)
df_features_long.replace({'variable': 'all_feature_variables_jaccard'}, 'F<sub>all</sub>', inplace=True)
df_features_long.replace({'variable': 'features_jaccard'}, 'F', inplace=True)

fig = px.box(
    df_features_long,
    x='variable',
    y='value',
    range_y=[0, 1],
    color='extractor',
    facet_col='extractor',
    labels={'value': 'Jaccard Similarity to Features (F)', 'variable': 'Set of Candidate Features', 'extractor': 'Extractor'},
    category_orders={'variable': ['F<sub>extracted</sub>', 'V<sub>all</sub>', 'V', 'FV', 'FV<sub>undead</sub>', 'F<sub>all</sub>', 'F'],
                     'extractor': ['KConfigReader', 'KClause']},
)
fig.update_traces(width=0.5)
percentage_y_axis(fig)
style_box(fig, legend_position=None)
show(fig, 'features-jaccard', height=default_height, width=600, margin=dict(l=0, r=0, t=20, b=0))

extractor comparison:
min=0.882466281310212
median=0.9745712596096984
max=0.9931818181818182


In [8]:
# similarity of configurations (RQ2)

df_solve_unconstrained = df_solve.merge(df_features)
df_solve_unconstrained['model-count-unconstrained'] = \
    df_solve_unconstrained.apply(lambda row: \
        str(int(row['model-count']) * (2**int(row['unconstrained_bools'])) * (3**int(row['unconstrained_tristates']))) \
            if not pd.isna(row['model-count']) and row['model-count'] != '' else pd.NA, axis=1)
df_solve_unconstrained['model-count-unconstrained-log10'] = df_solve_unconstrained['model-count-unconstrained'].fillna('').map(big_log10).replace(0, np.nan)
df_solve_unconstrained['similarity'] = df_solve_unconstrained.apply(lambda row: int(row['model-count']) / int(row['model-count-unconstrained']) if not pd.isna(row['model-count']) and row['model-count'] != '' else pd.NA, axis=1)

df_solve_extractor_comparison = pd.pivot(df_solve_unconstrained[['revision', 'architecture', 'extractor', 'model-count-unconstrained-log10']].dropna().drop_duplicates(), index=['revision', 'architecture'], columns='extractor').dropna()
df_solve_extractor_comparison = df_solve_extractor_comparison['model-count-unconstrained-log10']['KConfigReader'] / df_solve_extractor_comparison['model-count-unconstrained-log10']['KClause']

print('extractor comparison:')
print('min=' + str(df_solve_extractor_comparison.min()))
print('median=' + str(df_solve_extractor_comparison.median()))
print('max=' + str(df_solve_extractor_comparison.max()))

fig = px.box(
    df_solve_unconstrained,
    y='similarity',
    color='extractor',
    facet_col='extractor',
    labels={'similarity': 'Ratio of #C<sub>min</sub> to #C (log10)'},
    log_y=True,
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
fig.for_each_annotation(lambda a: a.update(text='KCR' if a.text.split("=")[1] == 'KConfigReader' else 'KCl'))
fig.update_traces(width=0.5)
fig.update_yaxes(tickvals=[1e0, 1e-3, 1e-6, 1e-9, 1e-12, 1e-15], ticktext=['10<sup>0</sup>', '10<sup>-3</sup>', '10<sup>-6</sup>', '10<sup>-9</sup>', '10<sup>-12</sup>', '10<sup>-15</sup>'])
style_box(fig, legend_position=None)
show(fig, 'configuration-similarity', height=260, width=120, margin=dict(l=0, r=0, t=20, b=0))

extractor comparison:
min=1.3486978614482281
median=1.495475551941293
max=1.598371287383397


In [9]:
# share of all feature variables

df_features_long = pd.melt(
    df_features[~df_features['#features'].isna()].assign(**{
        '#dead_feature_variables': df_features['#dead_feature_variables'] / df_features['#ALL_feature_variables'],
        '#core_feature_variables': df_features['#core_feature_variables'] / df_features['#ALL_feature_variables'],
        '#constrained_feature_variables': df_features['#constrained_feature_variables'] / df_features['#ALL_feature_variables'],
        '#unconstrained_feature_variables': df_features['#unconstrained_feature_variables'] / df_features['#ALL_feature_variables'],
    }),
    id_vars=['extractor'],
    value_vars=['#dead_feature_variables', '#core_feature_variables', '#constrained_feature_variables', '#unconstrained_feature_variables'],
)
df_features_long.replace({'variable': '#dead_feature_variables'}, 'FV<sub>dead</sub>', inplace=True)
df_features_long.replace({'variable': '#core_feature_variables'}, 'FV<sub>core</sub>', inplace=True)
df_features_long.replace({'variable': '#constrained_feature_variables'}, 'FV<sub>constrained</sub>', inplace=True)
df_features_long.replace({'variable': '#unconstrained_feature_variables'}, 'F<sub>unconstrained</sub>', inplace=True)

fig = px.box(
    df_features_long,
    x='variable',
    y='value',
    range_y=[0, 1],
    color='extractor',
    labels={'value': 'Share of All Feature Variables (FV<sub>all</sub>)', 'variable': 'Level of Feature Configurability', 'extractor': 'Extractor'},
    category_orders={'variable': ['FV<sub>core</sub>', 'FV<sub>dead</sub>', 'F<sub>unconstrained</sub>', 'FV<sub>constrained</sub>'],
                     'extractor': ['KConfigReader', 'KClause']}
)
percentage_y_axis(fig)
style_box(fig, legend_position='topleft')
show(fig, 'share-of-feature-variables', height=default_height, width=500)


In [10]:
# classification of feature types
# looks at _all_ configuration options (over all revisions and architectures)
# the kind is unconstrained if both kconfigreader and kclause did not constrain it in at least one revision and architecture (so if a tool found some constraint, we treat it as constrained)
# the kind is core/dead if it was core/dead for kconfigreader or kclause in at least one revision and architecture (so we detect if one tool finds more constraints that make a feature dead)
# there are no features that are always core/dead
# half of all features are dead in some revision and architecture, so choosing the right revision and architecture matters
# one third on the other side is somewhat choosable in every formula in which they occur

df_config_types_statistics = df_configs.merge(df_config_types, how='outer').drop(columns=['system', 'revision', 'committer_date', 'kconfig-file']).drop_duplicates()

def read_unconstrained_config_features(extractor):
    config_features = set(df_config_types_statistics['config'])
    revision_architectures = [re.search('linux/(.*)\]', f).group(1) for f in glob.glob(f'{output_directory}/unconstrained-features/{extractor}/linux/*.unconstrained.features')]
    revision_architectures = list(set(revision_architectures))
    revision_architectures.sort()
    unconstrained_features = set()
    for revision_architecture in revision_architectures:
        [revision, architecture] = revision_architecture.split('[')
        unconstrained_features.update(read_unconstrained_feature_variables(extractor, revision, architecture).intersection(config_features))
    return unconstrained_features

def read_backbone_features(extractor):
    config_features = set(df_config_types_statistics['config'])
    revision_architectures = [re.search('linux/(.*)\]', f).group(1) for f in glob.glob(f'{output_directory}/backbone-features/{extractor}/linux/*.backbone.features')]
    revision_architectures = list(set(revision_architectures))
    revision_architectures.sort()
    core_features = set()
    dead_features = set()
    for revision_architecture in revision_architectures:
        [revision, architecture] = revision_architecture.split('[')
        backbone_features_filename = f'{output_directory}/backbone-features/{extractor}/linux/{revision}[{architecture}].backbone.features'
        if os.path.isfile(backbone_features_filename):
            with open(backbone_features_filename, 'r') as f:
                lines = f.readlines()
                if len(lines) > 1:
                    core_features.update(set([line[1:].strip() for line in lines if line.startswith('+')]))
                    dead_features.update(set([line[1:].strip() for line in lines if line.startswith('-')]))
    return core_features.intersection(config_features), dead_features.intersection(config_features)

core_features_kconfigreader, dead_features_kconfigreader = read_backbone_features('kconfigreader')
core_features_kmax, dead_features_kmax = read_backbone_features('kmax')

df_unconstrained_features = read_unconstrained_config_features('kconfigreader')
df_unconstrained_features = df_unconstrained_features.intersection(read_unconstrained_config_features('kmax'))
df_unconstrained_features = pd.DataFrame(df_unconstrained_features, columns=['config'])

df_core_features = core_features_kconfigreader.union(core_features_kmax)
df_core_features = pd.DataFrame(df_core_features, columns=['config'])
df_dead_features = dead_features_kconfigreader.union(dead_features_kmax)
df_dead_features = pd.DataFrame(df_dead_features, columns=['config'])

df_config_types_statistics = pd.merge(df_config_types_statistics, df_unconstrained_features, on=['config'], how='left', indicator='unconstrained')
df_config_types_statistics = pd.merge(df_config_types_statistics, df_core_features, on=['config'], how='left', indicator='core')
df_config_types_statistics = pd.merge(df_config_types_statistics, df_dead_features, on=['config'], how='left', indicator='dead')
df_config_types_statistics['unconstrained'] = np.where(df_config_types_statistics['unconstrained'] == 'both', 1, 0)
df_config_types_statistics['core'] = np.where(df_config_types_statistics['core'] == 'both', 1, 0)
df_config_types_statistics['dead'] = np.where(df_config_types_statistics['dead'] == 'both', 1, 0)
df_config_types_statistics['config'] = 1
df_config_types_statistics['constrained'] = df_config_types_statistics['config'] - df_config_types_statistics['unconstrained'] - df_config_types_statistics['core'] - df_config_types_statistics['dead']
df_config_types_statistics.loc[df_config_types_statistics['constrained'] < 0, ['constrained']] = 0
df_config_types_statistics['type'] = df_config_types_statistics['type'].replace(pd.NA, 'unknown')
df_config_types_statistics = df_config_types_statistics.groupby('type').agg({'config': 'sum', 'unconstrained': 'sum', 'constrained': 'sum', 'core': 'sum', 'dead': 'sum'}).reset_index()
df_config_types_statistics = df_config_types_statistics.sort_values(by=['config'], ascending=False)
df_config_types_statistics['ratio'] = (df_config_types_statistics['config'] / df_config_types_statistics.agg({'config': 'sum'}).iat[0]).apply(lambda v: "      " + format_percentage(v))
df_config_types_statistics = pd.melt(
    df_config_types_statistics,
    id_vars=['type', 'ratio'],
    value_vars=['config', 'core', 'dead', 'unconstrained', 'constrained']
)
df_config_types_statistics.loc[df_config_types_statistics['variable'] != 'config', ['ratio']] = None
df_config_types_total = df_config_types_statistics[df_config_types_statistics['variable'] == 'config']

fig = px.bar(
    df_config_types_statistics,
    x='type',
    y='value',
    range_y=[0, 16500],
    color='variable',
    labels={'type': 'Feature Type', 'value': 'Total Number of Features', 'variable': 'Configurability'},
    text='ratio',
    barmode='group'
)
fig.update_traces(textposition='outside', textangle=0, textfont_size=12, cliponaxis=False)
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='show')
style_legend(fig, 'topright')
show(fig, 'feature-types', width=500, height=200)

In [11]:
# features (RQ3.1)

def evaluate_features(df, extractor, date, y, median=True):
    rows = df[(df['extractor'] == extractor) & (df['committer_date'] >= date)].sort_values(by='committer_date')
    if len(rows) > 0:
        if median:
            return rows[rows['committer_date']==rows.iloc[0]['committer_date']][y].median()
        else:
            return rows.iloc[0][y]

def print_evaluation(fig, df, y, extractor, label, growth_prefix='', prefix='', postfix=''):
    dates = [pd.Timestamp('2004-01-01'), pd.Timestamp('2014-01-01'), pd.Timestamp('2024-01-01'), pd.Timestamp('2034-01-01'), pd.Timestamp('2044-01-01')]
    daily, weekly, monthly, yearly, estimated_values = estimate_trend(fig, 'extractor', extractor, dates)
    values = []
    for (date, estimated_value) in zip(dates, estimated_values):
        actual_value = evaluate_features(df, extractor, date, y)
        if actual_value:
            values.append(f"{{\\color{{gray}}(${prefix}\\text{{{round(actual_value):,}}}{postfix}$)}}")
        else:
            values.append(f"${prefix}\\text{{{round(estimated_value):,}}}{postfix}$")
    sign = lambda x: '\color{green}' if round(x) > 0 else ('\color{red}' if round(x) < 0 else '')
    print('\hspace*{4mm} ' + f'{label} & ${sign(daily)}{growth_prefix}{prefix}\\text{{{round(daily):,}}}{postfix}$ & ${sign(weekly)}{growth_prefix}{prefix}\\text{{{round(weekly):,}}}{postfix}$ & ${sign(monthly)}{growth_prefix}{prefix}\\text{{{round(monthly):,}}}{postfix}$ & ${sign(yearly)}{growth_prefix}{prefix}\\text{{{round(yearly):,}}}{postfix}$ ' + " ".join([f"& {value}" for value in values]) + ' \\\\')

def estimate_features(df, y, name):
    fig = px.scatter(
        df,
        x='committer_date',
        y=y,
        trendline='ols',
        color='extractor'
    )
    estimate_group(name)
    for (extractor, label) in [('KConfigReader', '\\kcr'), ('KClause', '\\kcl')]:
        print_evaluation(fig, df, y, extractor, label, '+\,')

def pearson_r(extractor, df):
    if len(set(df['#total_features'])) >= 1 and len([f for f in set(df['#total_features']) if pd.isna(f)]) == 0:
        s = scipy.stats.pearsonr(df['committer_date'].astype(int) // 10 ** 9, df['#total_features'])
        print(f'pearson for {extractor}: ' + str(round(s.statistic, 2)) + ', ' + str(round(s.pvalue, 2)))

df_total_features = df_features.groupby(['extractor', 'revision']).agg({'#total_features': 'min'}).reset_index()
df_total_features = pd.merge(df_kconfig[['committer_date', 'revision']].drop_duplicates(), df_total_features)

estimate_features(df_features.groupby(['extractor', 'revision', 'committer_date']).min('#total_features').reset_index(), f'#total_features', 'Total')
estimate_features(df_features[df_features['architecture'] == 'arm'], '#features', '\\arch{arm}')
estimate_features(df_features[(df_features['architecture'] == 'i386') | (df_features['architecture'] == 'x86')], '#features', '\\arch{x86}')

pearson_r('KConfigReader', df_total_features[df_total_features['extractor']=='KConfigReader'])
pearson_r('KClause', df_total_features[df_total_features['extractor']=='KClause'])

for extractor in ['KConfigReader', 'KClause']:
    for arch in set(df_architectures['architecture'].drop_duplicates()):
        pearson_r(f'{arch} ({extractor})', df_features[(df_features['extractor']==extractor)&(df_features['architecture']==arch)])

print('#features per architecture (KConfigReader):')
print('min=' + str(df_features[df_features['extractor']=='KConfigReader']['#features'].min()))
print('median=' + str(df_features[df_features['extractor']=='KConfigReader']['#features'].median()))
print('max=' + str(df_features[df_features['extractor']=='KConfigReader']['#features'].max()))

print('#features per architecture (KClause):')
print('min=' + str(df_features[df_features['extractor']=='KClause']['#features'].min()))
print('median=' + str(df_features[df_features['extractor']=='KClause']['#features'].median()))
print('max=' + str(df_features[df_features['extractor']=='KClause']['#features'].max()))

df = df_features[df_features['extractor']=='KClause']['#features']/df_features[df_features['extractor']=='KClause']['#total_features']
print('number of architecture features per total features (KClause):')
print('min=' + str(df.min()))
print('median=' + str(df.median()))
print('max=' + str(df.max()))

fig = px.scatter(
    df_total_features.sort_values(by='committer_date'),
    x='committer_date',
    y='#total_features',
    facet_col='extractor',
    labels={'#total_features': '#Features (Total)', 'extractor': 'Extractor', 'committer_date': 'Year'},
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
style_scatter(fig)
fn = lambda prefix, y: format(round(y), ',')
annotate_value(fig, 'committer_date', 0, 1, 'v2.5.45', 0, -15, 'center', df_features[df_features['revision'] == 'v2.5.45'])
annotate_value(fig, 'committer_date', 0, 1, 'v6.11', -10, -15, 'center', df_features[df_features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#total_features', 1, 'KConfigReader', 40, 0, 'left',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['revision'] == 'v2.5.45')], fn)
annotate_value(fig, 'committer_date', '#total_features', 1, 'KConfigReader', -10, 30, 'right',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['revision'] == 'v6.11')], fn)
annotate_value(fig, 'committer_date', 0, 2, 'v2.5.45', 0, -15, 'center', df_features[df_features['revision'] == 'v2.5.45'])
annotate_value(fig, 'committer_date', 0, 2, 'v6.11', -10, -15, 'center', df_features[df_features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#total_features', 2, 'KClause', 40, 0, 'left',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['revision'] == 'v2.5.45')], fn)
annotate_value(fig, 'committer_date', '#total_features', 2, 'KClause', -10, 30, 'right',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['revision'] == 'v6.11')], fn)
fig.update_yaxes(tickprefix = "   ")
fig.update_xaxes(range=["2002-01-01", "2024-12-01"])
fig.update_yaxes(range=[0, 20500])
show(fig, 'total-features', height=220, width=750, margin=dict(l=0, r=0, t=20, b=0))

fig = px.scatter(
    df_features,
    x='committer_date',
    y=f'#features',
    color='architecture',
    labels={f'#features': '#Features (Arch.)', 'extractor': 'Extractor', 'committer_date': 'Year'},
    hover_data=['revision', 'architecture'],
    facet_col='extractor',
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
style_scatter(fig, legend_position=None, marker_size=2.5)
annotate_value(fig, 'committer_date', 0, 1, 'v4.16', 0, -15, 'center', df_features[df_features['revision'] == 'v4.16'])
annotate_value(fig, 'committer_date', 0, 1, 'v6.11', -10, -15, 'center', df_features[df_features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#features', 1, 'arm', -10, -20, 'right',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['architecture'] == 'arm')&(df_features['revision'] == 'v6.11')],)
annotate_value(fig, 'committer_date', '#features', 1, 'x86', -100, -20, 'right',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['architecture'] == 'x86')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 1, 'arm64', -120, 0, 'right',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['architecture'] == 'arm64')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 1, 'nios2', -5, 40, 'right',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['architecture'] == 'nios2')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 1, 'score', 20, 0, 'left',
               df_features[(df_features['extractor'] == 'KConfigReader')&(df_features['architecture'] == 'score')&(df_features['revision'] == 'v4.16')])
annotate_value(fig, 'committer_date', 0, 2, 'v4.16', 0, -15, 'center', df_features[df_features['revision'] == 'v4.16'])
annotate_value(fig, 'committer_date', 0, 2, 'v6.11', -10, -15, 'center', df_features[df_features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#features', 2, 'arm', -10, -20, 'right',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['architecture'] == 'arm')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'x86', -100, -20, 'right',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['architecture'] == 'x86')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'arm64', -120, 0, 'right',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['architecture'] == 'arm64')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'nios2', -5, 40, 'right',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['architecture'] == 'nios2')&(df_features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'score', 20, 0, 'left',
               df_features[(df_features['extractor'] == 'KClause')&(df_features['architecture'] == 'score')&(df_features['revision'] == 'v4.16')])
fig.update_yaxes(tickprefix = "    ")
fig.update_xaxes(range=["2002-01-01", "2024-12-01"])
fig.update_yaxes(range=[0, 21000])
show(fig, 'features', height=220, width=750)

\hspace{2mm} Total \\
\hspace*{4mm} \kcr & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{16}$ & $\color{green}+\,\text{69}$ & $\color{green}+\,\text{825}$ & {\color{gray}($\text{3,489}$)} & {\color{gray}($\text{12,938}$)} & {\color{gray}($\text{19,444}$)} & $\text{28,710}$ & $\text{36,962}$ \\
\hspace*{4mm} \kcl & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{16}$ & $\color{green}+\,\text{70}$ & $\color{green}+\,\text{845}$ & {\color{gray}($\text{3,490}$)} & {\color{gray}($\text{13,520}$)} & {\color{gray}($\text{19,621}$)} & $\text{29,361}$ & $\text{37,806}$ \\
\hspace{2mm} \arch{arm} \\
\hspace*{4mm} \kcr & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{13}$ & $\color{green}+\,\text{55}$ & $\color{green}+\,\text{662}$ & {\color{gray}($\text{2,108}$)} & {\color{gray}($\text{8,479}$)} & {\color{gray}($\text{15,036}$)} & $\text{21,927}$ & $\text{28,550}$ \\
\hspace*{4mm} \kcl & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{13}$ & $\color{green}+\,\text{55}$ & $\col

In [12]:
# model count (RQ3.2)

df_solve_attempts = df_solve_unconstrained.merge(df_features[~df_features['#features'].isna()][['revision','architecture', 'extractor']].drop_duplicates())

def number_of_models(df):
    return len(df[['revision','architecture', 'extractor']].drop_duplicates())

def unify_solvers(df, columns=['model-count-unconstrained-log10']):
    return df[['revision', 'committer_date', 'architecture', 'extractor', *columns]].drop_duplicates()

def is_accurate(series):
    return len(set.difference(set(series), {pd.NA})) < 2

def solver_successes(solver):
    df_solve_for_solver = df_solve_attempts[~df_solve_attempts['model-count'].isna()]
    df_solve_for_solver = df_solve_for_solver[df_solve_for_solver['backbone.dimacs-analyzer'] == solver]
    return set(df_solve_for_solver['extractor'] + ',' + df_solve_for_solver['revision'] + ',' + df_solve_for_solver['architecture'])

def big_sum(series):
    big_sum = sum([int(value) for value in series if not pd.isna(value) and value])
    if big_sum > 0:
        return len(str(big_sum))
    
df_solve_inaccuracies = df_solve_unconstrained.groupby(['extractor', 'revision', 'architecture']).agg({'model-count': is_accurate})
df_solve_inaccuracies = df_solve_inaccuracies.dropna()
d4_successes = solver_successes('model-counting-competition-2022/d4.sh')
sharpsat_successes = solver_successes('model-counting-competition-2022/SharpSAT-td+Arjun/SharpSAT-td+Arjun.sh')
print('number of model-count attempts: ' + str(number_of_models(df_solve_attempts)))
print('number of inaccurate model counts: ' + str(len(df_solve_inaccuracies[~df_solve_inaccuracies['model-count']])))
print('number of model counts found: ' + str(number_of_models(df_solve_attempts[~df_solve_attempts['model-count'].isna()])))
print('number of model counts only found by d4: ' + str(len(d4_successes.difference(sharpsat_successes))))
print('number of model counts only found by sharpsat: ' + str(len(sharpsat_successes.difference(d4_successes))))
print('number of model counts found by both: ' + str(len(d4_successes.intersection(sharpsat_successes))))

df_solve_slice = df_solve_unconstrained[df_solve_unconstrained['year'] <= 2013]
df_solve_failures = df_solve_slice.groupby(['extractor', 'revision', 'architecture'], dropna=False).agg({'model-count-unconstrained-log10': lambda x: (True in list(pd.notna(x)) or pd.NA)}).reset_index()
df_solve_group = df_solve_failures.groupby(['extractor', 'revision'], dropna=False)
df_solve_failures = (df_solve_group['model-count-unconstrained-log10'].size() - df_solve_group['model-count-unconstrained-log10'].count()).reset_index()
df_solve_failures['is-upper-bound'] = df_solve_failures['model-count-unconstrained-log10'] == 0
df_solve_failures = df_solve_failures.rename(columns={'model-count-unconstrained-log10': 'failures'})
df_solve_total = unify_solvers(pd.merge(df_solve_slice, df_solve_failures), ['model-count-unconstrained', 'model-count-unconstrained-log10', 'is-upper-bound', 'failures', 'year'])
df_solve_total = df_solve_total.groupby(['extractor', 'committer_date', 'year']).agg({'model-count-unconstrained': big_sum, 'is-upper-bound': 'min', 'failures': 'min'}).reset_index()

print('#model counts per architecture (KConfigReader):')
print('min=' + str(unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KConfigReader'])['model-count-unconstrained-log10'].min()))
print('median=' + str(unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KConfigReader'])['model-count-unconstrained-log10'].median()))
print('max=' + str(unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KConfigReader'])['model-count-unconstrained-log10'].max()))

print('#model counts per architecture (KClause):')
print('min=' + str(unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KClause'])['model-count-unconstrained-log10'].min()))
print('median=' + str(unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KClause'])['model-count-unconstrained-log10'].median()))
print('max=' + str(unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KClause'])['model-count-unconstrained-log10'].max()))

df = unify_solvers(df_solve_slice[df_solve_slice['extractor']=='KClause'], ['#features', 'model-count-unconstrained']).apply(lambda row: \
    (int(row['model-count-unconstrained']) / (2 ** int(row['#features']))) \
        if not pd.isna(row['model-count-unconstrained']) and row['model-count-unconstrained'] != '' else pd.NA, axis=1)
print('degree of variability:')
print('min=' + str(df.min()))
print('median=' + str(df.median()))
print('max=' + str(df.max()))

def pearson_r(extractor, df, column):
    df = df.dropna()
    if len(df['committer_date']) >= 2:
        s = scipy.stats.pearsonr(df['committer_date'].astype(int) // 10 ** 9, df[column])
        print(f'pearson for {extractor}: ' + str(round(s.statistic, 2)) + ', ' + str(round(s.pvalue, 2)))

pearson_r('KConfigReader', df_solve_total[(df_solve_total['extractor']=='KConfigReader')&(df_solve_total['is-upper-bound'])], 'model-count-unconstrained')
pearson_r('KClause', df_solve_total[(df_solve_total['extractor']=='KClause')&(df_solve_total['is-upper-bound'])], 'model-count-unconstrained')

for arch in set(df_architectures['architecture'].drop_duplicates()):
    pearson_r(f'{arch}', unify_solvers(df_solve_slice[(df_solve_slice['extractor']=='KClause')&(df_solve_slice['architecture']==arch)]), 'model-count-unconstrained-log10')

def estimate_configurations(df, y, name):
    fig = px.scatter(
        df,
        x='committer_date',
        y=y,
        trendline='ols',
        color='extractor'
    )
    estimate_group(name)
    for (extractor, label) in [('KConfigReader', '\\kcr'), ('KClause', '\\kcl')]:
        print_evaluation(fig, df, y, extractor, label, '\cdot\,', '10^{', '}')

estimate_configurations(df_solve_total[df_solve_total['is-upper-bound'] == True], 'model-count-unconstrained', 'Total')
estimate_configurations(unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&(df_solve_unconstrained['architecture'] == 'arm')]), 'model-count-unconstrained-log10', '\\arch{arm}')
estimate_configurations(unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&((df_solve_unconstrained['architecture'] == 'i386') | (df_solve_unconstrained['architecture'] == 'x86'))]), 'model-count-unconstrained-log10', '\\arch{x86}')

fig = px.scatter(
    df_solve_total.replace(True, 'Exact').replace(False, 'Lower Bound'),
    x='committer_date',
    y='model-count-unconstrained',
    symbol='is-upper-bound',
    symbol_sequence=['circle', 'triangle-up-open'],
    facet_col='extractor',
    labels=revision_labels({'model-count-unconstrained': '#Configurations (Total, log<sub>10</sub>)', 'extractor': 'Extractor', 'is-upper-bound': 'Kind of Bound', 'committer_date': 'Year'}),
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
log10_y_axis(fig)
style_scatter(fig, legend_position='topright', xshift=0.01, yshift=0.03)
fig.update_traces(marker_line_color='rgba(0,0,0,0)')
fn1 = lambda prefix, y: prefix
fn2 = lambda prefix, y: '10<sup>' + format(round(y), ',') + '</sup>'
annotate_value(fig, 'committer_date', 0, 1, 'v2.5.45', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.5.45'], fn1)
annotate_value(fig, 'committer_date', 0, 1, 'v2.6.7', 0, -30, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.6.7'], fn1)
annotate_value(fig, 'committer_date', 0, 1, 'v2.6.13', 5, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.6.13'], fn1)
annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'KCR', 15, 0, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KConfigReader')&(df_solve_slice['revision'] == 'v2.5.45')]
                    .groupby(['extractor', 'committer_date']).agg({'model-count-unconstrained': big_sum}).reset_index(), fn2)
annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'KCR', 10, 10, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KConfigReader')&(df_solve_slice['revision'] == 'v2.6.7')]
                    .groupby(['extractor', 'committer_date']).agg({'model-count-unconstrained': big_sum}).reset_index(), fn2)
annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'KCR', 15, -10, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KConfigReader')&(df_solve_slice['revision'] == 'v2.6.13')]
                    .groupby(['extractor', 'committer_date']).agg({'model-count-unconstrained': big_sum}).reset_index(), fn2)
annotate_value(fig, 'committer_date', 0, 2, 'v2.5.45', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.5.45'], fn1)
annotate_value(fig, 'committer_date', 0, 2, 'v2.6.23', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.6.23'], fn1)
annotate_value(fig, 'committer_date', 'model-count-unconstrained', 2, 'KCl', 25, 5, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KClause')&(df_solve_slice['revision'] == 'v2.5.45')]
                    .groupby(['extractor', 'committer_date']).agg({'model-count-unconstrained': big_sum}).reset_index(), fn2)
annotate_value(fig, 'committer_date', 'model-count-unconstrained', 2, 'KCl', 15, -15, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KClause')&(df_solve_slice['revision'] == 'v2.6.23')]
                    .groupby(['extractor', 'committer_date']).agg({'model-count-unconstrained': big_sum}).reset_index(), fn2)
fig.update_xaxes(range=["2002-01-01", "2024-12-01"])
fig.update_yaxes(range=[0, 1050], dtick=200)
show(fig, 'model-count-total', height=220, width=750)

fig = px.scatter(
    df_solve_slice,
    x='committer_date',
    y='model-count-unconstrained-log10',
    color='architecture',
    labels={'model-count-unconstrained-log10': '#Configurations (Arch., log<sub>10</sub>)', 'committer_date': 'Year', 'extractor': 'Extractor'},
    hover_data=['revision', 'architecture'],
    facet_col='extractor',
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
log10_y_axis(fig)
style_scatter(fig, legend_position=None, marker_size=2.5)
fn2 = lambda prefix, y: f'{prefix}: 10<sup>{round(y)}</sup>'
annotate_value(fig, 'committer_date', 0, 1, 'v2.6.13', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.6.13'], fn1)
annotate_value(fig, 'committer_date', 0, 1, 'v2.6.39', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.6.39'], fn1)
annotate_value(fig, 'committer_date', 'model-count-unconstrained-log10', 1, 'i386', 0, -20, 'center',
               df_solve_slice[(df_solve_slice['extractor'] == 'KConfigReader')&(df_solve_slice['architecture'] == 'i386')&(df_solve_slice['revision'] == 'v2.6.13')].dropna(), fn2)
annotate_value(fig, 'committer_date', 'model-count-unconstrained-log10', 1, 'h8300', 10, 10, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KConfigReader')&(df_solve_slice['architecture'] == 's390')&(df_solve_slice['revision'] == 'v2.6.39')].dropna(), fn2)
annotate_value(fig, 'committer_date', 0, 2, 'v2.6.23', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v2.6.23'], fn1)
annotate_value(fig, 'committer_date', 0, 2, 'v3.10', 0, -15, 'center', df_solve_slice[df_solve_slice['revision'] == 'v3.10'], fn1)
annotate_value(fig, 'committer_date', 'model-count-unconstrained-log10', 2, 'i386', 0, -20, 'center',
               df_solve_slice[(df_solve_slice['extractor'] == 'KClause')&(df_solve_slice['architecture'] == 'i386')&(df_solve_slice['revision'] == 'v2.6.23')&(df_solve_slice['backbone.dimacs-analyzer'] == 'model-counting-competition-2022/SharpSAT-td+Arjun/SharpSAT-td+Arjun.sh')].dropna(), fn2)
annotate_value(fig, 'committer_date', 'model-count-unconstrained-log10', 2, 'h8300', 10, 10, 'left',
               df_solve_slice[(df_solve_slice['extractor'] == 'KClause')&(df_solve_slice['architecture'] == 'h8300')&(df_solve_slice['revision'] == 'v3.10')].dropna(), fn2)
fig.update_xaxes(range=["2002-01-01", "2024-12-01"])
fig.update_yaxes(range=[0, 1050], dtick=200)
show(fig, 'model-count', height=220, width=750, margin=dict(l=0, r=0, t=2, b=0))

number of model-count attempts: 6479
number of inaccurate model counts: 0
number of model counts found: 2229
number of model counts only found by d4: 16
number of model counts only found by sharpsat: 182
number of model counts found by both: 2031
#model counts per architecture (KConfigReader):
min=149.67652498838132
median=554.3247423370464
max=876.236950562169
#model counts per architecture (KClause):
min=105.37617015509944
median=393.222670322056
max=745.5668618616759
degree of variability:
min=0.0
median=1.9924047504517453e-131
max=9.379724566743582e-27
pearson for KConfigReader: 1.0, 0.0
pearson for KClause: 1.0, 0.0
pearson for microblaze: 0.91, 0.28
pearson for parisc: 0.96, 0.0
pearson for mips64: 0.7, 0.0
pearson for sh: 0.97, 0.0
pearson for blackfin: -0.4, 0.33
pearson for powerpc: 0.98, 0.0
pearson for s390x: 0.95, 0.0
pearson for ia64: 0.99, 0.0
pearson for arm: 0.99, 0.0
pearson for m68knommu: 0.98, 0.0
pearson for mn10300: 1.0, 1.0
pearson for xtensa: 0.99, 0.0
pearson fo

In [13]:
# processor architectures (RQ1)

def add_state(df, criterion, name, size=1):
    df = pd.merge(df, criterion[['committer_date', 'architecture']].drop_duplicates(), on=['committer_date', 'architecture'], how='left', indicator='indicator')
    df['size'] = np.where(df['state'].isna() & (df['indicator'] == 'both'), size, df['size'])
    df['state'] = np.where(df['state'].isna() & (df['indicator'] == 'both'), name, df['state'])
    return df.drop(columns=['indicator'])

df_architectures_first_version = df_architectures.groupby('architecture').min().reset_index()
df_architectures_state = df_architectures
df_architectures_state['state'] = pd.NA
df_architectures_state['size'] = 1
df_architectures_state = add_state(df_architectures_state, df_features[df_features['#extracted_features'].isna()], 'Unextractable (Both)', 2.5)
df_architectures_state = add_state(df_architectures_state, df_features[df_features['extracted_features_jaccard'].isna() & (df_features['extractor'] == 'KConfigReader')], 'Unsatisfiable (KConfigReader)', 0.5)
df_architectures_state = add_state(df_architectures_state, unify_solvers(df_solve_unconstrained[(df_solve_unconstrained['extractor'] == 'KClause')].groupby(['extractor', 'revision', 'committer_date', 'architecture']).agg({'model-count-unconstrained-log10': lambda x: True in list(pd.notna(x))}).reset_index()).query('`model-count-unconstrained-log10` == False'), 'Uncountable (Both)', 0.1)
df_architectures_state = add_state(df_architectures_state, unify_solvers(df_solve_unconstrained[df_solve_unconstrained['model-count-unconstrained-log10'].isna() & (df_solve_unconstrained['extractor'] == 'KConfigReader')]), 'Countable (KClause)')
df_architectures_state = add_state(df_architectures_state, unify_solvers(df_solve_unconstrained), 'Countable (Both)')

fig = px.scatter(
    df_architectures_state,
    x='committer_date',
    y='architecture',
    labels=committer_date_labels({'architecture': 'Architecture', 'state': ''}),
    hover_data=['revision'],
    color='state',
    symbol='state',
    size='size',
    size_max=6.5,
    symbol_sequence=['circle-open', 'circle-open', 'line-ew', 'line-ns', 'x-thin'],
    category_orders={
        'architecture': list(df_architectures_first_version.sort_values(by='committer_date')['architecture']),
        'state': ['Countable (Both)', 'Countable (KClause)', 'Uncountable (Both)', 'Unsatisfiable (KConfigReader)', 'Unextractable (Both)']
    },
    color_discrete_sequence=['#648FFF', '#FE6100']
)

for row in range(len(df_architectures_first_version)):
    fig.add_annotation(
        x=df_architectures_first_version.at[row, 'committer_date'],
        y=df_architectures_first_version.at[row, 'architecture'],
        text=df_architectures_first_version.at[row, 'architecture'],
        showarrow=False, yshift=0, xshift=-5, font_size=10, xanchor='right', font_color='black'
    )

committer_date_x_axis(fig, step=1)
fig.update_xaxes(range=["2001-01-01", "2024-12-01"])
fig.update_yaxes(showticklabels=False)
fig.update_traces(line_color='rgba(0,0,0,1)')
style_scatter(fig, marker_size=None, legend_position='bottomleft')
show(fig, 'architectures', width=630)

a=df_architectures_state.merge(df_features[~df_features['#features'].isna()][['revision','architecture', 'extractor', '#features']].drop_duplicates()).merge(df_backbone_dimacs[['revision','architecture', 'extractor', 'dimacs-literals']].drop_duplicates())
print('maximum number of features in successful model counts: ' + str(a[a['state'].str.startswith('Countable')]['#features'].max()))
print('maximum number of dimacs literals in successful model counts: ' + str(a[a['state'].str.startswith('Countable')]['dimacs-literals'].max()))
print('minimum number of features in unsuccessful model counts: ' + str(a[a['state'].str.startswith('Uncountable')]['#features'].min()))
print('minimum number of dimacs literals in unsuccessful model counts: ' + str(a[a['state'].str.startswith('Uncountable')]['dimacs-literals'].min()))

maximum number of features in successful model counts: 3635.0
maximum number of dimacs literals in successful model counts: 814689
minimum number of features in unsuccessful model counts: 2373.0
minimum number of dimacs literals in unsuccessful model counts: 58910


In [14]:
# model count time

df_solve_slice = df_solve[~df_solve['model-count-log10'].isna()]
fig = px.scatter(
    df_solve_slice,
    x=df_solve_slice['committer_date'],
    y=df_solve_slice['backbone.dimacs-analyzer-time'] / 1000000000,
    color='architecture',
    labels={'extractor': 'Extractor', 'y': 'Time for Counting (log<sub>10</sub> s)', 'committer_date': 'Year'},
    facet_col='extractor',
    facet_row='backbone.dimacs-analyzer',
    log_y=True
)
style_scatter(fig, legend_position=None, marker_size=2.5)
show(fig, 'model-count-time', height=2*default_height, margin=dict(l=0, r=0, t=20, b=0))

In [15]:
# evolution (RQ4)

table_rows = []

for df in [df_features[df_features['year'] >= 2005]]:
    for (added, removed, df, label, file, y_color) in [('#total_added_features', '#total_removed_features', df[['extractor', '#total_added_features', '#total_removed_features']].drop_duplicates(), 'Change in #Features (log<sub>10</sub>)', 'total', 'black'), ('#added_features', '#removed_features', df, ' ', 'arch', 'white')]:
        df_features_long = pd.melt(
            df,
            id_vars=['extractor'],
            value_vars=[added, removed]
        )
        df_features_long.replace({'variable': added}, 'Added', inplace=True)
        df_features_long.replace({'variable': removed}, 'Removed', inplace=True)
        df_features_long.replace({'value': 0}, 1, inplace=True)

        for extractor in ['KConfigReader', 'KClause']:
            for variable in ['Added', 'Removed']:
                sign = '\color{green}+' if variable == 'Added' else '\color{red}-'
                df_row = df_features_long[(df_features_long['extractor']==extractor)&(df_features_long['variable']==variable)]['value']
                table_rows.append([f'{file} ({variable}, {extractor})', f'${sign}\\hfill\,\\text{{{round(df_row.max()):,}}}$', f'${sign}\\hfill\,\\text{{{round(df_row.quantile(.75)):,}}}$', f'${sign}\\hfill\,\\text{{{round(df_row.median()):,}}}$', f'${sign}\\hfill \\text{{{round(df_row.quantile(.25)):,}}}$', f'${sign}\\hfill\,\\text{{{round(df_row.min()):,}}}$'])

        fig = px.box(
            df_features_long,
            x='variable',
            y='value',
            color='extractor',
            facet_col='extractor',
            facet_col_spacing=0.1,
            labels={'value': label, 'variable': '', 'extractor': 'Extractor'},
            log_y=True,
            category_orders={'extractor': ['KConfigReader', 'KClause']}
        )
        fig.for_each_annotation(lambda a: a.update(text='Extractor=KCR' if a.text.split("=")[1] == 'KConfigReader' else 'Extractor=KCl'))
        fig.update_traces(width=0.5)
        fig.update_yaxes(tickvals=[1, 2, 10, 100, 1000], ticktext=['0', '10<sup>0</sup>', '10<sup>1</sup>', '10<sup>2</sup>', '10<sup>3</sup>'], range=[0, 3.7], tickfont=dict(color=y_color))
        style_box(fig, legend_position=None)
        show(fig, f'feature-evolution-{file}', height=260, width=2*120, margin=dict(l=0, r=0, t=21, b=0))

def plot_configuration_evolution(file, df, y, y_color='black'):
    for extractor in ['KConfigReader', 'KClause']:
        df_row = df[df['extractor']==extractor][y]
        sign = lambda x: '\color{green}' if round(x) > 0 else ('\color{red}' if round(x) < 0 else '')
        table_rows.append([f'{file} ({extractor})', f'${sign(df_row.max())}\cdot\,10^{{\\text{{{round(df_row.max()):,}}}}}$', f'${sign(df_row.quantile(.75))}\cdot\,10^{{\\text{{{round(df_row.quantile(.75)):,}}}}}$', f'${sign(df_row.median())}\cdot\,10^{{\\text{{{round(df_row.median()):,}}}}}$', f'${sign(df_row.quantile(.25))}\cdot\,10^{{\\text{{{round(df_row.quantile(.25)):,}}}}}$', f'${sign(df_row.min())}\cdot\,10^{{\\text{{{round(df_row.min()):,}}}}}$'])
        print(f'decrease for {file} ({extractor}): {len(df_row[df_row < 0])/len(df_row[~df_row.isna()])}')

    df['x'] = ' '
    fig = px.box(
        df,
        x='x',
        y=y,
        color='extractor',
        facet_col='extractor',
        labels={'model-count-unconstrained-log10': ' ', 'model-count-unconstrained': ' Scale of #Configurations (log<sub>10</sub>)', 'extractor': 'Extractor', 'x': ''},
        category_orders={'extractor': ['KConfigReader', 'KClause']}
    )
    fig.for_each_annotation(lambda a: a.update(text='KCR' if a.text.split("=")[1] == 'KConfigReader' else 'KCl'))
    fig.update_traces(width=0.5)
    fig.update_yaxes(range=[-277, 180], tickfont=dict(color=y_color))
    log10_y_axis(fig)
    style_box(fig, legend_position=None)
    show(fig, f'configuration-evolution-{file}', height=260, width=120, margin=dict(l=0, r=0, t=21, b=0))

df_solve_total_diff = df_solve_total.copy().sort_values(by='committer_date')
df_solve_total_diff = df_solve_total_diff[df_solve_total_diff['is-upper-bound']]
for extractor in set(df_solve_total_diff['extractor']):
        df_solve_total_diff.loc[(df_solve_total_diff['extractor'] == extractor), ['model-count-unconstrained']] = df_solve_total_diff[(df_solve_total_diff['extractor'] == extractor)]['model-count-unconstrained'].diff()
plot_configuration_evolution('total', df_solve_total_diff, 'model-count-unconstrained')

df_solve_unconstrained_diff = unify_solvers(df_solve_unconstrained).copy().sort_values(by='committer_date')
for extractor in set(df_solve_unconstrained_diff['extractor']):
    for architecture in set(df_solve_unconstrained_diff['architecture']):
        df_solve_unconstrained_diff.loc[(df_solve_unconstrained_diff['architecture'] == architecture) & (df_solve_unconstrained_diff['extractor'] == extractor), ['model-count-unconstrained-log10']] = df_solve_unconstrained_diff[(df_solve_unconstrained_diff['architecture'] == architecture) & (df_solve_unconstrained_diff['extractor'] == extractor)]['model-count-unconstrained-log10'].diff()
plot_configuration_evolution('arch', df_solve_unconstrained_diff, 'model-count-unconstrained-log10', 'white')

df_row = pd.DataFrame(table_rows, columns=['Quartile', 'Q4 (Maximum)', 'Q3', 'Q2 (Median)', 'Q1', 'Q0 (Minimum)']).transpose()
print(df_row.to_latex())

decrease for total (KConfigReader): 0.07894736842105263
decrease for total (KClause): 0.04


decrease for arch (KConfigReader): 0.1905286343612335
decrease for arch (KClause): 0.16681614349775784


\begin{tabular}{lllllllllllll}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10 & 11 \\
\midrule
Quartile & total (Added, KConfigReader) & total (Removed, KConfigReader) & total (Added, KClause) & total (Removed, KClause) & arch (Added, KConfigReader) & arch (Removed, KConfigReader) & arch (Added, KClause) & arch (Removed, KClause) & total (KConfigReader) & total (KClause) & arch (KConfigReader) & arch (KClause) \\
Q4 (Maximum) & $\color{green}+\hfill\,\text{4,005}$ & $\color{red}-\hfill\,\text{3,772}$ & $\color{green}+\hfill\,\text{676}$ & $\color{red}-\hfill\,\text{1,355}$ & $\color{green}+\hfill\,\text{2,406}$ & $\color{red}-\hfill\,\text{2,303}$ & $\color{green}+\hfill\,\text{2,492}$ & $\color{red}-\hfill\,\text{2,231}$ & $\color{green}\cdot\,10^{\text{31}}$ & $\color{green}\cdot\,10^{\text{64}}$ & $\color{green}\cdot\,10^{\text{168}}$ & $\color{green}\cdot\,10^{\text{112}}$ \\
Q3 & $\color{green}+\hfill\,\text{284}$ & $\color{red}-\hfill\,\text{99}$ & $\color{green}+\hfill\,

In [16]:
# features versus configurations (RQ4.2)

df_features_and_configurations = pd.merge(df_features, unify_solvers(df_solve_unconstrained))
df_features_and_configurations = df_features_and_configurations[~df_features_and_configurations['model-count-unconstrained-log10'].isna()]
df_features_and_configurations_total = pd.merge(df_features.drop(columns=['#features']).drop_duplicates(), df_solve_total)
df_features_and_configurations_total = df_features_and_configurations_total[~df_features_and_configurations_total['model-count-unconstrained'].isna() & df_features_and_configurations_total['is-upper-bound']]
df_features_and_configurations_total.rename(columns={'#total_features': '#features', 'model-count-unconstrained': 'model-count-unconstrained-log10'}, inplace=True)
df_features_and_configurations_total['architecture'] = 'TOTAL'
df_features_and_configurations_total = df_features_and_configurations_total[['#features', 'model-count-unconstrained-log10', 'extractor', 'revision', 'architecture']].drop_duplicates().dropna()
df_features_and_configurations_scatter = df_features_and_configurations[['#features', 'model-count-unconstrained-log10', 'extractor', 'revision', 'architecture']]
df_features_and_configurations_scatter = pd.concat([df_features_and_configurations_scatter, df_features_and_configurations_total])

fig = px.scatter(
    df_features_and_configurations_scatter,
    x='#features',
    y='model-count-unconstrained-log10',
    color='architecture',
    color_discrete_map={'TOTAL': 'black'},
    labels={'model-count-unconstrained-log10': '#Configurations (log<sub>10</sub>)', '#features': '#Features', 'extractor': 'Extractor'},
    hover_data=['revision', 'architecture'],
    facet_col='extractor',
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
log10_y_axis(fig)
style_scatter(fig, legend_position=None, marker_size=2.5)
fn2 = lambda prefix, y: f'{prefix}: 10<sup>{round(y)}</sup>'
fig.update_xaxes(range=[0, 6800])
fig.update_yaxes(range=[0, 900], dtick=200)
show(fig, 'features-vs-configurations', height=220, width=750, margin=dict(l=0, r=0, t=20, b=0))

pearson_r_data = []
def pearson_r(label, extractor, architecture, df):
    if len(df['#features']) >= 4:
        s = scipy.stats.pearsonr(df['#features'], df['model-count-unconstrained-log10'])
        pearson_r_data.append({'extractor': extractor, 'pearson': s.statistic, 'p-value': s.pvalue, 'architecture': architecture})
        print(f'pearson for {label}: ' + str(round(s.statistic, 2)) + ', ' + str(round(s.pvalue, 2)))

pearson_r('KConfigReader', 'KConfigReader', None, df_features_and_configurations_total[df_features_and_configurations_total['extractor']=='KConfigReader'])
pearson_r('KClause', 'KClause', None, df_features_and_configurations_total[df_features_and_configurations_total['extractor']=='KClause'])

pearson_r_data = []
for extractor in ['KConfigReader', 'KClause']:
    for arch in set(df_features_and_configurations['architecture'].drop_duplicates()):
        pearson_r(f'{arch} ({extractor})', extractor, arch, df_features_and_configurations[(df_features_and_configurations['extractor']==extractor)&(df_features_and_configurations['architecture']==arch)])
pearson_r_data = pd.DataFrame(pearson_r_data)

for metric in ['pearson', 'p-value']:
    for extractor in ['KConfigReader', 'KClause']:
        print(f'{metric} for {extractor}:')
        print('min=' + str(pearson_r_data[pearson_r_data['extractor']==extractor][metric].min()))
        print('median=' + str(pearson_r_data[pearson_r_data['extractor']==extractor][metric].median()))
        print('max=' + str(pearson_r_data[pearson_r_data['extractor']==extractor][metric].max()))

pearson for KConfigReader: 0.94, 0.0
pearson for KClause: 0.99, 0.0
pearson for parisc (KConfigReader): 1.0, 0.0
pearson for mips64 (KConfigReader): 1.0, 0.0
pearson for sh (KConfigReader): 1.0, 0.0
pearson for s390x (KConfigReader): 1.0, 0.0
pearson for ia64 (KConfigReader): 1.0, 0.0
pearson for arm (KConfigReader): 1.0, 0.0
pearson for m68knommu (KConfigReader): 1.0, 0.0
pearson for xtensa (KConfigReader): 1.0, 0.0
pearson for x86_64 (KConfigReader): 1.0, 0.0
pearson for sparc64 (KConfigReader): 1.0, 0.0
pearson for cris (KConfigReader): 1.0, 0.0
pearson for ppc (KConfigReader): 0.99, 0.0
pearson for mips (KConfigReader): 1.0, 0.0
pearson for h8300 (KConfigReader): 0.99, 0.0
pearson for avr32 (KConfigReader): 1.0, 0.0
pearson for ppc64 (KConfigReader): 1.0, 0.0
pearson for arm26 (KConfigReader): 1.0, 0.0
pearson for alpha (KConfigReader): 1.0, 0.0
pearson for sh64 (KConfigReader): 1.0, 0.0
pearson for i386 (KConfigReader): 1.0, 0.0
pearson for frv (KConfigReader): 0.99, 0.0
pearson f

In [17]:
# feature and configuration prediction (RQ5)

def evaluate_metric(df, extractor, x_value, x, y):
    rows = df[(df['extractor'] == extractor) & (df[x] >= x_value)].sort_values(by=x)
    if len(rows) > 0:
        return rows[rows[x]==rows.iloc[0][x]][y].median()

def estimate_metric(df, x, y, extractor, key=lambda x: x.timestamp()):
        df_all = df[(df['extractor'] == extractor)].sort_values(by=x).dropna(subset = [y])
        if len(df_all) < 3:
             return []
        df_all['kind'] = 'actual'
        mid = len(df_all) * -1 // 2 * -1
        df_train = df_all[0:mid]
        df_test = df_all[mid:]
        fig = px.scatter(
            df_train,
            x=x,
            y=y,
            trendline='ols'
        )
        #show(fig, height=220, width=750)
        xs = list(df_test[x])
        _, _, _, _, estimated_values = estimate_trend(fig, xs=xs, key=key)
        actual_values = []
        deviations = []
        for (x_value, estimated_value) in zip(xs, estimated_values):
            actual_value = evaluate_metric(df, extractor, x_value, x, y)
            deviation = estimated_value / actual_value - 1
            deviations.append(deviation)
            actual_values.append(actual_value)
        df_estimate = df_test.copy()
        df_estimate[y] = estimated_values
        df_estimate['kind'] = 'estimate'
        fig = px.scatter(
            pd.concat([df_train, df_test, df_estimate]),
            x=x,
            y=y,
            color='kind'
        )
        #show(fig, height=220, width=750)
        return deviations

df_solve_total_exact = df_solve_total.copy().sort_values(by='committer_date')
df_solve_total_exact = df_solve_total_exact[df_solve_total_exact['is-upper-bound']]
df_solve_unconstrained_unified = unify_solvers(df_solve_unconstrained).copy().sort_values(by='committer_date')

deviations = []
for extractor in ['KConfigReader', 'KClause']:
    for (df, metric, x, column, arch, key) in \
        [(df_total_features, 'features', 'committer_date', '#total_features', 'TOTAL', None)] + \
        [(df_features[df_features['architecture'] == arch], 'features', 'committer_date', '#features', arch, None) for arch in set(df_features['architecture'].drop_duplicates())] + \
        [(df_solve_total_exact, 'configurations', 'committer_date', 'model-count-unconstrained', 'TOTAL', None)] + \
        [(df_solve_unconstrained_unified[df_solve_unconstrained_unified['architecture'] == arch], 'configurations', 'committer_date', 'model-count-unconstrained-log10', arch, None) for arch in set(df_solve_unconstrained_unified['architecture'].drop_duplicates())] + \
        [(df_features_and_configurations_total, 'configurations-by-features', '#features', 'model-count-unconstrained-log10', 'TOTAL', lambda x: x)] + \
        [(df_features_and_configurations[df_features_and_configurations['architecture'] == arch], 'configurations-by-features', '#features', 'model-count-unconstrained-log10', arch, lambda x: x) for arch in set(df_features_and_configurations['architecture'].drop_duplicates())]:
        print(f'{metric}, {extractor}, {arch}')
        current_deviations = estimate_metric(df, x, column, extractor, key if key is not None else lambda x: x.timestamp())
        deviations.extend([{'extractor': extractor, 'architecture': arch, 'deviation': deviation, 'is-total': arch == 'TOTAL', 'metric': metric} for deviation in current_deviations])
deviations = pd.DataFrame(deviations)
deviations.replace({'extractor': {'KConfigReader': 'KCR', 'KClause': 'KCl'}}, inplace=True)
deviations.replace({'is-total': {True: 'Total', False: 'Per Arch.'}}, inplace=True)
deviations

features, KConfigReader, TOTAL
features, KConfigReader, microblaze
features, KConfigReader, parisc
features, KConfigReader, x86
features, KConfigReader, mips64
features, KConfigReader, sh
features, KConfigReader, blackfin
features, KConfigReader, arc
features, KConfigReader, powerpc
features, KConfigReader, s390x
features, KConfigReader, openrisc
features, KConfigReader, ia64
features, KConfigReader, csky
features, KConfigReader, hexagon
features, KConfigReader, nds32
features, KConfigReader, arm
features, KConfigReader, loongarch
features, KConfigReader, m68knommu
features, KConfigReader, arm64
features, KConfigReader, riscv
features, KConfigReader, mn10300
features, KConfigReader, xtensa
features, KConfigReader, x86_64
features, KConfigReader, sparc64
features, KConfigReader, cris
features, KConfigReader, nios2
features, KConfigReader, ppc
features, KConfigReader, mips
features, KConfigReader, score
features, KConfigReader, h8300
features, KConfigReader, unicore32
features, KConfigRe

Unnamed: 0,extractor,architecture,deviation,is-total,metric
0,KCR,TOTAL,-0.080072,Total,features
1,KCR,TOTAL,-0.081010,Total,features
2,KCR,TOTAL,-0.079441,Total,features
3,KCR,TOTAL,-0.078734,Total,features
4,KCR,TOTAL,-0.081808,Total,features
...,...,...,...,...,...
5641,KCl,m32r,-0.030818,Per Arch.,configurations-by-features
5642,KCl,m32r,0.037476,Per Arch.,configurations-by-features
5643,KCl,m32r,0.034852,Per Arch.,configurations-by-features
5644,KCl,m32r,0.047720,Per Arch.,configurations-by-features


In [18]:
# feature and configuration prediction (RQ5)
for metric in ['features', 'configurations', 'configurations-by-features']:
    fig = px.box(
        deviations[deviations['metric']==metric],
        x='is-total',
        y='deviation',
        color='extractor',
        facet_col='extractor',
        facet_col_spacing=0.1,
        labels={'deviation': 'Relative Deviation' if metric == 'features' else '', 'extractor': 'Extractor'},
    )
    fig.update_traces(width=0.5)
    fig.update_yaxes(range=[-1.1, 1.1])
    percentage_y_axis(fig)
    style_box(fig, legend_position=None)
    show(fig, f'prediction-accuracy-{metric}', height=260, width=2*120, margin=dict(l=0, r=0, t=21, b=0))
    for extractor in ['KCR', 'KCl']:
        for kind in ['Total', 'Per Arch.']:
            print(f'{metric} for {extractor} ({kind}):')
            print('min={:.1%}'.format(deviations[(deviations['extractor']==extractor)&(deviations['metric']==metric)&(deviations['is-total']==kind)]['deviation'].min()))
            print('median={:.1%}'.format(deviations[(deviations['extractor']==extractor)&(deviations['metric']==metric)&(deviations['is-total']==kind)]['deviation'].median()))
            print('max={:.1%}'.format(deviations[(deviations['extractor']==extractor)&(deviations['metric']==metric)&(deviations['is-total']==kind)]['deviation'].max()))

features for KCR (Total):
min=-8.6%
median=-1.3%
max=6.3%
features for KCR (Per Arch.):
min=-70.2%
median=-10.3%
max=36.7%
features for KCl (Total):
min=-7.4%
median=-1.8%
max=9.8%
features for KCl (Per Arch.):
min=-72.7%
median=-9.9%
max=37.2%


configurations for KCR (Total):
min=-2.7%
median=-1.2%
max=-0.0%
configurations for KCR (Per Arch.):
min=-58.3%
median=0.1%
max=100.9%
configurations for KCl (Total):
min=-9.0%
median=-1.0%
max=0.5%
configurations for KCl (Per Arch.):
min=-76.9%
median=-0.2%
max=66.8%


configurations-by-features for KCR (Total):
min=-10.7%
median=-3.8%
max=-1.1%
configurations-by-features for KCR (Per Arch.):
min=-11.9%
median=0.5%
max=22.8%
configurations-by-features for KCl (Total):
min=-8.9%
median=-3.5%
max=-0.5%
configurations-by-features for KCl (Per Arch.):
min=-13.3%
median=1.1%
max=23.6%


In [19]:
# configuration prediction (RQ5.2)

def estimate_metric(date, extractor, df_features, features_y, fig_features, fig_configurations):
    features = evaluate_features(df_features, extractor, date, features_y, False)
    if features is None:
        _, _, _, _, [features] = estimate_trend(fig_features, 'extractor', extractor, xs=[date])
    _, _, _, _, [configurations] = estimate_trend(fig_configurations, 'extractor', extractor, xs=[features], key=lambda x: x)
    return configurations

df_configurations_by_features = []
def estimate_configurations_by_features(df_features, features_y, df_configurations, architecture):
    global df_configurations_by_features
    fig_features = px.scatter(
        df_features,
        x='committer_date',
        y=features_y,
        trendline='ols',
        color='extractor'
    )
    fig_configurations = px.scatter(
        df_configurations,
        x='#features',
        y='model-count-unconstrained-log10',
        trendline='ols',
        color='extractor'
    )
    for extractor in ['KConfigReader', 'KClause']:
        second_to_last_date, last_date = df_features['committer_date'].drop_duplicates()[-2:]
        for committer_date in list(df_features['committer_date'].drop_duplicates()) + [last_date + i * (last_date - second_to_last_date) for i in range(1, 106)]:
            df_configurations_by_features.append({
                'committer_date': committer_date,
                'extractor': extractor,
                'architecture': architecture,
                'model-count-unconstrained': estimate_metric(committer_date, extractor, df_features, features_y, fig_features, fig_configurations)
            })

estimate_configurations_by_features(df_features.groupby(['extractor', 'revision', 'committer_date']).min('#total_features').reset_index(), '#total_features', df_features_and_configurations_total, 'TOTAL')
estimate_configurations_by_features(df_features[df_features['architecture'] == 'arm'], '#features', df_features_and_configurations[df_features_and_configurations['architecture'] == 'arm'], 'arm')
estimate_configurations_by_features(df_features[(df_features['architecture'] == 'i386') | (df_features['architecture'] == 'x86')], '#features', df_features_and_configurations[(df_features_and_configurations['architecture'] == 'i386') | (df_features_and_configurations['architecture'] == 'x86')], 'i386')
df_configurations_by_features = pd.DataFrame(df_configurations_by_features).dropna()

def print_evaluation(fig, df_features, features_y, df_configurations, df_configurations_time, configurations_y, extractor, label, growth_prefix='', prefix='', postfix=''):
    daily, weekly, monthly, yearly, _ = estimate_trend(fig, 'extractor', extractor, [])
    values = []
    dates = [pd.Timestamp('2004-01-01'), pd.Timestamp('2014-01-01'), pd.Timestamp('2024-01-01'), pd.Timestamp('2034-01-01'), pd.Timestamp('2044-01-01')]
    fig_features = px.scatter(
        df_features,
        x='committer_date',
        y=features_y,
        trendline='ols',
        color='extractor'
    )
    fig_configurations = px.scatter(
        df_configurations,
        x='#features',
        y='model-count-unconstrained-log10',
        trendline='ols',
        color='extractor'
    )
    estimated_values = [estimate_metric(date, extractor, df_features, features_y, fig_features, fig_configurations) for date in dates]
    for (date, estimated_value) in zip(dates, estimated_values):
        actual_value = evaluate_features(df_configurations_time, extractor, date, configurations_y)
        if actual_value:
            values.append(f"{{\\color{{gray}}(${prefix}\\text{{{round(actual_value):,}}}{postfix}$)}}")
        else:
            values.append(f"${prefix}\\text{{{round(estimated_value):,}}}{postfix}$")
    sign = lambda x: '\color{green}' if round(x) > 0 else ('\color{red}' if round(x) < 0 else '')
    print('\hspace*{4mm} ' + f'{label} & ${sign(daily)}{growth_prefix}{prefix}\\text{{{round(daily):,}}}{postfix}$ & ${sign(weekly)}{growth_prefix}{prefix}\\text{{{round(weekly):,}}}{postfix}$ & ${sign(monthly)}{growth_prefix}{prefix}\\text{{{round(monthly):,}}}{postfix}$ & ${sign(yearly)}{growth_prefix}{prefix}\\text{{{round(yearly):,}}}{postfix}$ ' + " ".join([f"& {value}" for value in values]) + ' \\\\')

def estimate_configurations(df, df_features, features_y, df_configurations, df_configurations_time, configurations_y, name):
    fig = px.scatter(
        df,
        x='committer_date',
        y='model-count-unconstrained',
        trendline='ols',
        color='extractor'
    )
    estimate_group(name)
    for (extractor, label) in [('KConfigReader', '\\kcr'), ('KClause', '\\kcl')]:
        print_evaluation(fig, df_features, features_y, df_configurations, df_configurations_time, configurations_y, extractor, label, '\cdot\,', '10^{', '}')

estimate_configurations(df_configurations_by_features[(df_configurations_by_features['architecture']=='TOTAL')], df_features.groupby(['extractor', 'revision', 'committer_date']).min('#total_features').reset_index(), '#total_features', df_features_and_configurations_total, df_solve_total[df_solve_total['is-upper-bound'] == True], 'model-count-unconstrained', 'Total')
estimate_configurations(df_configurations_by_features[(df_configurations_by_features['architecture']=='arm')], df_features[df_features['architecture'] == 'arm'], '#features', df_features_and_configurations[df_features_and_configurations['architecture']=='arm'], unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&(df_solve_unconstrained['architecture'] == 'arm')]), 'model-count-unconstrained-log10', '\\arch{arm}')
estimate_configurations(df_configurations_by_features[((df_configurations_by_features['architecture']=='i386')|(df_configurations_by_features['architecture']=='x86'))], df_features[(df_features['architecture'] == 'i386') | (df_features['architecture'] == 'x86')], '#features', df_features_and_configurations[(df_features_and_configurations['architecture']=='i386')|(df_features_and_configurations['architecture']=='x86')], unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&((df_solve_unconstrained['architecture'] == 'i386') | (df_solve_unconstrained['architecture'] == 'x86'))]), 'model-count-unconstrained-log10', '\\arch{x86}')

for metric in ['features', 'configurations', 'configurations-by-features']:
    for architecture, df_architecture in [('TOTAL', deviations[deviations['architecture']=='TOTAL']), ('x86', deviations[(deviations['architecture']=='i386')|(deviations['architecture']=='x86')]), ('arm', deviations[deviations['architecture']=='arm'])]:
        for extractor in ['KCR', 'KCl']:
            median = df_architecture[(df_architecture['metric']==metric)&(df_architecture['extractor']==extractor)]['deviation'].median()
            print(f'& {median:.1%} \\\\'.replace('%', '\\%').replace('-', '$-$ '))


\hspace{2mm} Total \\
\hspace*{4mm} \kcr & $\cdot\,10^{\text{0}}$ & $\color{green}\cdot\,10^{\text{3}}$ & $\color{green}\cdot\,10^{\text{12}}$ & $\color{green}\cdot\,10^{\text{146}}$ & {\color{gray}($10^{\text{728}}$)} & $10^{\text{2,385}}$ & $10^{\text{3,536}}$ & $10^{\text{5,177}}$ & $10^{\text{6,638}}$ \\
\hspace*{4mm} \kcl & $\cdot\,10^{\text{0}}$ & $\color{green}\cdot\,10^{\text{1}}$ & $\color{green}\cdot\,10^{\text{6}}$ & $\color{green}\cdot\,10^{\text{78}}$ & {\color{gray}($10^{\text{484}}$)} & $10^{\text{1,390}}$ & $10^{\text{1,949}}$ & $10^{\text{2,843}}$ & $10^{\text{3,618}}$ \\
\hspace{2mm} \arch{arm} \\
\hspace*{4mm} \kcr & $\cdot\,10^{\text{0}}$ & $\color{green}\cdot\,10^{\text{3}}$ & $\color{green}\cdot\,10^{\text{13}}$ & $\color{green}\cdot\,10^{\text{154}}$ & {\color{gray}($10^{\text{670}}$)} & $10^{\text{2,145}}$ & $10^{\text{3,673}}$ & $10^{\text{5,279}}$ & $10^{\text{6,822}}$ \\
\hspace*{4mm} \kcl & $\cdot\,10^{\text{0}}$ & $\color{green}\cdot\,10^{\text{2}}$ & $\col

In [20]:
# feature and configuration prediction (RQ5)

df_prediction_total = df_solve_total[df_solve_total['is-upper-bound'] == True][['committer_date', 'model-count-unconstrained', 'extractor']]
df_prediction_total['architecture'] = 'TOTAL'

# configurations by features
df_prediction_configurations_by_features = pd.concat([
    df_prediction_total,
    unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&((df_solve_unconstrained['architecture'] == 'i386') | (df_solve_unconstrained['architecture'] == 'x86') | (df_solve_unconstrained['architecture'] == 'arm'))])[['committer_date', 'model-count-unconstrained-log10', 'extractor', 'architecture']].rename(columns={'model-count-unconstrained-log10': 'model-count-unconstrained'})
])
df_prediction_configurations_by_features['is-prediction'] = False
df_prediction_configurations_by_features = pd.merge(df_prediction_configurations_by_features, df_configurations_by_features, on=['committer_date', 'extractor', 'architecture'], how='right')
df_prediction_configurations_by_features['model-count-unconstrained_x'] = df_prediction_configurations_by_features['model-count-unconstrained_x'].mask(df_prediction_configurations_by_features['model-count-unconstrained_x'].isna(), df_prediction_configurations_by_features['model-count-unconstrained_y'])
df_prediction_configurations_by_features['is-prediction'] = df_prediction_configurations_by_features['is-prediction'].mask(df_prediction_configurations_by_features['is-prediction'].isna(), True)
df_prediction_configurations_by_features.rename(columns={'model-count-unconstrained_x': 'model-count-unconstrained'}, inplace=True)
df_prediction_configurations_by_features.drop(columns=['model-count-unconstrained_y'], inplace=True)

# configurations
df_prediction_tmp = []
def estimate_prediction(df, y, architecture):
    global df_prediction_tmp
    fig = px.scatter(
        df,
        x='committer_date',
        y=y,
        trendline='ols',
        color='extractor'
    )
    for extractor in ['KConfigReader', 'KClause']:
        second_to_last_date, last_date = df_features['committer_date'].drop_duplicates()[-2:]
        for committer_date in list(df_features['committer_date'].drop_duplicates()) + [last_date + i * (last_date - second_to_last_date) for i in range(1, 106)]:
            _, _, _, _, [estimation] = estimate_trend(fig, 'extractor', extractor, xs=[committer_date])
            df_prediction_tmp.append({
                'committer_date': committer_date,
                'extractor': extractor,
                'architecture': architecture,
                'estimation': estimation
            })
estimate_prediction(df_solve_total[df_solve_total['is-upper-bound'] == True], 'model-count-unconstrained', 'TOTAL')
estimate_prediction(unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&(df_solve_unconstrained['architecture'] == 'arm')]), 'model-count-unconstrained-log10', 'arm')
estimate_prediction(unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&((df_solve_unconstrained['architecture'] == 'i386') | (df_solve_unconstrained['architecture'] == 'x86'))]), 'model-count-unconstrained-log10', 'i386')
df_prediction_tmp = pd.DataFrame(df_prediction_tmp)
df_prediction_tmp.rename(columns={'estimation': 'model-count-unconstrained'}, inplace=True)
df_prediction_configurations = pd.concat([
    df_prediction_total,
    unify_solvers(df_solve_unconstrained[~df_solve_unconstrained['model-count-unconstrained-log10'].isna()&((df_solve_unconstrained['architecture'] == 'i386') | (df_solve_unconstrained['architecture'] == 'x86') | (df_solve_unconstrained['architecture'] == 'arm'))])[['committer_date', 'model-count-unconstrained-log10', 'extractor', 'architecture']].rename(columns={'model-count-unconstrained-log10': 'model-count-unconstrained'})
])
df_prediction_configurations['is-prediction'] = False
df_prediction_configurations = pd.merge(df_prediction_configurations, df_prediction_tmp, on=['committer_date', 'extractor', 'architecture'], how='right')
df_prediction_configurations['model-count-unconstrained_x'] = df_prediction_configurations['model-count-unconstrained_x'].mask(df_prediction_configurations['model-count-unconstrained_x'].isna(), df_prediction_configurations['model-count-unconstrained_y'])
df_prediction_configurations['is-prediction'] = df_prediction_configurations['is-prediction'].mask(df_prediction_configurations['is-prediction'].isna(), True)
df_prediction_configurations.rename(columns={'model-count-unconstrained_x': 'model-count-unconstrained'}, inplace=True)
df_prediction_configurations.drop(columns=['model-count-unconstrained_y'], inplace=True)

# features
df_prediction_tmp = []
estimate_prediction(df_features.groupby(['extractor', 'revision', 'committer_date']).min('#total_features').reset_index(), f'#total_features', 'TOTAL')
estimate_prediction(df_features[df_features['architecture'] == 'arm'], '#features', 'arm')
estimate_prediction(df_features[(df_features['architecture'] == 'i386') | (df_features['architecture'] == 'x86')], '#features', 'x86')
df_prediction_tmp = pd.DataFrame(df_prediction_tmp)
df_prediction_tmp.rename(columns={'estimation': 'model-count-unconstrained'}, inplace=True)
df_prediction_features = pd.concat([
    df_features.groupby(['extractor', 'revision', 'committer_date']).min('#total_features').reset_index().rename(columns={'#total_features': 'model-count-unconstrained'}).assign(architecture='TOTAL'),
    df_features[(df_features['architecture'] == 'arm') | (df_features['architecture'] == 'i386') | (df_features['architecture'] == 'x86')][['committer_date', '#features', 'extractor', 'architecture']].rename(columns={'#features': 'model-count-unconstrained'}).replace({'architecture': {'i386': 'x86'}})
])
df_prediction_features['is-prediction'] = False
df_prediction_features = pd.merge(df_prediction_features, df_prediction_tmp, on=['committer_date', 'extractor', 'architecture'], how='right')
df_prediction_features['model-count-unconstrained_x'] = df_prediction_features['model-count-unconstrained_x'].mask(df_prediction_features['model-count-unconstrained_x'].isna(), df_prediction_features['model-count-unconstrained_y'])
df_prediction_features['is-prediction'] = df_prediction_features['is-prediction'].mask(df_prediction_features['is-prediction'].isna(), True)
df_prediction_features.rename(columns={'model-count-unconstrained_x': 'model-count-unconstrained'}, inplace=True)
df_prediction_features.drop(columns=['model-count-unconstrained_y'], inplace=True)

fn1 = lambda prefix, y: prefix
for metric, df, label in [('configurations-by-features', df_prediction_configurations_by_features, '#Configurations (log<sub>10</sub>)'), ('configurations', df_prediction_configurations, '#Configurations (log<sub>10</sub>)'), ('features', df_prediction_features, '#Features')]:
    fig = px.scatter(
        df,
        x='committer_date',
        y='model-count-unconstrained',
        color='architecture',
        color_discrete_map={'TOTAL': 'black'},
        symbol='is-prediction',
        symbol_sequence=['circle', 'line-ne'],
        labels={'model-count-unconstrained': '', 'committer_date': 'Year', 'extractor': 'Extractor'},
        #size=(df[df['extractor'] == 'KClause'])['is-prediction'].map({True: 10, False: 100}),
    )
    style_scatter(fig, legend_position=None, marker_size=2.5)
    if metric == 'features':
        annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'Total', 0, 55, 'center', df[(df['extractor'] == 'KClause')&(df['architecture'] == 'TOTAL')&(df['committer_date'] >= pd.Timestamp('2030-01-01'))], fn1)
        annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'arm', 0, 45, 'center', df[(df['extractor'] == 'KClause')&(df['architecture'] == 'arm')&(df['committer_date'] >= pd.Timestamp('2035-01-01'))], fn1)
        annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'x86', 0, 48, 'center', df[(df['extractor'] == 'KClause')&(df['architecture'] == 'x86')&(df['committer_date'] >= pd.Timestamp('2040-01-01'))], fn1)
    else:
        if metric == 'configurations':
            annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'KConfigReader', 55.5, -50, 'left', df[(df['extractor'] == 'KConfigReader')&(df['architecture'] == 'TOTAL')&(df['committer_date'] >= pd.Timestamp('2006-01-01'))], fn1)
            annotate_value(fig, 'committer_date', 'model-count-unconstrained', 1, 'KClause', 45, 0, 'left', df[(df['extractor'] == 'KClause')&(df['architecture'] == 'arm')&(df['committer_date'] >= pd.Timestamp('2008-01-01'))], fn1)
            extractor_y_start = 500
            extractor_y_max = 8100
            extractor_y_split = 3200
        else:
            extractor_y_start = 250
            extractor_y_max = 8100
            extractor_y_split = 5700
        log10_y_axis(fig)
        fig.update_yaxes(range=[0, 8100], dtick=1000)
        fig.add_trace(go.Scatter(x=[pd.Timestamp('2002-10-01'), pd.Timestamp('2002-10-01'), pd.Timestamp('2044-02-01'), pd.Timestamp('2044-02-01'), pd.Timestamp('2002-10-01')], y=[0, extractor_y_start, extractor_y_split, 0, 0], fill="toself", fillcolor='rgba(254,97,0,0.15)', line_color='rgba(0,0,0,0)'))
        fig.add_trace(go.Scatter(x=[pd.Timestamp('2002-10-01'), pd.Timestamp('2002-10-01'), pd.Timestamp('2044-02-01'), pd.Timestamp('2044-02-01'), pd.Timestamp('2002-10-01')], y=[0, extractor_y_start, extractor_y_split, extractor_y_max, extractor_y_max], fill="toself", fillcolor='rgba(100,143,255,0.15)', line_color='rgba(0,0,0,0)'))
    show(fig, f'prediction-{metric}', height=0.75*220, width=0.75*375, margin=dict(l=0, r=0, t=0, b=0))

In [21]:
df_configs_by_directory = df_configs_configurable[df_configs_configurable['configurable']].copy()
df_configs_by_directory['directory'] = df_configs_by_directory['kconfig-file'].apply(lambda x: x.split('/')[0])
df_configs_by_directory['subdirectory'] = df_configs_by_directory['kconfig-file'].apply(lambda x: "/".join(x.split('/')[0:2]))
df_configs_by_directory = pd.merge(df_configs_by_directory, df_kconfig[['committer_date', 'revision']].drop_duplicates(), how='left')
df_configs_by_directory_1 = df_configs_by_directory.groupby(['committer_date', 'revision', 'directory']).size().reset_index(name='count')
df_configs_by_directory_2 = df_configs_by_directory.groupby(['committer_date', 'revision', 'directory', 'subdirectory']).size().reset_index(name='count')

def pearson_r(label, df):
    s = scipy.stats.pearsonr(df['committer_date'].astype(int) // 10 ** 9, df['count'])
    print(f'pearson for {label}: ' + str(round(s.statistic, 2)) + ', ' + str(round(s.pvalue, 2)))

for df, column in [(df_configs_by_directory_1, 'directory'), (df_configs_by_directory_2, 'subdirectory')]:
    for directory in set(df[column]):
        df_column = df[df[column] == directory]['count']
        if df_column.max() - df_column.min() <= 500:
            df = df.drop(df[df[column] == directory].index)

    for column_value in set(df[column]):
        pearson_r(f'{column}={column_value}', df[df[column] == column_value])

    fig = px.scatter(
        df,
        x='committer_date',
        y='count',
        color='directory',
        hover_data=['revision', column],
        symbol_sequence=['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up', 'triangle-down', 'star', 'hexagon', 'pentagon'],
        labels={'count': '#Features (Subsystem)' if column == 'directory' else '', 'committer_date': 'Year', 'directory': ''},
        category_orders={'directory': ['drivers', 'arch', 'sound', 'net', 'lib']},
    )
    fn1 = lambda prefix, y: prefix
    if column == 'directory':
        annotate_value(fig, 'committer_date', 'count', 1, 'drivers', 0, -20, 'center', df[(df['directory'] == 'drivers')&(df['committer_date'] >= pd.Timestamp('2021-01-01'))], fn1)
        annotate_value(fig, 'committer_date', 'count', 1, 'arch', 0, -20, 'center', df[(df['directory'] == 'arch')&(df['committer_date'] >= pd.Timestamp('2021-01-01'))], fn1)
        annotate_value(fig, 'committer_date', 'count', 1, 'sound', 0, -20, 'center', df[(df['directory'] == 'sound')&(df['committer_date'] >= pd.Timestamp('2021-01-01'))], fn1)
        annotate_value(fig, 'committer_date', 'count', 1, 'net', 0, -21.9, 'center', df[(df['directory'] == 'net')&(df['committer_date'] >= pd.Timestamp('2023-01-01'))], fn1)
    else:
        annotate_value(fig, 'committer_date', 'count', 1, 'arch/arm', -30, 0, 'right', df[(df['subdirectory'] == 'arch/arm')&(df['committer_date'] >= pd.Timestamp('2011-03-01'))], fn1)
        annotate_value(fig, 'committer_date', 'count', 1, 'drivers/net', -40, 0, 'right', df[(df['subdirectory'] == 'drivers/net')&(df['committer_date'] >= pd.Timestamp('2020-01-01'))], fn1)
        annotate_value(fig, 'committer_date', 'count', 1, 'sound/soc', 65, 30, 'left', df[(df['subdirectory'] == 'sound/soc')&(df['committer_date'] >= pd.Timestamp('2016-01-01'))], fn1)
    style_scatter(fig, legend_position=None, marker_size=2.5)
    fig.update_xaxes(range=[pd.Timestamp('2002-10-01'), pd.Timestamp('2025-04-01')])
    show(fig, f'subsystem-{column}', height=200, width=375, margin=dict(l=0, r=0, t=2, b=0))


pearson for directory=lib: 0.99, 0.0
pearson for directory=arch: 0.74, 0.0
pearson for directory=sound: 0.99, 0.0
pearson for directory=drivers: 1.0, 0.0
pearson for directory=net: 0.99, 0.0


pearson for subdirectory=arch/mips: 0.92, 0.0
pearson for subdirectory=drivers/media: 0.99, 0.0
pearson for subdirectory=arch/arm: 0.82, 0.0
pearson for subdirectory=drivers/clk: 0.97, 0.0
pearson for subdirectory=arch/blackfin: 0.94, 0.0
pearson for subdirectory=sound/soc: 1.0, 0.0
pearson for subdirectory=drivers/net: 1.0, 0.0
pearson for subdirectory=drivers/iio: 0.99, 0.0
