In [128]:
# read CSV data

import glob
import re
import plotly.express as px
import pandas as pd
import numpy as np
import os.path
import pickle

# name of the directory with the experiment's output; read_dataframe and the feature inspection
# functions read from '../<output_directory>/<stage>/...'
output_directory = 'output-clean'
# directory into which show() writes figures as '<name>.pdf'; a falsy value disables writing
figures_directory = '../../paper-icse-2024-linux/figures'

def estimate_trend(fig, message='', idx=0, date1=None, date2=None):
    """Print one LaTeX table row that summarizes a trendline of a Plotly Express figure.

    The row consists of the message, the fitted growth per day, per month (30.437 days),
    and per year (365.25 days), and the fitted values on date1 and date2. All numbers are
    rounded to integers and printed with thousands separators.

    Parameters:
        fig: Plotly Express figure that was created with a trendline (e.g., trendline='ols').
        message: text for the first cell of the row.
        idx: row of px.get_trendline_results(fig) to summarize, that is, which trendline.
        date1: pd.Timestamp of the first estimate; defaults to 20 years (of 365.25 days) before now.
        date2: pd.Timestamp of the second estimate; defaults to now.
    """
    # resolve the date defaults on every call; as default argument values they would be
    # evaluated only once, when the cell defining this function is run
    now = pd.Timestamp.now()
    if date1 is None:
        date1 = now - pd.Timedelta(days=365.25*20)
    if date2 is None:
        date2 = now
    fit = px.get_trendline_results(fig).iloc[idx]['px_fit_results']
    intercept = fit.params[0]
    slope = fit.params[1]
    # the slope is treated as growth per second: it is multiplied with Timestamp.timestamp() below
    seconds_per_day = pd.to_timedelta(1, unit='D').total_seconds()
    daily = slope * seconds_per_day
    monthly = daily * 30.437
    yearly = daily * 365.25
    on_date1 = intercept + slope * date1.timestamp()
    on_date2 = intercept + slope * date2.timestamp()
    print(f'{message} & {round(daily):,} & {round(monthly):,} & {round(yearly):,} & {round(on_date1):,} & {round(on_date2):,} \\\\')

def read_dataframe(stage, dtype=None):
    """Read the output.csv file of an experiment stage into a data frame.

    Parameters:
        stage: name of the stage's subdirectory below '../<output_directory>'.
        dtype: optional column types, passed on to pd.read_csv.

    Returns:
        The data frame. If it has a 'committer_date_unix' column, a 'committer_date' column
        is added that interprets its values as seconds since the Unix epoch.
    """
    df = pd.read_csv(f'../{output_directory}/{stage}/output.csv', dtype=dtype)
    if 'committer_date_unix' in df:
        # convert the whole column at once instead of calling pd.to_datetime once per row
        df['committer_date'] = pd.to_datetime(df['committer_date_unix'], unit='s')
    return df

def peek_dataframe(df, column, message, type='str', filter=['revision', 'architecture', 'extractor']):
    """Print how many rows of a stage's data frame succeeded and how many failed.

    Parameters:
        df: data frame of a stage.
        column: column that tells whether a row succeeded.
        message: text printed in front of the counts.
        type: 'str' treats a row as failed if the column's text contains 'NA';
            any other value treats a row as failed if the column's value is missing.
        filter: columns selected from the successful and failed rows (they must exist in df).
    """
    if type == 'str':
        # na=True counts missing values as failures, consistent with the isna() mode; without it,
        # a missing value makes the mask non-boolean and the negation below fails
        failed = df[column].str.contains('NA', na=True)
    else:
        failed = df[column].isna()
    success = df[~failed][filter]
    failure = df[failed][filter]
    print(f'{message}: {len(success)} successes, {len(failure)} failures')

# processor architectures per revision, in chronological order and annotated with the commit year
df_architectures = read_dataframe('read-linux-architectures').sort_values(by='committer_date')
df_architectures['year'] = df_architectures['committer_date'].map(lambda date: int(date.year))

# each stage is read from its CSV file and summarized by a count of successes and failures

df_kconfig = read_dataframe(stage='kconfig')
peek_dataframe(df_kconfig, column='model-file', message='model extraction')

df_uvl = read_dataframe(stage='model_to_uvl_featureide')
peek_dataframe(df_uvl, column='uvl-file', message='UVL transformation', type='na', filter=['model-file'])

df_xml = read_dataframe(stage='model_to_xml_featureide')
peek_dataframe(df_xml, column='xml-file', message='XML transformation', type='na', filter=['model-file'])

df_smt = read_dataframe(stage='model_to_smt_z3')
peek_dataframe(df_smt, column='smt-file', message='SMT transformation', type='na', filter=['model-file'])

df_dimacs = read_dataframe(stage='dimacs')
peek_dataframe(df_dimacs, column='dimacs-file', message='CNF transformation')

df_backbone_dimacs = read_dataframe(stage='backbone-dimacs')
peek_dataframe(df_backbone_dimacs, column='backbone.dimacs-file', message='backbone transformation', type='na')
# todo: also count all dimacs files with trivial contradictions (p cnf 1 1)

# model counts are read as strings; their number of characters serves as log10 of the model count,
# where a model count of '1' and a missing model count both become NaN (and count as failures)
df_solve = read_dataframe(stage='solve_model-count', dtype={'model-count': 'string'})
df_solve['model-count'] = df_solve['model-count'].replace('1', '')
df_solve['model-count-log10'] = (
    df_solve['model-count']
    .fillna('')
    .map(len)
    .replace(0, np.nan)
)
peek_dataframe(df_solve, column='model-count-log10', message='model counting', type='na')

def committer_date_x_axis(fig, df=None, append_revision=True):
    """Label the x-axis of fig with one tick per year, placed at the year itself.

    Parameters:
        fig: figure whose x-axis shows commit dates.
        df: data frame with 'committer_date' and 'revision' columns from which the years are
            taken; defaults to the notebook's df_kconfig (looked up when the function is called).
        append_revision: whether to show, below each year, the chronologically first revision of
            that year (without the first character of its name).

    The first year is left out of both the tick values and the tick labels.
    """
    if df is None:
        df = df_kconfig
    axis = df[['committer_date', 'revision']].drop_duplicates()
    axis = axis.assign(year=axis['committer_date'].apply(lambda d: str(d.year)))
    # keep the chronologically first revision of every year
    axis = axis.sort_values(by='committer_date').drop_duplicates(subset='year').reset_index(drop=True)
    if append_revision:
        ticktext = axis['year'].str.cat('<br><sup>' + axis['revision'].str[1:] + '</sup>')
    else:
        ticktext = axis['year']
    # slice labels and values alike, so that every label stays attached to its own year
    fig.update_xaxes(
        ticktext=ticktext.iloc[1:],
        tickvals=axis['year'].iloc[1:]
    )

def revision_x_axis(fig, df=None):
    """Label the x-axis of fig, which shows revisions, with one tick per year.

    The tick of a year is placed at the chronologically first revision of that year.

    Parameters:
        fig: figure whose x-axis shows revisions.
        df: data frame with 'committer_date' and 'revision' columns from which the years are
            taken; defaults to the notebook's df_kconfig (looked up when the function is called).
    """
    if df is None:
        df = df_kconfig
    axis = df[['committer_date', 'revision']].drop_duplicates()
    axis = axis.assign(year=axis['committer_date'].apply(lambda d: str(d.year)))
    # keep the chronologically first revision of every year
    axis = axis.sort_values(by='committer_date').drop_duplicates(subset='year').reset_index(drop=True)
    fig.update_xaxes(
        ticktext=axis['year'],
        tickvals=axis['revision']
    )

def log10_y_axis(fig):
    """Wrap every y-axis tick label of fig in '10<sup>' and '</sup>', so that it reads as a power of ten."""
    exponent_markup = {'tickprefix': "10<sup>", 'ticksuffix': "</sup>"}
    fig.update_yaxes(**exponent_markup)

def committer_date_labels(dict={}):
    """Return plot labels for figures with a commit date axis.

    The given labels are added to (and take precedence over) the label of the
    'committer_date' column. The argument itself is not modified.
    """
    labels = {'committer_date': 'Year / First Release in Year'}
    labels.update(dict)
    return labels

def revision_labels(dict={}):
    """Return plot labels for figures with a revision axis.

    The given labels are added to (and take precedence over) the label of the
    'revision' column. The argument itself is not modified.
    """
    labels = {'revision': 'Year'}
    labels.update(dict)
    return labels

def style_boxplot(fig):
    """Apply the notebook's box plot style to fig.

    The legend is anchored in the top left corner of the plot; all traces get a transparent
    fill, lines of width 1, and markers of size 2.
    """
    legend_position = {'yanchor': 'top', 'y': 0.98, 'xanchor': 'left', 'x': 0.01}
    fig.update_layout(legend=legend_position)
    fig.update_traces(fillcolor='rgba(0,0,0,0)', line_width=1, marker_size=2)

def show(fig, name=None, width=1000, height=500, crop=True):
    """Resize fig, optionally write it to a PDF file, and display it.

    Parameters:
        fig: figure to display.
        name: file name (without extension) of the PDF file written to figures_directory;
            no file is written if name or figures_directory is falsy.
        width, height: size of the figure.
        crop: whether to remove all margins around the figure.
    """
    layout = {'width': width, 'height': height}
    if crop:
        layout['margin'] = {'l': 0, 'r': 0, 't': 0, 'b': 0}
    fig.update_layout(**layout)
    if figures_directory and name:
        fig.write_image(f'{figures_directory}/{name}.pdf')
    fig.show()

model extraction: 6246 successes, 46 failures
UVL transformation: 4991 successes, 1255 failures
XML transformation: 6215 successes, 31 failures
SMT transformation: 6246 successes, 0 failures
CNF transformation: 6246 successes, 0 failures
backbone transformation: 6175 successes, 71 failures
model counting: 923 successes, 601 failures


In [125]:
# differentiate kinds of features (takes up to an hour to run)

# True: load the feature data stored in features.dat at the end of this cell;
# False: recompute it from the extracted models and overwrite features.dat
load_feature_data = True

def add_features(descriptor, source, features):
    """Store a feature set and its size in descriptor.

    The set is stored as '<source>_features' and its size as '#<source>_features'.
    A set with at most one feature is stored as an empty set with a size of NaN.
    """
    if len(features) > 1:
        stored_features, number_of_features = features, len(features)
    else:
        stored_features, number_of_features = set(), np.nan
    descriptor[f'{source}_features'] = stored_features
    descriptor[f'#{source}_features'] = number_of_features

def add_feature_statistics(data, architectures, source):
    """Add feature counts across architectures to the descriptors of one revision.

    Parameters:
        data: list of descriptors (dictionaries) with an 'architecture' key and a
            '<source>_features' set each.
        architectures: architectures whose descriptors get the counts; the first descriptor
            with a matching 'architecture' is used.
        source: name of the feature source, used as part of the keys.

    The following keys are added to each descriptor:
        '#<source>_total_features': number of features found in any architecture,
        '#<source>_common_features': number of features found in all architectures,
        '#<source>_owned_features': number of features found in no other architecture,
        '#<source>_shared_features': number of features that are neither common nor owned.
    """
    features_key = f'{source}_features'
    feature_sets = [d[features_key] for d in data]
    # set().union(...) also accepts zero arguments, unlike set.union(...), which fails if there
    # is nothing to unite (no data at all or, below, a revision with only one architecture)
    total_features = set().union(*feature_sets)
    common_features = set.intersection(*feature_sets) if feature_sets else set()
    for architecture in architectures:
        descriptor = [d for d in data if d['architecture'] == architecture][0]
        features_in_other_architectures = set().union(*[d[features_key] for d in data if d['architecture'] != architecture])
        # todo: possibly add "features owned by exactly n arch's" (e.g., 2)
        descriptor[f'#{source}_total_features'] = len(total_features)
        descriptor[f'#{source}_common_features'] = len(common_features)
        owned_features = descriptor[features_key].difference(features_in_other_architectures)
        descriptor[f'#{source}_owned_features'] = len(owned_features)
        descriptor[f'#{source}_shared_features'] = len(descriptor[features_key].difference(common_features).difference(owned_features))

def clean_features(descriptor, source):
    """Remove the '<source>_features' set from descriptor (its '#...' counts are kept)."""
    del descriptor[f'{source}_features']

def inspect_architecture_features_for_model(extractor, revision, architecture):
    """Collect the feature sets of one extracted model (extractor, revision, and architecture).

    The extractor's features are read from the model's .features file (one per line, without
    a leading 'CONFIG_'). If the model's .backbone.dimacs file exists, the variables named in
    its 'c <number> <name>' comment lines are read as well (lines containing 'k!' or
    '__VISIBILITY__CONFIG_' are skipped). If there is more than one such variable, every line
    of the form '<literal> 0' marks its variable as core (positive literal) or as dead
    (negative literal), and the Jaccard similarity of the extractor's features and the DIMACS
    variables is computed.

    Returns:
        A descriptor (dictionary) with the extractor, revision, architecture, the Jaccard
        similarity (NaN if not computed), and the sets and sizes stored by add_features.
    """
    extractor_features_filename = f'../{output_directory}/kconfig/{extractor}/linux/{revision}[{architecture}].features'
    dimacs_filename = f'../{output_directory}/backbone-dimacs/{extractor}/linux/{revision}[{architecture}].backbone.dimacs'
    dimacs_features, core_dimacs_features, dead_dimacs_features = set(), set(), set()
    features_jaccard = np.nan

    with open(extractor_features_filename, 'r') as features_file:
        extractor_features = {re.sub('^CONFIG_', '', line.strip()) for line in features_file.readlines()}

    if os.path.isfile(dimacs_filename):
        with open(dimacs_filename, 'r') as dimacs_file:
            lines = dimacs_file.readlines()

        # map variable numbers to variable names
        variable_map = {}
        for line in lines:
            if not line.startswith('c ') or "k!" in line or "__VISIBILITY__CONFIG_" in line:
                continue
            comment = re.search('^c ([^ ]+) ([^ ]+)$', line)
            if comment:
                variable_map[int(comment.group(1).strip())] = comment.group(2).strip()

        dimacs_features = set(variable_map.values())
        if len(dimacs_features) > 1:
            features_jaccard = len(extractor_features & dimacs_features) / len(extractor_features | dimacs_features)
            # clauses with a single literal fix the value of their variable
            for line in lines:
                unit_clause = re.search('^([^ ]+) 0$', line)
                if not unit_clause:
                    continue
                literal = int(unit_clause.group(1))
                index = abs(literal)
                if index not in variable_map:
                    continue
                fixed_features = core_dimacs_features if literal > 0 else dead_dimacs_features
                fixed_features.add(variable_map[index])
        else:
            dimacs_features = set()

    dimacs_no_dead_features = dimacs_features - dead_dimacs_features
    dimacs_no_dead_no_core_features = dimacs_no_dead_features - core_dimacs_features
    descriptor = {'extractor': extractor, 'revision': revision, 'architecture': architecture, 'features_jaccard': features_jaccard}
    feature_sets_by_source = [
        ('extractor', extractor_features),
        ('dimacs', dimacs_features),
        ('core_dimacs', core_dimacs_features),
        ('dead_dimacs', dead_dimacs_features),
        ('dimacs_no_dead', dimacs_no_dead_features),
        ('dimacs_no_dead_no_core', dimacs_no_dead_no_core_features),
    ]
    for source, features in feature_sets_by_source:
        add_features(descriptor, source, features)
    return descriptor

def inspect_architecture_features_for_revision(extractor, revision):
    """Collect the feature counts of all architectures of one revision.

    The architectures are taken from the names of the extractor's
    '<revision>[<architecture>].features' files.

    Returns:
        A list with one descriptor (dictionary) per architecture, sorted by architecture.
        Each descriptor holds the counts added by add_features and add_feature_statistics;
        the feature sets themselves are removed again.
    """
    # the literal part of the pattern is escaped, so that its '[' is not read as the start of a character set
    pattern = glob.escape(f'../{output_directory}/kconfig/{extractor}/linux/{revision}[') + '*.features'
    # raw string: '\[' is not a valid escape sequence in a regular string literal
    architectures = sorted({re.search(r'\[(.*)\]', filename).group(1) for filename in glob.glob(pattern)})
    data = [inspect_architecture_features_for_model(extractor, revision, architecture) for architecture in architectures]
    sources = ['extractor', 'dimacs', 'core_dimacs', 'dead_dimacs', 'dimacs_no_dead', 'dimacs_no_dead_no_core']
    for source in sources:
        add_feature_statistics(data, architectures, source)
    # the feature sets are only removed after all statistics have been computed
    for source in sources:
        for descriptor in data:
            clean_features(descriptor, source)
    return data

def inspect_architecture_features(extractor):
    """Collect the feature counts of all revisions and architectures extracted by extractor.

    The revisions are taken from the names of the extractor's
    '<revision>[<architecture>].features' files and processed in lexicographic order.
    Every tenth revision is printed to show progress.

    Returns:
        A list of descriptors (dictionaries), one per revision and architecture.
    """
    # raw string: '\[' is not a valid escape sequence in a regular string literal
    revisions = sorted({
        re.search(r'linux/(.*)\[', filename).group(1)
        for filename in glob.glob(f'../{output_directory}/kconfig/{extractor}/linux/*.features')
    })
    data = []
    for i, revision in enumerate(revisions, start=1):
        if i % 10 == 0:
            print(revision + ' . ', end='')
        data += inspect_architecture_features_for_revision(extractor, revision)
    print()
    return data

if not load_feature_data:
    # inspect the models of both extractors (slow) and store the result for later runs
    features_by_kind_per_architecture = pd.DataFrame(
        inspect_architecture_features('kconfigreader') + inspect_architecture_features('kmax')
    )
    with open('features.dat', 'wb') as features_file:
        pickle.dump(features_by_kind_per_architecture, features_file)
else:
    # NOTE: unpickling can execute arbitrary code, so only load a features.dat created by this notebook
    with open('features.dat', 'rb') as features_file:
        features_by_kind_per_architecture = pickle.load(features_file)

v2.5.54 . v2.5.64 . v2.5.74 . v2.6.16 . v2.6.25 . v2.6.34 . v2.6.8 . v3.16 . v3.8 . v4.16 . v4.7 . v5.15 . v5.7 . 
v2.5.54 . v2.5.64 . v2.5.74 . v2.6.16 . v2.6.25 . v2.6.34 . v2.6.8 . v3.16 . v3.8 . v4.16 . v4.7 . v5.15 . v5.7 . 


In [129]:
# merge CSV and feature data

load_feature_data = True
if not load_feature_data:
    # store the feature data that is currently held in memory
    with open('features.dat', 'wb') as features_file:
        pickle.dump(features_by_kind_per_architecture, features_file)
else:
    # NOTE: unpickling can execute arbitrary code, so only load a features.dat created by this notebook
    with open('features.dat', 'rb') as features_file:
        features_by_kind_per_architecture = pickle.load(features_file)

# both merges join on all columns that the two frames have in common
# todo: use extractor or dimacs features?
df_features = df_architectures.merge(features_by_kind_per_architecture).sort_values(by='committer_date')
df = df_kconfig.merge(df_features).sort_values(by='committer_date')

In [130]:

# Jaccard similarity of the extractor's features and the DIMACS variables, per year and extractor
# todo: nan values?
# todo: ignore kmax visibility __VISIBILITY__CONFIG_ variables?
fig = px.box(
    data_frame=df,
    x='year',
    y='features_jaccard',
    color='extractor',
    labels={
        'features_jaccard': 'Jaccard Similarity of Features and Variables',
        'extractor': 'Extractor',
        'year': 'Year',
    },
)
style_boxplot(fig)
show(fig, name='features-jaccard', width=500, height=350)

In [94]:
# share of dead features (dead DIMACS features divided by all DIMACS features), per year and extractor

fig = px.box(
    data_frame=df.assign(share_of_dead_features=lambda frame: frame['#dead_dimacs_features'] / frame['#dimacs_features']),
    x='year',
    y='share_of_dead_features',
    color='extractor',
    labels={
        'share_of_dead_features': 'Share of Dead Features',
        'extractor': 'Extractor',
        'year': 'Year',
    },
    #boxmode='overlay'
)
style_boxplot(fig)
show(fig, name='share-of-dead-features', width=500, height=350)

In [106]:
def plot_features_by_kind_per_revision(source):
    """Return a styled box plot of the number of '<source>' features per revision and extractor.

    The plot is drawn from the notebook's df. The per-revision totals of common, shared, and
    owned features are computed as well, but currently only the commented-out plots use them.
    """
    # total and common features are identical for all architectures of a revision, owned features add up
    aggregations = {
        f'#{source}_total_features': 'min',
        f'#{source}_common_features': 'min',
        f'#{source}_owned_features': 'sum',
    }
    features_by_kind_per_revision = (
        features_by_kind_per_architecture
        .groupby(['extractor', 'revision'])
        .agg(aggregations)
        .reset_index()
    )
    # shared features are all features that are neither common nor owned
    features_by_kind_per_revision[f'#{source}_shared_features'] = (
        features_by_kind_per_revision[f'#{source}_total_features']
        - features_by_kind_per_revision[f'#{source}_common_features']
        - features_by_kind_per_revision[f'#{source}_owned_features']
    )
    revision_dates = df_kconfig[['committer_date', 'revision']].drop_duplicates()
    features_by_kind_per_revision = pd.merge(revision_dates, features_by_kind_per_revision)

    # todo: currently only for kconfigreader
    # fig = px.bar(
    #     features_by_kind_per_revision.sort_values(by='committer_date'),
    #     x='revision',
    #     y=[f'#{source}_common_features', f'#{source}_shared_features', f'#{source}_owned_features'],
    #     labels=revision_labels({'value': 'Number of Features', 'variable': 'Feature Kind'}),
    #     facet_row='extractor'
    # )
    # fig = px.line(
    #     features_by_kind_per_revision.sort_values(by='committer_date'),
    #     x='revision',
    #     y=[f'#{source}_total_features'],
    #     labels=revision_labels({'value': 'Number of Features', 'variable': 'Feature Kind'}),
    #     color='extractor'
    # )
    # revision_x_axis(fig)
    # fig.update_layout(legend=dict(yanchor='top', y=0.98, xanchor='left', x=0.01))
    fig = px.box(
        data_frame=df,
        x='revision',
        y=[f'#{source}_features'],
        color='extractor',
        boxmode='overlay',
    )
    style_boxplot(fig)
    return fig

# display the plot for the 'dimacs_no_dead' feature sets; no file is written, as show() gets no name
show(plot_features_by_kind_per_revision('dimacs_no_dead'))

In [82]:
# source lines of code over time, with a linear trend

fig = px.scatter(
    data_frame=df_kconfig,
    x='committer_date',
    y='source_lines_of_code',
    labels=committer_date_labels({'source_lines_of_code': 'Number of Source Lines of Code'}),
    hover_data=['revision'],
    trendline='ols',
)

committer_date_x_axis(fig)
estimate_trend(fig, message='SLOC')
show(fig, name='sloc')

SLOC & 2,670 & 81,282 & 975,399 & 2,643,876 & 22,151,864 \\


In [38]:
# processor architectures
# todo: show timeouts, extraction failures etc. succinctly in this plot
# by adding a STATE column which is 'extracted', 'unsat', 'un#sat' etc. and mapping it onto a symbol

fig = px.scatter( # or line
    data_frame=df_architectures,
    x='committer_date',
    y='architecture',
    color='architecture',
    hover_data=['revision'],
    labels=committer_date_labels({'architecture': 'Processor Architecture'}),
)

committer_date_x_axis(fig)
fig.update_layout(showlegend=False)
fig.update_yaxes(showticklabels=False)
fig.update_traces(marker_size=4)

# legend and y-axis labels are hidden; instead, every architecture is named once,
# right-aligned at its earliest commit date
df_architectures_first_version = df_architectures.groupby('architecture').min().reset_index()
for first_date, architecture_name in zip(
    df_architectures_first_version['committer_date'],
    df_architectures_first_version['architecture'],
):
    fig.add_annotation(
        x=first_date,
        y=architecture_name,
        text=architecture_name,
        showarrow=False, yshift=0, xshift=-5, font_size=10, xanchor='right'
    )

show(fig, name='architectures')

In [151]:
# features

# this scatter plot is only used to fit one linear trend per extractor
fig = px.scatter(
    df_kconfig,
    x='committer_date',
    y='model-features',
    trendline='ols',
    color='extractor'
)
# NOTE(review): assumes that the trendlines are ordered kconfigreader, kmax -- confirm
estimate_trend(fig, 'kconfigreader', 0)
estimate_trend(fig, 'kmax', 1)

# scatter plot per architecture (currently not shown)
fig = px.scatter(
    df_kconfig,
    x='committer_date',
    y='model-features',
    color='architecture',
    labels=committer_date_labels({'model-features': 'Number of Features'}),
    hover_data=['revision', 'architecture'],
    trendline='ols'
)
committer_date_x_axis(fig)
fig.update_traces(marker_size=4)
# show(fig)

# box plot per revision, which is the figure that is shown and written to a file
fig = px.box(
    df_kconfig.sort_values(by='committer_date'),
    x='revision',
    y='model-features',
    color='extractor',
    labels=revision_labels({'model-features': 'Number of Features', 'extractor': 'Extractor'}),
    boxmode='overlay'
)
revision_x_axis(fig)
# same legend position and trace style as all other box plots in this notebook
style_boxplot(fig)
show(fig, 'features', height=350)

kconfigreader & 3 & 99 & 1,183 & 2,406 & 26,065 \\
kmax & 2 & 64 & 772 & 1,851 & 17,298 \\


In [241]:
# model count (log10)

# this scatter plot is only used to fit a linear trend
fig = px.scatter(
    df_solve,
    x='committer_date',
    y='model-count-log10',
    trendline='ols',
    color='extractor'
)
# NOTE(review): assumes that the first trendline belongs to kconfigreader -- confirm
estimate_trend(fig, 'kconfigreader', 0)
#estimate_trend(fig, 'kmax', 1)

# scatter plot per architecture (currently not shown)
fig = px.scatter(
    df_solve,
    x='committer_date',
    y='model-count-log10',
    color='architecture',
    labels=committer_date_labels({'model-count-log10': 'Number of Configurations (log<sub>10</sub>)'}),
    hover_data=['revision', 'architecture']
)
committer_date_x_axis(fig)
log10_y_axis(fig)
fig.update_traces(marker_size=4)
# show(fig)

# box plot per revision, which is the figure that is shown and written to a file
fig = px.box(
    df_solve.sort_values(by='committer_date'),
    x='revision',
    y='model-count-log10',
    color='extractor',
    labels=revision_labels({'model-count-log10': 'Number of Configurations (log<sub>10</sub>)', 'extractor': 'Extractor'}),
    boxmode='overlay'
)
revision_x_axis(fig)
log10_y_axis(fig)
# same legend position and trace style as all other box plots in this notebook
style_boxplot(fig)

# annotate every revision with its number of rows without a model count (group size minus
# number of present values); revisions without such rows get an empty annotation
model_count_group = df_solve.groupby('revision', dropna=False)['model-count-log10']
timeouts_per_revision = (model_count_group.size() - model_count_group.count()).reset_index()
for revision, timeouts in zip(timeouts_per_revision['revision'], timeouts_per_revision['model-count-log10']):
    fig.add_annotation(
        x=revision,
        y=1,
        text="" if timeouts == 0 else str(timeouts),
        showarrow=False,
        font_size=10, textangle=270, align='left', yanchor='bottom', yshift=10, font_color='gray'
    )

show(fig, 'model-count-log10', height=350)

# todo: sum model counts to get total model count, as we summed features for total features (to get an impression of the megamodel, as a new contribution)

kconfigreader & 0 & 2 & 18 & 509 & 874 \\


In [253]:
# Fit one linear trend per extractor to the log10 model counts above 1 and print the derived estimates.
# NOTE(review): in the visible cells, 'model-count-log10' is only added to df_solve, while df is merged
# from df_kconfig and the feature data -- confirm that df has this column when this cell is run
df2 = df[df['model-count-log10'] > 1]
fig = px.scatter(df2, x='committer_date', y='model-count-log10', hover_data=['revision'], color='extractor', trendline='ols')
# NOTE(review): added_weekly and estimate are not defined in the visible part of this notebook --
# presumably they stem from cells that are not shown or were deleted; verify on a fresh kernel
# their results are used as exponents of 10 (pow(10, ...) and '10^')
print("~#configurations scaled per week (kconfigreader): " + str(int(pow(10, added_weekly(fig, 0)))))
print("~#configurations scaled per week (kmax): " + str(int(pow(10, added_weekly(fig, 1)))))
print("~#configurations today (kconfigreader): 10^" + str(int(estimate(fig, 0))))
print("~#configurations today (kmax): 10^" + str(int(estimate(fig, 1))))
fig

~#configurations scaled per week (kconfigreader): 37
~#configurations scaled per week (kmax): 12
~#configurations today (kconfigreader): 10^2322
~#configurations today (kmax): 10^1593
