In [429]:
import re
import plotly.express as px
import plotly.graph_objs as go
import pandas as pd
import statsmodels

output_directory = 'output-clean'
figures_directory = '../../paper-icse-2024-linux/figures'

def estimate_trend(fig, message='', idx=0, date1=pd.Timestamp.now() - pd.Timedelta(days=365.25*20), date2=pd.Timestamp.now()):
    results = px.get_trendline_results(fig)
    intercept = results.iloc[idx]['px_fit_results'].params[0]
    slope = results.iloc[idx]['px_fit_results'].params[1]
    daily = slope * pd.to_timedelta(1, unit='D').total_seconds()
    #weekly = slope * pd.to_timedelta(1, unit='W').total_seconds()
    monthly = slope * pd.to_timedelta(1, unit='D').total_seconds() * 30.437
    yearly = slope * pd.to_timedelta(1, unit='D').total_seconds() * 365.25
    on_date1 = intercept + slope * date1.timestamp()
    on_date2 = intercept + slope * date2.timestamp()
    print(f'{message} & {round(daily):,} & {round(monthly):,} & {round(yearly):,} & {round(on_date1):,} & {round(on_date2):,} \\\\')

def read_dataframe(stage):
    df = pd.read_csv(f'../{output_directory}/{stage}/output.csv')
    if 'committer_date_unix' in df:
        df['committer_date'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))
    return df

def peek_dataframe(df, column, message, type='str', filter=['revision', 'architecture', 'extractor']):
    success = df[~df[column].str.contains('NA') if type == 'str' else ~df[column].isna()][filter]
    failure = df[df[column].str.contains('NA') if type == 'str' else df[column].isna()][filter]
    print(f'{message}: {len(success)} successes, {len(failure)} failures')

df_kconfig = read_dataframe('kconfig')
peek_dataframe(df_kconfig, 'model-file', 'model extraction')

df_uvl = read_dataframe('model_to_uvl_featureide')
peek_dataframe(df_uvl, 'uvl-file', 'UVL transformation', 'na', ['model-file'])

df_xml = read_dataframe('model_to_xml_featureide')
peek_dataframe(df_xml, 'xml-file', 'XML transformation', 'na', ['model-file'])

df_smt = read_dataframe('model_to_smt_z3')
peek_dataframe(df_smt, 'smt-file', 'SMT transformation', 'na', ['model-file'])

df_dimacs = read_dataframe('dimacs')
peek_dataframe(df_dimacs, 'dimacs-file', 'CNF transformation')

df_architectures = read_dataframe(f'read-linux-architectures')
df_architectures = df_architectures.sort_values(by='committer_date')

def committer_date_x_axis(fig, df=df_kconfig, append_revision=True):
    axis = df_kconfig[['committer_date', 'revision']].drop_duplicates()
    axis['year'] = axis['committer_date'].apply(lambda d: str(d.year))
    axis = axis.sort_values(by='committer_date').groupby('year').nth(0).reset_index()
    fig.update_xaxes(
        ticktext=axis['year'].str.cat('<br><sup>' + axis['revision'].str[1:] + '</sup>')[1:] if append_revision else axis['year'],
        tickvals=axis['year'][1:]
    )

def revision_x_axis(fig, df=df_kconfig):
    axis = df_kconfig[['committer_date', 'revision']].drop_duplicates()
    axis['year'] = axis['committer_date'].apply(lambda d: str(d.year))
    axis = axis.sort_values(by='committer_date').groupby('year').nth(0).reset_index()
    fig.update_xaxes(
        ticktext=axis['year'],
        tickvals=axis['revision']
    )

def committer_date_labels(dict={}):
    return {'committer_date': 'Year / First Release in Year'} | dict

def revision_labels(dict={}):
    return {'revision': 'Year'} | dict

def show(fig, name=None, width=1000, height=500, crop=True):
    fig.update_layout(width=width, height=height)
    if crop:
        fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    if figures_directory and name:
        fig.write_image(f'{figures_directory}/{name}.pdf')
    fig.show()

#df['model-count-log10'] = (df['model-count']).fillna(0).map(lambda s: len(str(s))) #todo

# count nontseitin,nondead variables as features

model extraction: 6246 successes, 46 failures
UVL transformation: 4991 successes, 1255 failures
XML transformation: 6215 successes, 31 failures
SMT transformation: 6246 successes, 0 failures
CNF transformation: 6246 successes, 0 failures


In [431]:
# source lines of code

fig = px.scatter(
    df_kconfig,
    x='committer_date',
    y='source_lines_of_code',
    trendline='ols',
    labels=committer_date_labels({'source_lines_of_code': 'Number of Source Lines of Code'}),
    hover_data=['revision']
)

committer_date_x_axis(fig)
estimate_trend(fig, 'SLOC')
show(fig, 'sloc')

SLOC & 2,670 & 81,282 & 975,399 & 2,625,588 & 22,133,576 \\


In [414]:
# processor architectures

fig = px.line(
    df_architectures,
    x='committer_date',
    y='architecture',
    color='architecture',
    labels=committer_date_labels({'architecture': 'Processor Architecture'}),
    hover_data=['revision']
)

committer_date_x_axis(fig)
fig.update_layout(showlegend=False)
fig.update_yaxes(showticklabels=False)

df_architectures_first_version = df_architectures.groupby(['architecture']).min().reset_index()
for row in range(len(df_architectures_first_version)):
    fig.add_annotation(
        x=df_architectures_first_version.at[row, 'committer_date'],
        y=df_architectures_first_version.at[row, 'architecture'],
        text=df_architectures_first_version.at[row, 'architecture'],
        showarrow=False, yshift=0, xshift=-5, font_size=10, xanchor='right'
    )

show(fig, 'architectures')

In [430]:
# features

fig = px.scatter(
    df_kconfig,
    x='committer_date',
    y='model-features',
    trendline='ols',
    color='extractor'
)
estimate_trend(fig, 'kconfigreader', 0)
estimate_trend(fig, 'kmax', 1)

fig = px.scatter(
    df_kconfig,
    x='committer_date',
    y='model-features',
    color='architecture',
    labels=committer_date_labels({'model-features': 'Number of Features'}),
    hover_data=['revision', 'architecture']
)
committer_date_x_axis(fig)
fig.update_traces(marker_size=4)
# show(fig)

fig = px.box(
    df_kconfig.sort_values(by='committer_date'),
    x='revision',
    y='model-features',
    color='extractor',
    labels=revision_labels({'model-features': 'Number of Features', 'extractor': 'Extractor'}),
    boxmode='overlay'
)
revision_x_axis(fig)
fig.update_layout(legend=dict(yanchor='top', y=0.98, xanchor='left', x=0.01))
fig.update_traces(fillcolor='rgba(0,0,0,0)')
fig.update_traces(line_width=1)
fig.update_traces(marker_size=2)
show(fig, 'features', height=350)

kconfigreader & 3 & 99 & 1,183 & 2,384 & 26,043 \\
kmax & 2 & 64 & 772 & 1,837 & 17,284 \\


In [2]:
# features (common, shared, owned)

import glob
import re

def inspect_architecture_features_for_revision(extractor, revision):
    architectures = [re.search('\[(.*)\]', f).group(1) for f in glob.glob(f'../{output_directory}/{extractor}/linux/{revision}[*.features')]
    architectures = list(set(architectures))
    architectures.sort()
    data = []
    for architecture in architectures:
        features_filename = f'../{output_directory}/{extractor}/linux/{revision}[{architecture}].features'
        with open(features_filename, 'r') as f:
            features = set([f.strip() for f in f.readlines()])
        data.append({'extractor': extractor, 'revision': revision, 'architecture': architecture, 'features': features, '#features': len(features)})
    common_features = set.intersection(*[d['features'] for d in data])
    for architecture in architectures:
        descriptor = [d for d in data if d['architecture'] == architecture][0]
        other_features = set.union(*[d['features'] for d in data if d['architecture'] != architecture])
        descriptor['#common_features'] = len(common_features)
        descriptor['owned_features'] = descriptor['features'].difference(other_features)
        descriptor['#owned_features'] = len(descriptor['owned_features'])
        descriptor['#shared_features'] = len(descriptor['features'].difference(common_features).difference(descriptor['owned_features']))
    return data

def inspect_architecture_features(extractor):
    revisions = [re.search('linux/(.*)\[', f).group(1) for f in glob.glob(f'../{output_directory}/{extractor}/linux/*.features')]
    revisions = list(set(revisions))
    revisions.sort()
    data = []
    for revision in revisions:
        print(revision)
        data += inspect_architecture_features_for_revision(extractor, revision)
    return pd.DataFrame(data).drop(columns=['features', 'owned_features'])

arch_features = inspect_architecture_features('kconfigreader')

KeyError: "['features', 'owned_features'] not found in axis"

In [224]:
arch2 = arch_features.melt(id_vars=['extractor', 'revision', 'architecture'], value_vars=['#features', '#common_features', '#owned_features', '#shared_features'])
fig = px.box(arch2, x='revision', y='value', color='variable')
fig
# todo: calculate core dead features and remove(?) them from here, or plot them, and find a "good" definition of what it means for an architecture to "include" a feature
#.e.g., dead feature occur in a formula, but not really in the architecture (related work does not do this)
# i guess there may be many core/dead features per arch, as later all arch's have been unified and decision propagation does the rest => i.e., it is important to analyze formulas, not only syntax of kconfig, so our computation effort is justified

In [219]:
arch2 = arch_features.melt(id_vars=['extractor', 'revision', 'architecture'], value_vars=['#common_features', '#shared_features', '#owned_features'])
fig = px.bar(arch2[arch2['architecture'] == 'x86'], x='revision', y='value', color='variable')
fig = px.bar(arch2, x='revision', y='value', color='variable', log_y=True)
fig

In [76]:
fig = px.scatter(df, x='committer_date', y='dimacs-variables', hover_data=['revision'], trendline='ols', color='extractor')
print("~#variables added per week (kconfigreader): " + str(int(added_weekly(fig, 0))))
print("~#variables added per week (kmax): " + str(int(added_weekly(fig, 1))))
fig

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['system', 'revision', 'architecture', 'binding-file', 'kconfig-file', 'environment', 'model-file', 'model-features', 'model-variables', 'model-literals', 'model-time'] but received: committer_date

In [253]:
df2 = df[df['model-count-log10'] > 1]
fig = px.scatter(df2, x='committer_date', y='model-count-log10', hover_data=['revision'], color='extractor', trendline='ols')
print("~#configurations scaled per week (kconfigreader): " + str(int(pow(10, added_weekly(fig, 0)))))
print("~#configurations scaled per week (kmax): " + str(int(pow(10, added_weekly(fig, 1)))))
print("~#configurations today (kconfigreader): 10^" + str(int(estimate(fig, 0))))
print("~#configurations today (kmax): 10^" + str(int(estimate(fig, 1))))
fig

~#configurations scaled per week (kconfigreader): 37
~#configurations scaled per week (kmax): 12
~#configurations today (kconfigreader): 10^2322
~#configurations today (kmax): 10^1593


In [194]:
df = pd.read_csv('../output/kconfig/output.csv')
df['committer_date_unix'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))

fig = px.scatter(df, x='model-features', y='revision-parameter', color='extractor', size='model-literals')
fig.show()
#todo: for history, consider boxplot

In [198]:
fig = px.box(df, x='committer_date_unix', y='model-features', color='extractor')
fig.show()
#todo: for history, consider boxplot