In [1]:
%%capture
# pip install plotly pandas statsmodels kaleido scipy nbformat jinja2

In [2]:
import glob
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pandas as pd
import numpy as np
import os.path
import json
import pickle
import scipy
from statistics import mean, stdev
from math import sqrt, log10
from packaging.version import Version


In [3]:

def read_dataframe(stage, dtype={}, usecols=None, file=None, output_directory="output-linux"):
    if not file:
        file = 'output'
    df = pd.read_csv(f'{output_directory}/{stage}/{file}.csv', dtype=dtype, usecols=usecols)
    if 'committer_date_unix' in df:
        df['committer_date'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))
    return df

In [4]:
# helper functions for drawing plots

def estimate_group(group):
    print('\\hspace{2mm} ' + group + ' \\\\')

def estimate_trend(fig, color=None, color_value=None, xs=[], key=lambda x: x.timestamp()):
    results = px.get_trendline_results(fig)
    if color is not None and color_value is not None:
        idx = [i for i, r in enumerate(results.iloc) if r[color] == color_value]
        if idx != []:
            idx = idx[0]
        else:
            idx = 0
    else:
        idx = 0
    intercept = results.iloc[idx]['px_fit_results'].params[0]
    slope = results.iloc[idx]['px_fit_results'].params[1]
    daily = slope * pd.to_timedelta(1, unit='D').total_seconds()
    weekly = slope * pd.to_timedelta(7, unit='D').total_seconds()
    monthly = slope * pd.to_timedelta(1, unit='D').total_seconds() * 30.437
    yearly = slope * pd.to_timedelta(1, unit='D').total_seconds() * 365.25
    return daily, weekly, monthly, yearly, [intercept + slope * key(x) for x in xs]

def log10_y_axis(fig):
    fig.update_yaxes(tickprefix = "10<sup>", ticksuffix = "</sup>")

def percentage_y_axis(fig):
    fig.layout.yaxis.tickformat = ',.0%'

def format_percentage(value):
    return str(round(value * 100, 2)) + '%'

def committer_date_labels(dict={}):
    return {'committer_date': 'Year<br><sup>First Release in Year</sup>'} | dict

def revision_labels(dict={}):
    return {'revision': 'Year'} | dict

def style_legend(fig, position='topleft', xshift=0, yshift=0):
    if position == 'topleft':
        fig.update_layout(legend=dict(yanchor='top', y=0.98 + yshift, xanchor='left', x=0.01 + xshift))
    elif position == 'topright':
        fig.update_layout(legend=dict(yanchor='top', y=0.98 + yshift, xanchor='right', x=0.98 + xshift))
    elif position == 'bottomright':
        fig.update_layout(legend=dict(yanchor='bottom', y=0.01 + yshift, xanchor='right', x=0.98 + xshift))
    elif position == 'bottomleft':
        fig.update_layout(legend=dict(yanchor='bottom', y=0.01 + yshift, xanchor='left', x=0.01 + xshift))
    else:
        fig.update_layout(showlegend=False)

def style_box(fig, legend_position='topleft', xshift=0, yshift=0):
    fig.update_traces(fillcolor='rgba(0,0,0,0)')
    fig.update_traces(line_width=1)
    fig.update_traces(marker_size=2)
    fig.update_layout(font_family="Linux Biolinum")
    style_legend(fig, legend_position, xshift, yshift)

def style_scatter(fig, marker_size=4, legend_position='topleft', xshift=0, yshift=0):
    if marker_size:
        fig.update_traces(marker_size=marker_size)
    style_legend(fig, legend_position, xshift, yshift)
    fig.update_layout(font_family="Linux Biolinum")

def plot_failures(fig, df, x, y, y_value, align='bottom', xref='x', font_size=10, textangle=270):
    group = df.groupby(x, dropna=False)
    failures = (group[y].size() - group[y].count()).reset_index().rename(columns={y: f'{y}_failures'})
    attempts = group[y].size().reset_index().rename(columns={y: f'{y}_attempts'})
    failures = pd.merge(failures, attempts)
    failures[f'{y}_text'] = failures[f'{y}_failures'].astype(str) + ' (' + (failures[f'{y}_failures'] / failures[f'{y}_attempts']).apply(lambda v: "{0:.1f}%".format(v * 100)) + ')'
    for row in range(len(failures)):
        text = failures.at[row, f'{y}_text']
        text = "" if failures.at[row, f'{y}_failures'] == 0 else text
        fig.add_annotation(
            x=failures.at[row, x],
            y=y_value,
            text=text,
            showarrow=False,
            font_size=font_size,
            textangle=textangle,
            align='left' if align == 'bottom' else 'right',
            yanchor='bottom' if align == 'bottom' else 'top',
            yshift=5 if align == 'bottom' else -5,
            font_color='gray',
            xref=xref
        )

def cohens_d(d1, d2):
    # uses pooled standard deviation
    n1, n2 = len(d1), len(d2)
    s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
    u1, u2 = np.mean(d1), np.mean(d2)
    return (u1 - u2) / s

def wilcoxon_test(df, column_a, column_b):
    # if the same values are returned for many inputs, refer to https://stats.stackexchange.com/q/232927
    a = df[column_a][~df[column_a].isna()]
    b = df[column_b][~df[column_b].isna()]
    d = a - b
    results = scipy.stats.wilcoxon(d, method='approx')
    p = results.pvalue
    # adapted from https://stats.stackexchange.com/q/133077
    r = np.abs(results.zstatistic / np.sqrt(len(d) * 2))
    return p, r

def style_p_values(fig, brackets, scale=0, _format=dict(interline=0.07, text_height=1.07, color='gray')):
    # adapted from https://stackoverflow.com/q/67505252
    for entry in brackets:
        first_column, second_column, y, results = entry
        y_range = [1.01+y*_format['interline'], 1.02+y*_format['interline']]
        p, r = results
        if p >= 0.05:
            symbol = 'ns'
        elif p >= 0.01: 
            symbol = '*'
        elif p >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        first_column = first_column - scale
        second_column = second_column + scale
        fig.add_shape(type="line",
            xref="x", yref="y domain",
            x0=first_column, y0=y_range[0],
            x1=first_column, y1=y_range[1],
            line=dict(color=_format['color'], width=2,)
        )
        fig.add_shape(type="line",
            xref="x", yref="y domain",
            x0=first_column, y0=y_range[1], 
            x1=second_column, y1=y_range[1],
            line=dict(color=_format['color'], width=2,)
        )
        fig.add_shape(type="line",
            xref="x", yref="y domain",
            x0=second_column, y0=y_range[0], 
            x1=second_column, y1=y_range[1],
            line=dict(color=_format['color'], width=2,)
        )
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(first_column + second_column)/2,
            y=y_range[1]*_format['text_height'],
            showarrow=False,
            text=symbol + ' <sup>(' + str(round(r, 2)) + ')</sup>',
            textangle=0,
            xref="x",
            yref="y domain"
        ))
    return fig

def bracket_for(i, j, xshift, y, results):
    return [i + xshift, j + xshift, y, results]

def filter_extractor(df, extractor):
    return df[df['extractor'] == extractor]

def annotate_value(fig, x, y, subplot, prefix, ax, ay, xanchor, df, fn=lambda prefix, y: prefix + ': ' + format(round(y), ',') if y > 0 else prefix):
    if df.empty:
        return
    if isinstance(x, str):
        x = df[x].iat[0]
    if isinstance(y, str):
        y = df[y].iat[0]
    fig.add_annotation(
        xref='x' + str(subplot),
        yref='y' + str(subplot),
        x=x,
        y=y,
        ax=ax,
        ay=ay,
        xanchor=xanchor,
        text=fn(prefix, y)
    )

def show(fig, name=None, width=1000, height=500, margin=None):
    # fig.update_layout(width=width, height=height)
    if margin:
        fig.update_layout(margin=margin)
    else:
        fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))
    
    # if figures_directory and os.path.isdir(figures_directory) and name:
        # fig.write_image(f'{figures_directory}/{name}.pdf')
    # fig.write_html(f'{figures_directory}/{name}.html',config={"responsive":True})
        
    fig.show()

In [5]:
latestData = dict()

In [6]:
def group_by_arch(df):
    grouped = df.groupby('architecture')
    dfs = {arch: group for arch, group in grouped}
    return dfs

def read_dataframe(stage, dtype={}, usecols=None, file=None, arch=None, output_dir="output-linux"):
    if not file:
        file = 'output'
    df = pd.read_csv(f'{output_dir}/{stage}/{file}.csv', dtype=dtype, usecols=usecols)
    if 'committer_date_unix' in df:
        df['committer_date'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))
    if arch != None:
        return group_by_arch(df)[arch]
    return df

def read_dataframe_linux(stage, dtype={}, usecols=None, file=None, arch=None):
    if not file:
        file = 'output'
    df = pd.read_csv(f'output-linux/{stage}/{file}.csv', dtype=dtype, usecols=usecols)
    if 'committer_date_unix' in df:
        df['committer_date'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))
    if arch != None:
        return group_by_arch(df)[arch]
    return df


def replace_values(df):
    df.replace('kconfigreader', 'KConfigReader', inplace=True)
    df.replace('kmax', 'KClause', inplace=True)

def big_log10(str):
    return log10(int(str)) if not pd.isna(str) and str != '' else pd.NA

def process_model_count(df_solve):
    df_solve['model-count'] = df_solve['model-count'].replace('1', '')
    df_solve['model-count-log10'] = df_solve['model-count'].fillna('').apply(big_log10).replace(0, np.nan)
    df_solve['year'] = df_solve['committer_date'].apply(lambda d: int(d.year))

def peek_dataframe(df, column, message, type='str', filter=['revision', 'architecture', 'extractor']):
    success = df[~df[column].str.contains('NA') if type == 'str' else ~df[column].isna()][filter]
    failure = df[df[column].str.contains('NA') if type == 'str' else df[column].isna()][filter]
    print(f'{message}: {len(success)} successes, {len(failure)} failures')
    
def jaccard(a, b):
    return len(set.intersection(a, b)) / len(set.union(a, b))

def add_features(descriptor, source, features, min=2):
    descriptor[f'#{source}'] = len(features) if features is not None and len(features) >= min else np.nan

def get_variables(variable_map):
    variables = set(variable_map.values())
    if len(variables) <= 1:
        variables = set()
    return variables

In [7]:
class Linux:
    
    def read_dataframe(self, stage, dtype={}, usecols=None, file=None, arch=None):
        if not file:
            file = 'output'
        df = pd.read_csv(f'{self.output_directory}/{stage}/{file}.csv', dtype=dtype, usecols=usecols)
        if 'committer_date_unix' in df:
            df['committer_date'] = df['committer_date_unix'].apply(lambda d: pd.to_datetime(d, unit='s'))
        if arch != None:
            return group_by_arch(df)[arch]
        return df
    
    def __init__(self):
        self.output_directory = "output-linux"
        self.df_kconfig = self.read_dataframe('kconfig')
        self.df_kconfig['year'] = self.df_kconfig['committer_date'].apply(lambda d: int(d.year))
        self.df_architectures = self.read_dataframe(f'read-linux-architectures')
        self.df_architectures = self.df_architectures.sort_values(by='committer_date')
        self.df_architectures['year'] = self.df_architectures['committer_date'].apply(lambda d: int(d.year))
        self.df_configs = self.read_dataframe(f'read-linux-configs')
        self.df_configs = self.df_configs[~self.df_configs['kconfig-file'].str.contains('/um/')]
        self.df_config_types = self.read_dataframe(f'read-linux-configs', file='output.types')
        self.df_config_types = self.df_config_types[~self.df_config_types['kconfig-file'].str.contains('/um/')]
        self.df_config_types = self.df_config_types.merge(self.df_architectures[['revision', 'committer_date']].drop_duplicates())
        self.df_uvl = self.read_dataframe('model_to_uvl_featureide')
        self.df_smt = self.read_dataframe('model_to_smt_z3')
        self.df_dimacs = self.read_dataframe('dimacs')
        self.df_backbone_dimacs = self.read_dataframe('backbone-dimacs')
        self.df_solve = self.read_dataframe('solve_model-count', {'model-count': 'string'})
        # differentiate kinds of features
        self.df_configs_configurable = self.df_configs.copy()
        self.df_configs_configurable['configurable'] = False
        with open(f'{self.output_directory}/linux-features.dat', 'rb') as f:
            [self.features_by_kind_per_architecture, self.df_extractor_comparison, self.potential_misses_grep, self.potential_misses_kmax, self.df_configs_configurable] = pickle.load(f)

        replace_values(self.features_by_kind_per_architecture)
        self.df_features = pd.merge(self.df_architectures, self.features_by_kind_per_architecture, how='outer').sort_values(by='committer_date')
        self.df_features = pd.merge(self.df_kconfig, self.df_features, how='outer').sort_values(by='committer_date')
        self.df_total_features = self.df_features.groupby(['extractor', 'revision']).agg({'#total_features': 'min'}).reset_index()
        self.df_total_features = pd.merge(self.df_kconfig[['committer_date', 'revision']].drop_duplicates(), self.df_total_features)


In [8]:
linux_dfs = Linux()

In [None]:
pio.templates['colorblind'] = go.layout.Template(layout_colorway=['#648FFF', '#FE6100', '#785EF0', '#DC267F', '#FFB000'])
pio.templates.default = 'plotly_white+colorblind'

def evaluate_features(df, extractor, date, y, median=True):
    rows = df[(df['extractor'] == extractor) & (df['committer_date'] >= date)].sort_values(by='committer_date')
    if len(rows) > 0:
        if median:
            return rows[rows['committer_date']==rows.iloc[0]['committer_date']][y].median()
        else:
            return rows.iloc[0][y]


def print_evaluation(fig, df, y, extractor, label, growth_prefix='', prefix='', postfix=''):
    dates = [pd.Timestamp('2004-01-01'), pd.Timestamp('2014-01-01'), pd.Timestamp('2024-01-01'), pd.Timestamp('2034-01-01'), pd.Timestamp('2044-01-01')]
    daily, weekly, monthly, yearly, estimated_values = estimate_trend(fig, 'extractor', extractor, dates)
    values = []
    for (date, estimated_value) in zip(dates, estimated_values):
        actual_value = evaluate_features(df, extractor, date, y)
        if actual_value:
            values.append(f"{{\\color{{gray}}(${prefix}\\text{{{round(actual_value):,}}}{postfix}$)}}")
        else:
            values.append(f"${prefix}\\text{{{round(estimated_value):,}}}{postfix}$")
    sign = lambda x: '\color{green}' if round(x) > 0 else ('\color{red}' if round(x) < 0 else '')
    print('\hspace*{4mm} ' + f'{label} & ${sign(daily)}{growth_prefix}{prefix}\\text{{{round(daily):,}}}{postfix}$ & ${sign(weekly)}{growth_prefix}{prefix}\\text{{{round(weekly):,}}}{postfix}$ & ${sign(monthly)}{growth_prefix}{prefix}\\text{{{round(monthly):,}}}{postfix}$ & ${sign(yearly)}{growth_prefix}{prefix}\\text{{{round(yearly):,}}}{postfix}$ ' + " ".join([f"& {value}" for value in values]) + ' \\\\')

def estimate_features(df, y, name):
    fig = px.scatter(
        df,
        x='committer_date',
        y=y,
        trendline='ols',
        color='extractor'
    )
    estimate_group(name)
    for (extractor, label) in [('KConfigReader', '\\kcr'), ('KClause', '\\kcl')]:
        print_evaluation(fig, df, y, extractor, label, '+\,')

def pearson_r(extractor, df):
    if len(set(df['#total_features'])) >= 1 and len([f for f in set(df['#total_features']) if pd.isna(f)]) == 0:
        s = scipy.stats.pearsonr(df['committer_date'].astype(int) // 10 ** 9, df['#total_features'])
        print(f'pearson for {extractor}: ' + str(round(s.statistic, 2)) + ', ' + str(round(s.pvalue, 2)))

df_total_features = linux_dfs.df_features.groupby(['extractor', 'revision']).agg({'#total_features': 'min'}).reset_index()
df_total_features = pd.merge(linux_dfs.df_kconfig[['committer_date', 'revision']].drop_duplicates(), df_total_features)

estimate_features(linux_dfs.df_features.groupby(['extractor', 'revision', 'committer_date']).min('#total_features').reset_index(), f'#total_features', 'Total')
estimate_features(linux_dfs.df_features[linux_dfs.df_features['architecture'] == 'arm'], '#features', '\\arch{arm}')
estimate_features(linux_dfs.df_features[(linux_dfs.df_features['architecture'] == 'i386') | (linux_dfs.df_features['architecture'] == 'x86')], '#features', '\\arch{x86}')

pearson_r('KConfigReader', df_total_features[df_total_features['extractor']=='KConfigReader'])
pearson_r('KClause', df_total_features[df_total_features['extractor']=='KClause'])

for extractor in ['KConfigReader', 'KClause']:
    for arch in set(linux_dfs.df_architectures['architecture'].drop_duplicates()):
        pearson_r(f'{arch} ({extractor})', linux_dfs.df_features[(linux_dfs.df_features['extractor']==extractor)&(linux_dfs.df_features['architecture']==arch)])

print('#features per architecture (KConfigReader):')
print('min=' + str(linux_dfs.df_features[linux_dfs.df_features['extractor']=='KConfigReader']['#features'].min()))
print('median=' + str(linux_dfs.df_features[linux_dfs.df_features['extractor']=='KConfigReader']['#features'].median()))
print('max=' + str(linux_dfs.df_features[linux_dfs.df_features['extractor']=='KConfigReader']['#features'].max()))

print('#features per architecture (KClause):')
print('min=' + str(linux_dfs.df_features[linux_dfs.df_features['extractor']=='KClause']['#features'].min()))
print('median=' + str(linux_dfs.df_features[linux_dfs.df_features['extractor']=='KClause']['#features'].median()))
print('max=' + str(linux_dfs.df_features[linux_dfs.df_features['extractor']=='KClause']['#features'].max()))

df = linux_dfs.df_features[linux_dfs.df_features['extractor']=='KClause']['#features']/linux_dfs.df_features[linux_dfs.df_features['extractor']=='KClause']['#total_features']
print('number of architecture features per total features (KClause):')
print('min=' + str(df.min()))
print('median=' + str(df.median()))
print('max=' + str(df.max()))

fig = px.scatter(
    df_total_features.sort_values(by='committer_date'),
    x='committer_date',
    y='#total_features',
    facet_col='extractor',
    labels={'#total_features': '#Features (Total)', 'extractor': 'Extractor', 'committer_date': 'Year'},
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
style_scatter(fig)
fn = lambda prefix, y: format(round(y), ',')
annotate_value(fig, 'committer_date', 0, 1, 'v2.5.45', 0, -15, 'center', linux_dfs.df_features[linux_dfs.df_features['revision'] == 'v2.5.45'])
annotate_value(fig, 'committer_date', 0, 1, 'v6.11', -10, -15, 'center', linux_dfs.df_features[linux_dfs.df_features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#total_features', 1, 'KConfigReader', 40, 0, 'left',
               linux_dfs.df_features[(linux_dfs.df_features['extractor'] == 'KConfigReader')&(linux_dfs.df_features['revision'] == 'v2.5.45')], fn)
annotate_value(fig, 'committer_date', '#total_features', 1, 'KConfigReader', -10, 30, 'right',
               linux_dfs.df_features[(linux_dfs.df_features['extractor'] == 'KConfigReader')&(linux_dfs.df_features['revision'] == 'v6.11')], fn)
annotate_value(fig, 'committer_date', 0, 2, 'v2.5.45', 0, -15, 'center', linux_dfs.df_features[linux_dfs.df_features['revision'] == 'v2.5.45'])
annotate_value(fig, 'committer_date', 0, 2, 'v6.11', -10, -15, 'center', linux_dfs.df_features[linux_dfs.df_features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#total_features', 2, 'KClause', 40, 0, 'left',
               linux_dfs.df_features[(linux_dfs.df_features['extractor'] == 'KClause')&(linux_dfs.df_features['revision'] == 'v2.5.45')], fn)
annotate_value(fig, 'committer_date', '#total_features', 2, 'KClause', -10, 30, 'right',
               linux_dfs.df_features[(linux_dfs.df_features['extractor'] == 'KClause')&(linux_dfs.df_features['revision'] == 'v6.11')], fn)
fig.update_yaxes(tickprefix = "   ")
fig.update_xaxes(range=["2002-01-01", "2024-12-01"])
fig.update_yaxes(range=[0, 20500])
show(fig, 'total-features', height=220, width=750, margin=dict(l=0, r=0, t=20, b=0))
fig = px.scatter(
    linux_dfs.features,
    x='committer_date',
    y=f'#features',
    color='architecture',
    labels={f'#features': '#Features (Arch.)', 'extractor': 'Extractor', 'committer_date': 'Year'},
    hover_data=['revision', 'architecture'],
    facet_col='extractor',
    category_orders={'extractor': ['KConfigReader', 'KClause']}
)
style_scatter(fig, legend_position=None, marker_size=2.5)
annotate_value(fig, 'committer_date', 0, 1, 'v4.16', 0, -15, 'center', linux_dfs.features[ linux_dfs.features['revision'] == 'v4.16'])
annotate_value(fig, 'committer_date', 0, 1, 'v6.11', -10, -15, 'center', linux_dfs.features[ linux_dfs.features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#features', 1, 'arm', -10, -20, 'right', linux_dfs.features[( linux_dfs.features['extractor'] == 'KConfigReader')&( linux_dfs.features['architecture'] == 'arm')&( linux_dfs.features['revision'] == 'v6.11')],)
annotate_value(fig, 'committer_date', '#features', 1, 'x86', -100, -20, 'right',linux_dfs.features[( linux_dfs.features['extractor'] == 'KConfigReader')&( linux_dfs.features['architecture'] == 'x86')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 1, 'arm64', -120, 0, 'right',linux_dfs.features[( linux_dfs.features['extractor'] == 'KConfigReader')&( linux_dfs.features['architecture'] == 'arm64')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 1, 'nios2', -5, 40, 'right', linux_dfs.features[( linux_dfs.features['extractor'] == 'KConfigReader')&( linux_dfs.features['architecture'] == 'nios2')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 1, 'score', 20, 0, 'left',   linux_dfs.features[( linux_dfs.features['extractor'] == 'KConfigReader')&( linux_dfs.features['architecture'] == 'score')&( linux_dfs.features['revision'] == 'v4.16')])
annotate_value(fig, 'committer_date', 0, 2, 'v4.16', 0, -15, 'center', linux_dfs.features[ linux_dfs.features['revision'] == 'v4.16'])
annotate_value(fig, 'committer_date', 0, 2, 'v6.11', -10, -15, 'center', linux_dfs.features[ linux_dfs.features['revision'] == 'v6.11'])
annotate_value(fig, 'committer_date', '#features', 2, 'arm', -10, -20, 'right', linux_dfs.features[( linux_dfs.features['extractor'] == 'KClause')&( linux_dfs.features['architecture'] == 'arm')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'x86', -100, -20, 'right',linux_dfs.features[( linux_dfs.features['extractor'] == 'KClause')&( linux_dfs.features['architecture'] == 'x86')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'arm64', -120, 0, 'right',linux_dfs.features[( linux_dfs.features['extractor'] == 'KClause')&( linux_dfs.features['architecture'] == 'arm64')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'nios2', -5, 40, 'right', linux_dfs.features[( linux_dfs.features['extractor'] == 'KClause')&( linux_dfs.features['architecture'] == 'nios2')&( linux_dfs.features['revision'] == 'v6.11')])
annotate_value(fig, 'committer_date', '#features', 2, 'score', 20, 0, 'left',   linux_dfs.features[( linux_dfs.features['extractor'] == 'KClause')&( linux_dfs.features['architecture'] == 'score')&( linux_dfs.features['revision'] == 'v4.16')])
fig.update_yaxes(tickprefix = "    ")
fig.update_xaxes(range=["2002-01-01", "2024-12-01"])
fig.update_yaxes(range=[0, 21000])
show(fig, 'features', height=220, width=750)



  sign = lambda x: '\color{green}' if round(x) > 0 else ('\color{red}' if round(x) < 0 else '')
  sign = lambda x: '\color{green}' if round(x) > 0 else ('\color{red}' if round(x) < 0 else '')
  print('\hspace*{4mm} ' + f'{label} & ${sign(daily)}{growth_prefix}{prefix}\\text{{{round(daily):,}}}{postfix}$ & ${sign(weekly)}{growth_prefix}{prefix}\\text{{{round(weekly):,}}}{postfix}$ & ${sign(monthly)}{growth_prefix}{prefix}\\text{{{round(monthly):,}}}{postfix}$ & ${sign(yearly)}{growth_prefix}{prefix}\\text{{{round(yearly):,}}}{postfix}$ ' + " ".join([f"& {value}" for value in values]) + ' \\\\')
  print_evaluation(fig, df, y, extractor, label, '+\,')


\hspace{2mm} Total \\
\hspace*{4mm} \kcr & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{16}$ & $\color{green}+\,\text{69}$ & $\color{green}+\,\text{825}$ & {\color{gray}($\text{3,489}$)} & {\color{gray}($\text{12,938}$)} & {\color{gray}($\text{19,444}$)} & $\text{28,710}$ & $\text{36,962}$ \\
\hspace*{4mm} \kcl & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{16}$ & $\color{green}+\,\text{70}$ & $\color{green}+\,\text{845}$ & {\color{gray}($\text{3,490}$)} & {\color{gray}($\text{13,520}$)} & {\color{gray}($\text{19,621}$)} & $\text{29,361}$ & $\text{37,806}$ \\
\hspace{2mm} \arch{arm} \\
\hspace*{4mm} \kcr & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{13}$ & $\color{green}+\,\text{55}$ & $\color{green}+\,\text{662}$ & {\color{gray}($\text{2,108}$)} & {\color{gray}($\text{8,479}$)} & {\color{gray}($\text{15,036}$)} & $\text{21,927}$ & $\text{28,550}$ \\
\hspace*{4mm} \kcl & $\color{green}+\,\text{2}$ & $\color{green}+\,\text{13}$ & $\color{green}+\,\text{55}$ & $\col

In [None]:
linux_dfs.total_features()

In [None]:
def latest_for(df, column, committer_date):
    x = df.sort_values(by=[committer_date])
    return x.tail(1)[column]

In [None]:
def by_revision(df):
    x = df[df['revision'].str.contains("\w\d+\.0$", regex=True)]
    if len(x) == 0:
        x = df.sort_values(by=["revision"])
    return x

In [None]:
def find_revision(df, revision):
    x = df[df['revision'].str.contains(revision, regex=False)]
    return x

In [None]:
def for_arch(df, arch):
    return df[df['architecture'] == arch]

In [None]:
x = for_arch(df_kconfig, "x86_64")
x = by_revision(x)
x["revision"]

In [None]:
import json
def write_object_to_file(obj, name):
    with open(name, 'w') as fp:
        json.dump(obj, fp)
def read_json(path):
    with open(path) as json_data:
        return json.load(json_data)

In [None]:
def get_metrics_sloc_linux():
    output_directory = "output-linux"
    df_kconfig = read_dataframe("kconfig", output_dir=output_directory, )
    archs = df_kconfig["architecture"].unique()
    vals = dict()
    for arch in archs:
        df_arch = for_arch(df_kconfig, arch)
        df_arch = by_revision(df_arch)
        sloc = int(
            latest_for(df_arch, "source_lines_of_code", "committer_date_unix").iloc[0]
        )
        last_rev = latest_for(df_arch, "revision", "committer_date_unix").iloc[0]
        major = int(last_rev[1])
        before_last = df_arch[df_arch['revision'].str.contains(f"\w{major-1}\.\d$", regex=True)]
        if len(before_last) == 0:
            vals[f"linux/{arch}"] = {
                "source_lines_of_code": {
                    "currentValue": sloc,
                    "cmpLastRevision": "+100% (No Prior Revision)",
                }
            }
            continue
        before_last = before_last["source_lines_of_code"]
        before_last = int(before_last.iloc[0])
        value = round(100 * (sloc - before_last) / before_last, 2)
        vals[f"linux/{arch}"] = {
            "source_lines_of_code": {
                "currentValue": f"{sloc} loc",
                "cmpLastRevision": f"{value:+.1f}%",
            }
        }

    return vals

In [None]:
def merge_metrics(new):
    old = read_json("src/public/init.json")
    
    for proj, metrics in new.items():
        for metric, values in metrics.items():
            # print(f"{proj=}, {metric=}, {values=}")
            for name, value in values.items():
                if proj not in old["projectData"]:
                    print(f"{proj} not in old")
                    continue
                old["projectData"][proj][metric][name] = value
    write_object_to_file(old, "src/public/init.json")

In [None]:
new = get_metrics_sloc_linux()

In [None]:
merge_metrics(new)

In [None]:
output_directory = "output-busybox"
df_kconfig = read_dataframe('kconfig')
df_kconfig = df_kconfig[df_kconfig["system"] =="busybox"]
def get_metrics_sloc_nonLinux(project):
    vals = dict()
    df_arch = by_revision(df_kconfig)
    lastTwo = df_arch.sort_values(by="committer_date_unix").tail(2)["revision"]
    print(lastTwo)
    last_rev = lastTwo.iloc[1]
    before_last_rev = lastTwo.iloc[0]
    sloc = int(df_arch[df_arch["revision"]==last_rev]["source_lines_of_code"].iloc[0])
    before_last = int(df_arch[df_arch["revision"]==before_last_rev]["source_lines_of_code"].iloc[0])
    print(sloc, before_last)
    value = round(100 * (sloc - before_last) / before_last, 2)
    vals[project] = {
        "source_lines_of_code": {
            "currentValue": f"{sloc} loc",
            "cmpLastRevision": f"{value:+.1f}%",
        }
    }

    return vals

In [None]:
x = get_metrics_sloc_nonLinux("busybox")


In [None]:

merge_metrics(x)