In [None]:
import re
import json
#from copy import copy      # used to save plots. watch your memory.
#from pprint import pprint  # pretty iterables

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn import linear_model
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score

# Global matplotlib defaults for every figure in this notebook:
# large monospace text and thick lines.
plt.rcParams.update({
    'axes.labelsize':  20,
    'axes.titlesize':  26,
    'font.family':     'monospace',
    'font.size':       22,
    'legend.fontsize': 22,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
    'lines.linewidth': 4,
})

In [None]:
# label-cleanup hack - fragile, edit with care
def pretty(ss):
    """Turn a raw feature/column name into a title-cased label.

    Runs of punctuation (``_ , . - @ : ( )``) become a single space, the text
    is lower-cased, the tokens 'instcount', 'edgecounter', 'varcounter' and
    'using getnumuses' are removed, and the stripped result is title-cased.
    """
    trash = r'[\_,\.,\,,\-,_,@,\:,\(,\)]+'
    # split()/join collapses any whitespace left behind by the substitution
    label = ' '.join(re.sub(trash, ' ', ss).split()).lower()
    for noise in ('instcount', 'edgecounter', 'varcounter', 'using getnumuses'):
        label = label.replace(noise, '')
    return label.strip().title()

# suite display names (hack) - edit with care
# Raw suite identifier -> human-readable suite label (with approximate size).
# The labels replace the raw identifiers in the DataFrame index.
suite_names = dict(
    tf='LLVM Test Suite (~240)',
    AnghaBench_WholeFiles='Angha Whole Files (~15K)',
    largest10='Angha (Largest 10K)',
    ldrgen_10K='Ldrgen (Largest 10K)',
    CSmith_10K='CSmith (Largest 10K)',
    deepsmith_kernels='Deepsmith Kernels (~1K)',
    all_kernels='All Angha Kernels (~530K)',
)

In [None]:
# Load the per-benchmark feature table. The bz2 pickle acts as a cache: when it
# is missing, the table is rebuilt from data/stats.json and the cache is written.
# NOTE(review): unpickling can execute arbitrary code - only load a cache file
# that this notebook produced.
try:
    df = pd.read_pickle('data/stats.pkl.bz2', compression='bz2')

except FileNotFoundError:
    # json load - named df to save memory, the file might be huge
    # (opened read-only: the file is only parsed, never written)
    with open("data/stats.json", "r") as finput:
        df = json.load(finput)

    # remove punctuation
    trash = r'[\_,\.,\,,\-,_,@,\:,\(,\)]+'
    # key:   (suite name without its directory part, sanitized benchmark name)
    # value: {sanitized lower-case feature name, trailing '_' removed: value}
    df = {
        (re.sub(r'.*/', '', x['suitename']), re.sub(trash, '_', x['benchname'])): {
            re.sub(r'_$', '', re.sub(trash, '_', k.lower())): v
            for k, v in x['benchdata'].items()
        }
        for x in df
    }

    # to dataframe
    df = pd.DataFrame.from_dict(df, orient='index').fillna(0).astype(np.int32)

    # get that pickle, save that pickle, wrap wrap bz2 bz2
    df.to_pickle('data/stats.pkl.bz2', compression='bz2')

# Post-processing runs after a successful load only. Inside a `finally:` clause it
# would also run while an unexpected exception is propagating and could hide the
# real error behind a secondary one.

# suite identifiers -> display names (first index level only)
df = df.rename(index=suite_names, level=0)

# suite names
suites = df.index.unique(0)

# x, y
plot_features = [
    ('instcount_number_of_load_insts', 'instcount_number_of_store_insts'),
    ('instcount_number_of_instructions_of_all_types', 'instcount_number_of_basic_blocks'),
    ('varcounter_number_of_variable_uses_using_getnumuses', 'varcounter_number_of_named_variables'),
    ('edgecounter_number_of_edges', 'instcount_number_of_basic_blocks'),
]

# ratios: one derived column per feature pair, named "<y> /<br><x>"
ratios = []
ftsep = ' /<br>'
for ftx, fty in plot_features:
    ratio_name = fty + ftsep + ftx
    # the 0.1 offset keeps the division finite when the x feature is 0
    df[ratio_name] = df[fty] / (df[ftx] + 0.1)
    ratios.append(ratio_name)

# {'suite1': {'X1,Y1': {'stat1': 5, 'stat2': 10}, 'X2,Y2': {'stat1': 5, 'stat2': 10}}, 'suite2':{{..}..}..}
# (the reference suite 'tf' gets no entry)
stats = {x: {sx + '_x_' + sy: {} for sx, sy in plot_features} for x in suites if x != suite_names['tf'] }

In [None]:
# Sanity check: (rows, columns) of the table, then its first five rows.
print(df.shape)
df.head()

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center>Linear Regression - Stats</center>

In [None]:
# Regression metrics per suite (the reference suite 'tf' excluded), keyed by the
# ratio label of each feature pair (fty + ftsep + ftx).
stats = {x: {fty + ftsep + ftx: {} for ftx, fty in plot_features} for x in suites if x != suite_names['tf'] }

# tf regression: slope and intercept of the reference suite for every (x, y) pair
tf_regr = {(t[0], t[1]) : {} for t in plot_features}
for ftx, fty in plot_features:
    fx = np.array(df.loc[suite_names['tf']][ftx]).reshape(-1, 1)
    fy = np.array(df.loc[suite_names['tf']][fty])
    regr = linear_model.LinearRegression()
    regr.fit(fx, fy)
    tf_regr[(ftx, fty)]['coef'] = regr.coef_
    tf_regr[(ftx, fty)]['intercept'] = regr.intercept_

# metric name prefix -> scoring function; insertion order fixes the column order
metric_fns = {
    'mse': mean_squared_error,
    'mae': mean_absolute_error,
    'evs': explained_variance_score,
    'r2s': r2_score,
}

# suites' regressions
for suite in [s for s in suites if s != suite_names['tf']]:
    for ftx, fty in plot_features:

        # setdefault: creates the slot if `stats` does not have this key yet
        entry = stats[suite].setdefault(fty + ftsep + ftx, {})

        fx = np.array(df.loc[suite][ftx]).reshape(-1, 1)
        fy = np.array(df.loc[suite][fty])

        ## suite linear regression
        regr = linear_model.LinearRegression()
        regr.fit(fx, fy)
        y_reg = fx * regr.coef_ + regr.intercept_  # y = ax + b

        ## tf regression on suite's X for reference
        y_tf_reg = fx * tf_regr[(ftx, fty)]['coef'] + tf_regr[(ftx, fty)]['intercept']

        # key suffix -> (values treated as truth, values treated as prediction)
        comparisons = {
            '_self':   (fy, y_reg),        # suite data vs. its own fit
            'Regr_tf': (y_reg, y_tf_reg),  # suite fit vs. tf fit
            'True_tf': (fy, y_tf_reg),     # suite data vs. tf fit
        }
        for suffix, (y_true, y_pred) in comparisons.items():
            for prefix, metric in metric_fns.items():
                entry[prefix + suffix] = metric(y_true, y_pred)

# convert stats to table
gen_table = False  # set to True to build the HTML summary table
if gen_table:
    midx  = pd.MultiIndex.from_tuples([(suite, title) for suite, v in stats.items() for title in v.keys()])
    table = [dt for v in stats.values() for dt in v.values()]

    tdf   = pd.DataFrame(data=table, index=midx)
    # render from the DataFrame: the list of dicts has no to_html()
    table = tdf.to_html()  # print(stats_html)

<div style="background-color: rgba(141, 215, 145, 0.5);">
    <br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
    <h1><center>Ratio plots</center></h1>
</div>

In [None]:
colors = ['rgb(127, 161, 255)', 'rgb(255, 101, 101)', 'rgb(141, 215, 145)',
          'rgb(255, 188, 66)', 'rgb(226, 135, 223)', 'rgb(143, 89, 50)', 'rgb(247, 243, 114)']

# Axis labels depend only on the ratio, so build them once instead of once per row.
ratio_labels = [pretty(ft).replace('/','<br>÷') for ft in ratios]

# boxdata['y'][suite]: ratio values inside the suite's 5%-95% quantile band,
#                      all ratios concatenated
# boxdata['x'][suite]: one [label, ratio index] pair per value in 'y'
boxdata = {'x': {}, 'y': {}}
for suite in [s for s in suites if (s != suite_names['all_kernels'])]:
    suite_df = df.loc[suite]
    boxdata['x'][suite] = []
    boxdata['y'][suite] = []
    for idx, ft in enumerate(ratios):
        values = suite_df[ft]
        bot = values.quantile(0.05)
        top = values.quantile(0.95)
        # trim both tails so outliers do not dominate the boxes
        kept = values[(values >= bot) & (values <= top)]
        boxdata['x'][suite] += [[ratio_labels[idx], idx] for _ in range(len(kept))]
        boxdata['y'][suite] += list(kept)

fig = go.Figure()

# suites left out of the figure
skipped = {suite_names['all_kernels'], suite_names['AnghaBench_WholeFiles'], suite_names['deepsmith_kernels']}

for idx, suite in enumerate([s for s in suites if s not in skipped]):
    fig.add_trace(go.Box(
        boxpoints=False,
        y=boxdata['y'][suite],
        x=[pair[0] for pair in boxdata['x'][suite]],
        name=suite,
        marker_color=colors[idx % len(colors)],  # wrap around instead of failing on extra suites
        #line_width=0,
    ))

fig.update_layout(
    title={
        'text': 'Ratios',
        'y':0.9,
        'x':0.42,
        'xanchor': 'center',
        'yanchor': 'top'},
    boxmode='group', # group together boxes of the different traces for each value of x
    #yaxis_range=[-0.25,2],
    width=1024,
    height=768,
    #yaxis_type='log',
    #yaxis_autorange=True,
)
fig.show()

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center>Program distance</center>

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center>Linear Regression - Plots</center>

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center>Linear Regression - Plots</center>

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center>Linear Regression - Plots</center>

In [None]:
plt.rcParams['figure.figsize']  = (12, 8)
plt.rcParams['font.size']       = 22

# One PNG per (suite, feature pair): scatter of the data, the y = x diagonal,
# the suite's own regression line and the tf regression line for reference.
# Requires `tf_regr` from the regression-stats cell.
for suite in df.index.unique(0):
    # plot_no is the 1-based position of the pair in plot_features; it names the file
    for plot_no, (sx, sy) in enumerate(plot_features, start=1):

        fx = np.array(df.loc[suite][sx]).reshape(-1, 1)
        fy = np.array(df.loc[suite][sy])

        # each figure holds a single Axes and is closed right after saving
        fig, ax = plt.subplots()

        # scatter
        ax.plot(fx, fy, '.r', lw=0.1)

        # diagonal
        x = np.linspace(0, np.max(fx))
        ax.plot(x, x, '-k', lw=0.5)

        # suite linear regression
        regr = linear_model.LinearRegression()
        regr.fit(fx, fy)
        suite_slope = regr.coef_[0]
        tf_slope = tf_regr[(sx, sy)]['coef'][0]

        line_x = np.append(np.array([[0]]), fx)    # 0 avoids a cropped line starting late
        y = line_x * regr.coef_ + regr.intercept_  # y = ax + b
        ax.plot(line_x, y, '-b', lw=1, label='{}'.format(suite))

        # tf regression for reference
        y = line_x * tf_regr[(sx, sy)]['coef'] + tf_regr[(sx, sy)]['intercept']
        ax.plot(line_x, y, '-m', lw=1, label=suite_names['tf'])

        # line legends. without a title, this is what identifies the plots
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], labels[::-1])

        # line info (slope, angle, etc)
        # Angles are in degrees. The angle between the two lines is the difference
        # of their inclinations, arctan(m_tf) - arctan(m_suite); arctan(m_tf - m_suite)
        # is not an angle between the lines.
        suite_angle = np.arctan(suite_slope)*(180/np.pi)
        tf_angle = np.arctan(tf_slope)*(180/np.pi)
        ax.text(0.01, 1.05, 'Suite ∠ x: {: >6.2f}\nSuite ∠ tf: {: >6.2f}'.format(suite_angle, tf_angle - suite_angle), ha='left', transform=ax.transAxes)
        ax.text(0.99, 1.05, 'Suite slope: {: >6.3f}\ntf slope: {: >6.3f}'.format(suite_slope, tf_slope), ha='right', transform=ax.transAxes)

        # pretty labels
        ax.set_xlabel(re.sub(r' \(', '\n(', re.sub('_', ' ', re.sub('.*::', '', sx))).title())
        ax.set_ylabel(re.sub(r' \(', '\n(', re.sub('_', ' ', re.sub('.*::', '', sy))).title())

        # individual figures, numbered 1..len(plot_features) in plot_features order
        fig.savefig('data/img/{}_{}.png'.format(suite, plot_no), format='png')
        plt.close(fig)

print('done')

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center> T-test </center>

In [None]:
# X_TF = list of every ratio "number of variables"/"number of uses" per program in TF.
# X_An = same thing, but for Angha.

In [None]:
### mad science tests
## class Point2d:
##     def __init__(self, vx=0, vy=0):
##         self.x = vx
##         self.y = vy
##     def __add__(self, other):
##         return Point2d(self.x + other.x, self.y + other.y)
##     def __str__(self):
##         return '({}, {})'.format(self.x, self.y)

<br><div style="border:none;width:100%;height:4px;color:#000;background-color:#000;"></div>
# <center> Median/Quantile feature plots </center>

In [None]:
##------------------ Median/Quantile ------------------#
## Want to implement this so that the plots make more sense.
## Currently the outliers stretch the axes too much.
##------------------ Unfinished work ------------------#

#q25, q75 = df['instcount::Number_of_instructions_(of_all_types)'].quantile([0.25, 0.75])

#quantile_pct=1
#qntl = 0.5
#suite = suite_names['tf']
#colname = 'instcount::Number_of_Load_insts'
#df.loc[suite][df.loc[suite][colname] <= df.loc[suite][colname].quantile(qntl)][colname]

##df_med = df.loc[suite][df.loc[suite][sx] <= df.loc[suite][sx].quantile(quantile_pct)]
##fxm = np.array(df_med[sx]).reshape(-1, 1)
##fym = np.array(df_med[sy])
#
#fx = np.array(df.loc[suite][sx]).reshape(-1, 1)
#fy = np.array(df.loc[suite][sy])
#
#plt.xlabel(sx)
#plt.ylabel(sy)
#
## scatter
#plt.plot(fx, fy, '.r', lw=0.1)  # plt.plot(fxm, fym, '.r', lw=0.1)
#
## diagonal
#x = np.linspace(0, np.max(fx))  # x = np.linspace(0, np.max(fxm))
#y = x
#plt.plot(x, y, '-k', lw=0.5)