In [1]:
# Standard libraries
import os
from pathlib import Path
from multiprocessing import Pool
from pprint import pprint

# External
import numpy as np
import scipy.stats as stats
import pandas as pd
import yaml
from IPython.display import display

# Misc: separators used when printing reports, plus the pandas float format
lineSepThin = '# --------------------------------------------------------------------------- #'
lineSepThick = '# =========================================================================== #'
# Floats are shown in a 20-character field, with thousands separators and 2 decimals
pd.set_option('display.float_format', '{:20,.2f}'.format)

<hr>

In [2]:
# Path to the directory with the '.c' files and '.d' dirs
benchmarkDir = '../tmp/seed_fns'

# Glob patterns (relative to benchmarkDir) that select the files of each group
patterns = {
    'all_O0': '*.d/*_O0.info',
    'case0_O1': '*.d/*_case0_O1.info',
    'case0_Oz': '*.d/*_case0_Oz.info',
}

print(f'Globbing files from directory\n\n{benchmarkDir}\n\nwith group -> pattern:\n')
# One "group -> pattern" line per entry, in insertion order
print(*(f'{group}    ->    {pat}' for group, pat in patterns.items()), sep='\n')

Globbing files from directory

../tmp/seed_fns

with group -> pattern:

all_O0    ->    *.d/*_O0.info
case0_O1    ->    *.d/*_case0_O1.info
case0_Oz    ->    *.d/*_case0_Oz.info


In [3]:
# For every group, collect the files under benchmarkDir that match its pattern
infoFilePaths = {}
for group, pattern in patterns.items():
    infoFilePaths[group] = list(Path(benchmarkDir).glob(pattern))

# Report how many files each group matched
for group, matches in infoFilePaths.items():
    print(f'Group {group}:\t\tfound {len(matches)} programs')

Group all_O0:		found 50688 programs
Group case0_O1:		found 0 programs
Group case0_Oz:		found 0 programs


In [4]:
runInParallel = True  # Run in parallel?

# Number of cpu cores usable by this process. os.sched_getaffinity only exists
# on some platforms (e.g. Linux), so fall back to the total core count.
try:
    availableCores = len(os.sched_getaffinity(0))
except AttributeError:
    availableCores = os.cpu_count() or 1

# Use half of the cores (remove the `// 2` to use all of them), but never fewer
# than one: a pool cannot be created with 0 worker processes.
nproc = max(1, availableCores // 2) if runInParallel else 1
print(f'Using {nproc} core(s)')

# Number of tasks handed to a worker at a time
chunksize = 512
print(f'Each cpu core will work on chunks of {chunksize} tasks')

Using 12 core(s)
Each cpu core will work on chunks of 512 tasks


<hr>

In [5]:
# Loads .info files faster if you have parallel disk access
def parseInfo(infoFilePath):
    """Read one '.info' YAML file and return the first element of its content.

    Any failure (unreadable file, YAML error, content that cannot be indexed
    with 0) is reported on stdout and yields None. Content that evaluates to
    False (e.g. an empty document) is reported separately and also yields None.
    """
    try:
        with open(infoFilePath, 'r') as handle:
            parsed = yaml.safe_load(handle)
        if not parsed:
            print(f'Info evaluates to False: {infoFilePath}')
            return None
        return parsed[0]
    except Exception as err:
        print(f'Failed parsing {infoFilePath}: {err}')
        return None


# Turns each key in the inner dicts ('static' and 'dynamic') into a key in
# the outer dict, e.g., accessing infoFile['static']['instructions'] becomes
# infoFile['static_instructions']. This also makes pandas happier.
def flattenCfgInfo(cfgInfo, desiredCols=None):
    """Flatten one parsed cfg-info record into a single-level dict.

    Parameters
    ----------
    cfgInfo : mapping
        Parsed record. Every field read below must be present, including the
        nested 'static' and 'dynamic' mappings, even when it is not requested
        in `desiredCols`.
    desiredCols : container of str, optional
        Names of the flattened keys to keep. When None (the default) every
        flattened key is kept.

    Returns
    -------
    dict or None
        The flattened (and filtered) record, or None on failure. Failures are
        reported on stdout instead of being raised.

    Notes
    -----
    The 'name' entry is rebuilt from the parent directory of cfgInfo['name']
    with its suffix replaced by '.c'; it is a pathlib.Path, not a str.
    """
    try:
        fullName = cfgInfo['name']
    except Exception as e:
        print(f'File without the field "name"! {e}\nContents:\n{cfgInfo}\n')
        return None  # Failure

    try:
        flat = {
            'cfg': cfgInfo['cfg'],
            'invoked': cfgInfo['invoked'],
            'complete': cfgInfo['complete'],
            'blocks': cfgInfo['blocks'],
            'phantoms': cfgInfo['phantoms'],
            'exit': cfgInfo['exit'],
            'halt': cfgInfo['halt'],
            'edges': cfgInfo['edges'],
            'static_instructions': cfgInfo['static']['instructions'],
            'static_calls': cfgInfo['static']['calls'],
            'static_signals': cfgInfo['static']['signals'],
            'dynamic_instructions': cfgInfo['dynamic']['instructions'],
            'dynamic_calls': cfgInfo['dynamic']['calls'],
            'dynamic_signals': cfgInfo['dynamic']['signals'],
            'name': Path(Path(fullName).parent.name).with_suffix('.c'),
        }
        # desiredCols=None means "no filtering"; testing `k in None` would
        # raise and make every call with the default argument fail.
        if desiredCols is None:
            res = flat
        else:
            res = {k: v for k, v in flat.items() if k in desiredCols}
    except Exception as e:
        print(f'Function flattenCfgInfo(cfgInfo, desiredCols) failed for {fullName}: {e}')
        return None  # Failure
    else:
        return res  # Success

In [6]:
desiredCols = ['name', 'static_instructions', 'dynamic_instructions']

print('The desired columns to be included in the dataframe are:')
pd.DataFrame([], columns=desiredCols).style.set_table_styles([dict(selector="th", props=[('font-size', '18px')])])

The desired columns to be included in the dataframe are:


Unnamed: 0,name,static_instructions,dynamic_instructions


In [7]:
# Parse every group's '.info' files in worker processes, flatten the parsed
# records in this process and build one dataframe per group.
tmpdf = {}
infoGroups = {}
with Pool(nproc) as pool:
    for group, files in infoFilePaths.items():
        # Results arrive in arbitrary order; files that failed to parse yield None
        parsedInfos = pool.imap_unordered(parseInfo, files, chunksize)
        flattened = (flattenCfgInfo(info, desiredCols) for info in parsedInfos if info)
        # flattenCfgInfo returns None on failure; such rows must not reach
        # pd.DataFrame, which expects every record to be a dict.
        infoGroups[group] = [row for row in flattened if row is not None]
        tmpdf[group] = pd.DataFrame(infoGroups[group], columns=desiredCols)
    pool.close()
    pool.join()

Info evaluates to False: ../tmp/seed_fns/extr_emscriptensystemliblibcmuslsrcmathcopysign.c_copysign.d/extr_emscriptensystemliblibcmuslsrcmathcopysign.c_copysign_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxfsbtrfsinode.c_btrfs_readpage_io_failed_hook.d/extr_linuxfsbtrfsinode.c_btrfs_readpage_io_failed_hook_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcstdlibllabs.c_llabs.d/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcstdlibllabs.c_llabs_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxarchpariscbootcompressedmisc.c_strlen.d/extr_linuxarchpariscbootcompressedmisc.c_strlen_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_micropythonliblibm_dblrint.c_rint.d/extr_micropythonliblibm_dblrint.c_rint_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcstringstrncmp.c_strncmp.d/extr_How-to-Make

Info evaluates to False: ../tmp/seed_fns/extr_linuxarchx86bootcompressed..string.c_strcmp.d/extr_linuxarchx86bootcompressed..string.c_strcmp_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcstdlibabs.c_abs.d/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcstdlibabs.c_abs_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxarchx86bootcompressed..string.c_strlen.d/extr_linuxarchx86bootcompressed..string.c_strlen_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxdriversfirmwareefilibstubstring.c_strncmp.d/extr_linuxdriversfirmwareefilibstubstring.c_strncmp_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxlibstring.c_strcmp.d/extr_linuxlibstring.c_strcmp_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_emscriptentestscoretest_simd2.c__mm_add_ps.d/extr_emscriptentestscoretest_simd2.c__mm_add_ps_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns

Info evaluates to False: ../tmp/seed_fns/extr_linuxarchx86bootstring.c_strncmp.d/extr_linuxarchx86bootstring.c_strncmp_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxlibstring.c_strchr.d/extr_linuxlibstring.c_strchr_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_darwin-xnuosfmkkernsched_dualq.c_dualq_main_runq.d/extr_darwin-xnuosfmkkernsched_dualq.c_dualq_main_runq_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxarchx86bootstring.c_strcmp.d/extr_linuxarchx86bootstring.c_strcmp_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcctypeisprint.c___ctype_b_loc.d/extr_How-to-Make-a-Computer-Operating-Systemsrcsdksrclibcsrcctypeisprint.c___ctype_b_loc_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_linuxlibstring.c_strcspn.d/extr_linuxlibstring.c_strcspn_big-arr_O0.info
Info evaluates to False: ../tmp/seed_fns/extr_emscriptensystemliblibcmuslsrcctypeisprint.c_i

In [8]:
print()
print(lineSepThick)

# Index each group's frame by program name (duplicate names raise) and print
# a short report for every non-empty group.
df = {}
for group in infoGroups:
    indexed = tmpdf[group].set_index(['name'], verify_integrity=True)
    df[group] = indexed
    if len(indexed) == 0:
        continue

    # Roughly center the group title on an 80-column line
    padding = ' ' * ((80 - len(group)) // 2)
    print(f'\n{padding}{group}:')
    display(indexed.head(5))
    print()
    print(indexed.dtypes)
    print()
    print(indexed.describe(include='all'))
    print()
    print(lineSepThin)



                                     all_O0:


Unnamed: 0_level_0,static_instructions,dynamic_instructions
name,Unnamed: 1_level_1,Unnamed: 2_level_1
extr_linuxnetnetfilternft_set_rbtree.c_nft_rbtree_estimate.c,25,25
extr_linuxcryptoecc.c_vli_test_bit.c,17,17
extr_radare2librasmarchxtensagnuxtensa-modules.c_Opcode_xsr_excsave4_Slot_inst_encode.c,7,7
extr_linuxdriversinfinibandhwhfi1sdma.h_sdma_mapping_len.c,11,11
extr_reactosdllwin32dbghelpstorage.c_hash_table_hash.c,42,387



static_instructions     int64
dynamic_instructions    int64
dtype: object

       static_instructions  dynamic_instructions
count            50,584.00             50,584.00
mean                 16.22             47,242.59
std                  25.01          3,855,927.38
min                   5.00                  5.00
25%                   7.00                  7.00
50%                  11.00                 11.00
75%                  17.00                 17.00
max               1,366.00        399,321,603.00

# --------------------------------------------------------------------------- #


In [9]:
# Each group is written to '<outputPrefix>_<group>.csv'. The directory part of
# the prefix is created below when it does not exist yet.
outputPrefix = 'output/cfgInfo'

# Warning: [ , . ; : ] are all allowed characters for linux filenames
csvSeparator = ';'

for group in patterns.keys():
    csvPath = Path(f'{outputPrefix}_{group}.csv')
    try:
        # Without the directory every to_csv call would fail on a fresh checkout
        csvPath.parent.mkdir(parents=True, exist_ok=True)
        df[group].to_csv(csvPath, sep=csvSeparator, encoding='utf-8')
    except Exception as e:
        # Best effort: report which file failed and keep going with the next group
        print(f'Failed writing {csvPath}: {e}')

<hr>
<h1><center>WIP</center></h1>

In [None]:
## Todo: quote the field 'name' inside flattenCfgInfo
# Toggle for shell-quoting the 'name' field, which helps when the names are
# later pasted into or consumed by a shell.
# WARNING: as the docs state:
# '''
#     The shlex module is only designed for Unix shells.
# '''
shellQuoteFileName = True
if shellQuoteFileName:
    import shlex

In [None]:
# --------------------------------------------------------------------------- #

# Instead of running Shapiro on everything, does this make sense?
#
#  shapiro_result_for_sample = []
#  for _ in range(num_of_samples):
#     statSample = np.random.choice(df[group][stat], size=2048, replace=False)
#     shapiro_result_for_sample += [scipy.stats.shapiro(statSample)]
#
#  """deviation, variance, median, etc"""
#  printStats(shapiro_result_for_sample)

# scipy.stats.shapiro probably does something like that internally, as
# it is common with statistic tests, but running it straight away on a
# dataset of size > 5000 issues the warning:
#
#     UserWarning: p-value may not be accurate for N > 5000.
#
# To reproduce the warning, just run stats.shapiro(df[group][column]) for the
# desired group and column, e.g.,
# stats.shapiro(df['case0_O1']['dynamic_instructions'])
#
# I chose to register this because I'm quite skeptical that avoiding
# the warning through multiple tests-on-smaller-samples might compromise
# the validity of the test. I'm no statistician ¯\_(ツ)_/¯


# `df` maps group -> dataframe, so the group and the columns have to be
# selected explicitly before running the test.
shapiroGroup = 'all_O0'
staticValues = df[shapiroGroup]['static_instructions'].to_numpy()
dynValues = df[shapiroGroup]['dynamic_instructions'].to_numpy()

# One sample only. The generator is seeded so the sample is the same on every
# run, and the sample size is capped because sampling without replacement
# cannot draw more values than there are rows.
rng = np.random.default_rng(0)
sampleSize = min(2048, len(staticValues))
dynSample    = rng.choice(dynValues, size=sampleSize, replace=False)
staticSample = rng.choice(staticValues, size=sampleSize, replace=False)

shapiroStaticSample = stats.shapiro(staticSample)
shapiroDynSample = stats.shapiro(dynSample)
print(shapiroStaticSample)
print(shapiroDynSample)


# Full dataframe
shapiroStatic = stats.shapiro(staticValues)
shapiroDyn = stats.shapiro(dynValues)
print(shapiroStatic)
print(shapiroDyn)