# Plan
1. convert decimal addresses in the `.counts` files into hexadecimal (which matches the `.dump` files)
2. sort the rows in the `.counts` files according to the address
3. match the addresses from the `.counts` files to the `.dump` files, and get the instruction which corresponds to that address
4. output to a new csv, and do some data analysis

In [56]:
import os

dfs = {}

path = '../'

count_files = [i for i in os.listdir(path) if i.endswith('.counts')]
file_names = [i.split(".")[0] for i in count_files] # the files are in the same order

print(f"count_files={count_files}")
print(f"file_names={file_names}")

count_files=['bzip2.counts', 'omnetpp.counts', 'sjeng.counts', 'hmmer.counts', 'h264ref.counts', 'astar.counts', 'libquantum.counts', 'xalancbmk.counts', 'mcf.counts', 'gobmk.counts']
file_names=['bzip2', 'omnetpp', 'sjeng', 'hmmer', 'h264ref', 'astar', 'libquantum', 'xalancbmk', 'mcf', 'gobmk']


In [57]:
# For each count file, load them into a dataframe
import pandas as pd

for file in count_files:
    file_name = file.split(".")[0]
    df = pd.read_csv(os.path.join(path, file))
    dfs[file_name] = {
        "count": df
    }

In [58]:
dfs.keys()

dict_keys(['bzip2', 'omnetpp', 'sjeng', 'hmmer', 'h264ref', 'astar', 'libquantum', 'xalancbmk', 'mcf', 'gobmk'])

In [59]:
# convert decimal addresses into hex, and sort the rows by address
for df in dfs.values():
    count_df = df["count"]
    count_df.sort_values(by=['addr'], ascending=True, inplace=True)
    count_df['addr'] = count_df['addr'].apply(hex)

In [60]:
list(dfs.values())[0]["count"]

Unnamed: 0,addr,count,path,line,symbol
19010,0x3b770,1,/local/scratch/mc2262/cheribsd/lib/csu/riscv64...,76,_start
19029,0x3b774,2,/local/scratch/mc2262/cheribsd/lib/csu/riscv64...,89,_start
19045,0x3b778,1,/local/scratch/mc2262/cheribsd/lib/csu/riscv64...,89,_start
19070,0x3b77c,1,/local/scratch/mc2262/cheribsd/lib/csu/riscv64...,89,_start
18602,0x3b780,1,/local/scratch/mc2262/cheribsd/lib/csu/riscv64...,89,_start
...,...,...,...,...,...
21177,0xc8436,18,/local/scratch/mc2262/cheribsd/lib/libc/string...,52,strncmp
7864,0xc8438,2,/local/scratch/mc2262/cheribsd/lib/libc/string...,52,strncmp
7332,0xc843a,2,/local/scratch/mc2262/cheribsd/lib/libc/string...,54,strncmp
7873,0xc843e,14,/local/scratch/mc2262/cheribsd/lib/libc/string...,48,strncmp


In [91]:
# convert the dump files into dataframes
# we only have the dataframes of the files that have both a .dump and a .counts file
import re

dump_file_pattern = re.compile(r"(\d+)\.([a-zA-Z0-9_]+)\.dump")
dump_files = []

for file_name in os.listdir(path):
    match = re.match(dump_file_pattern, file_name)
    if match:
        file_name_text = match.group(2)
        if file_name_text in file_names:
            dump_files.append(os.path.join(path, file_name))

for file_path in dump_files:
    with open(file_path, 'r') as file:
        data = file.read()
    
    file_name = file_path.split(".")[-2]
    label_pattern = re.compile(r"\s*[0-9a-f]+\s+\<(.+)\>\:\s*")
    pattern = re.compile(r"\s*([0-9a-f]+):\s+((?:[0-9a-f]{2}\s+)+)\s+([\w.<>]+)\s*(.*)")
    rows = []
    
    label = ""

    for line in data.strip().splitlines():
        if not line.strip(): # skip empty lines
            continue

        label_match = re.match(label_pattern, line)
        if label_match:
            label = label_match.group(1)
            continue
        
        match = re.match(pattern, line)
        if match:
            address = hex(int(match.group(1), 16))
            instruction_bytes = match.group(2)
            assembly_instruction = match.group(3)
            operands = match.group(4).strip().replace(",", "")
            rows.append([address, instruction_bytes, label, assembly_instruction, operands])
            label = ""    
        else:
            print(f"{line} does not match")
    
    if "count" in dfs[file_name]:
        dfs[file_name]["dump"] = pd.DataFrame(rows, columns=['addr', 'bytes', 'label', 'instruction', 'operands'])
    else:
        print(f"{file_name} does not have a count file")

429.mcf:	file format elf64-littleriscv does not match
Disassembly of section .text: does not match
		... does not match
		... does not match
		... does not match
		... does not match
		... does not match
		... does not match
473.astar:	file format elf64-littleriscv does not match
Disassembly of section .text: does not match
		... does not match
		... does not match
		... does not match
		... does not match
		... does not match
		... does not match
/net/archive/export/cheri_traces/spec_static/spec-riscv64-purecap/445.gobmk/445.gobmk:	file format elf64-littleriscv does not match
Disassembly of section .text: does not match
		... does not match
		... does not match
		... does not match
		... does not match
		... does not match
464.h264ref:	file format elf64-littleriscv does not match
Disassembly of section .text: does not match
		... does not match
		... does not match
		... does not match
		... does not match
		... does not match
/net/archive/export/cheri_traces/spec_static/spec-riscv64-

In [92]:
list(dfs.values())[0]["dump"]

Unnamed: 0,addr,bytes,label,instruction,operands
0,0x3b770,5b 11 01 f4,_start,cincoffset,csp csp -192
1,0x3b774,23 48 11 0a,,csc,cra 176(csp)
2,0x3b778,23 40 81 0a,,csc,cs0 160(csp)
3,0x3b77c,23 48 91 08,,csc,cs1 144(csp)
4,0x3b780,23 40 21 09,,csc,cs2 128(csp)
...,...,...,...,...,...
160566,0xc8656,0f 24 01 00,,clc,cs0 0(csp)
160567,0xc865a,8f 20 01 01,,clc,cra 16(csp)
160568,0xc865e,5b 11 01 02,,cincoffset,csp csp 32
160569,0xc8662,67 80 00 00,,cjalr,cnull 0(cra)


In [93]:
gobmk_dict = dfs["gobmk"]
print(gobmk_dict.keys())
print(gobmk_dict["count"].keys())
print(gobmk_dict["dump"].keys())

dict_keys(['count', 'dump', 'merged'])
Index(['addr', 'count', 'path', 'line', 'symbol'], dtype='object')
Index(['addr', 'bytes', 'label', 'instruction', 'operands'], dtype='object')


In [94]:
# merge the dataframes
for df_name, df_dict in dfs.items():
    print(df_name)
    count_df = df_dict["count"]
    dump_df = df_dict["dump"]
    df_dict["merged"] = pd.merge(count_df[['addr', 'count']], dump_df[['addr', 'label', 'instruction', 'operands']], on='addr', how="inner")

bzip2
omnetpp
sjeng
hmmer
h264ref
astar
libquantum
xalancbmk
mcf
gobmk


In [95]:
dfs["bzip2"]["merged"]

Unnamed: 0,addr,count,label,instruction,operands
0,0x3b770,1,_start,cincoffset,csp csp -192
1,0x3b774,2,,csc,cra 176(csp)
2,0x3b778,1,,csc,cs0 160(csp)
3,0x3b77c,1,,csc,cs1 144(csp)
4,0x3b780,1,,csc,cs2 128(csp)
...,...,...,...,...,...
26686,0xc8436,18,,c.bnez,a2 0xc841e <strncmp+0x2>
26687,0xc8438,2,,c.li,a0 0
26688,0xc843a,2,,cjalr,cnull 0(cra)
26689,0xc843e,14,,sub,a0 a3 a4


In [96]:
for file_name, df_dict in dfs.items():
    df_dict["merged"].to_csv(f"../cleaned data/{file_name}.csv", index=False)

In [97]:
# calculate total lines of code
total = 0
for df_dict in dfs.values():
    total += df_dict["merged"]["count"].sum()
print(f"total number of instructions: {total:,}")

total number of instructions: 271,036,826,384
