# Scheduling benchmarks analysis

## Library

In [None]:
import matplotlib.pyplot as plt
from typing import Optional, List, Tuple, Dict
import pandas as pd
import os
import re
import json
import subprocess
import numpy as np
from scipy import stats
from collections import defaultdict
from sklearn.cluster import DBSCAN

#### Global variables

In [None]:
# Names and locations of result files and directories.
TRACE_CMD_CACHE_FILENAME = "trace_cmd_runtimes"
RESULTS_DIR_PATH = "../results"
ABSOLUTE_RESULTS_DIR_PATH = "/home/cgachod/analysis/results"

# Text dump produced by `perf script` and parsed back by this notebook.
PERF_SCRIPT_RESULTS_FILEPATH = "assets/.perf_mem_results.log"

# Physical addresses at or above this value are attributed to node 1.
NODE_1_PHYS_ADDR_START = 0x1840000000

# The 64 CPU ids are interleaved: even ids are listed for node 0, odd ids for node 1.
DAHU_NODE_0_CPUID = list(range(0, 64, 2))
DAHU_NODE_1_CPUID = list(range(1, 64, 2))
# Indexed by CPU id; the value is the node of that CPU.
dahu_cpu_nodes = [int(cpu in DAHU_NODE_1_CPUID) for cpu in range(64)]

def get_result_dir_path(result_dir_name):
    """Return ``result_dir_name`` joined onto the relative results directory."""
    path_parts = (RESULTS_DIR_PATH, result_dir_name)
    return os.path.join(*path_parts)

def get_absolute_result_dir_path(result_dir_name) :
    """Return ``result_dir_name`` joined onto the absolute results directory."""
    path_parts = (ABSOLUTE_RESULTS_DIR_PATH, result_dir_name)
    return os.path.join(*path_parts)

#### Visualization params

In [None]:
# Plot colors for the "nb enabled" and "nb disabled" series.
# NOTE(review): "nb" presumably stands for NUMA balancing — confirm.
nb_enabled_color = "tab:orange"
nb_disabled_color = "tab:blue"

# Figure sizing parameters.
# NOTE(review): units and the way the width coefficient is applied are not
# visible in this part of the notebook — confirm at the use site.
figure_height = 10 # 10
min_figure_width = 40
figure_width_coeff = 0.3

# Width used for plotted lines.
line_width = 1.0

The idea is to show which memory node each core accesses:
- Sort the cores by node.
- For each core, draw a timeline (or perhaps just the percentage of time) showing which node it accesses.
- Possibly use two histograms per core: one pointing up for local-node accesses and one pointing down for remote-node accesses.

In [None]:
# Only the CPU id, the memory addresses and the timestamp are needed here.
# The perf data file is assumed to exist already.

# Capture groups of the leading part of a line:
# 1: pid, 2: tid, 3: cpuid, 4: time, 5: period, 6: event, 7: virtual address
# Earlier variant, for lines without the tid and period columns:
# basic_info_regex_str = r"^ *[\w\/\-\.\: ]+ +(\d+) \[(\d+)\] (\d+\.\d+): +([\w\/\-\.]+).*?: +([0-9a-f]+)"
basic_info_regex_str = r"^ *[\w\/\-\.\:]+ +(\d+)\/(\d+) +\[(\d+)\] (\d+\.\d+): +(\d+) +([\w\/\-\.]+).*?: +([0-9a-f]+)"

# 8: memory access type, 9: TLB access type
data_src_regex_str = r"[0-9a-f]+ \|OP (?:LOAD|STORE)\|([^\|]+)\|[^\|]+\|(TLB [^\|]+)\|[^\|]+\|[a-zA-Z\/\- ]+"

# 10: physical address.
# The character class that ends data_src_regex_str also accepts the letters a-f,
# so without a guard it swallows the first hex digit(s) of an address such as
# "f05080144". The lookbehind forces the captured address to start right after
# whitespace, i.e. at the beginning of the token.
phys_addr_regex_str = r"(?<=\s)([0-9a-f]+)"

line_regex = re.compile(basic_info_regex_str + r"\s+" + data_src_regex_str + phys_addr_regex_str)

def generate_perf_mem_log(file_path: str) -> None :
    """Dump the samples of a perf data file to PERF_SCRIPT_RESULTS_FILEPATH.

    Runs ``perf script`` through the shell, restricted to the ``cg.C.x``
    command and to the fields listed in the command line, and redirects its
    output to PERF_SCRIPT_RESULTS_FILEPATH. The command is printed before it
    is executed.

    Args:
        file_path: path of the perf data file passed to ``perf script -i``.

    Raises:
        subprocess.CalledProcessError: if the shell command exits with a
            non-zero status, so a failed dump is not silently mistaken for
            an empty log.
    """
    command_str = f"perf script -i {file_path} -c cg.C.x -F 'comm,pid,tid,cpu,time,period,event,addr,data_src,phys_addr' > {PERF_SCRIPT_RESULTS_FILEPATH}"
    print(command_str)
    subprocess.run(
        command_str,
        shell=True,
        stdout = subprocess.PIPE,
        universal_newlines = True,
        check = True
    )
    
# perf script -L -F 'comm,pid,tid,cpu,time,addr,event,ip,phys_addr,data_src,period' --reltime --time 1.0,2.0

In [None]:
# Dump the samples of assets/perf.data into the perf script text log.
generate_perf_mem_log("assets/perf.data")

In [None]:
def build_memory_accesses_df(filepath: str) -> pd.DataFrame :
    """Parse a perf script text log into a DataFrame of memory accesses.

    Every line is matched against ``line_regex``; lines that do not match are
    printed and skipped.

    Args:
        filepath: path of the text log to read.

    Returns:
        A DataFrame with the columns ``time``, ``cpuid``, ``virt``, ``phys``,
        ``cache_result``, ``cpu_node`` (node of the sampled CPU, looked up for
        CPU ids 0..63) and ``memory_node`` (1 when ``phys`` is at or above
        NODE_1_PHYS_ADDR_START, else 0).
    """
    times = []
    cpus = []
    virt_addrs = []
    phys_addrs = []
    cache_results = []

    with open(filepath) as log_file :
        for raw_line in log_file :
            fields = line_regex.match(raw_line)
            if fields is None :
                print("Not matched line : ", raw_line)
                continue
            times.append(float(fields.group(4)))
            cpus.append(int(fields.group(3)))
            cache_results.append(fields.group(8))
            virt_addrs.append(int(fields.group(7), 16))
            phys_addrs.append(int(fields.group(10), 16))

    # CPU id -> node, for the 64 known CPU ids.
    node_of_cpu = {cpu: int(cpu in DAHU_NODE_1_CPUID) for cpu in range(64)}

    memory_df = pd.DataFrame({
        "time": times,
        "cpuid": cpus,
        "virt": virt_addrs,
        "phys": phys_addrs,
        "cache_result": cache_results,
    })
    memory_df['cpu_node'] = memory_df['cpuid'].map(node_of_cpu)
    is_node_1_memory = memory_df['phys'] >= NODE_1_PHYS_ADDR_START
    memory_df['memory_node'] = is_node_1_memory.astype(int)
    return memory_df

In [None]:
# Parse the perf script text log into the working DataFrame.
memory_df = build_memory_accesses_df("assets/.perf_mem_results.log")

In [None]:
# Number of parsed samples.
print(len(memory_df))
# Number of samples per cache_result value.
memory_df['cache_result'].value_counts()

# Samples whose cache_result is exactly "LVL L3 miss".
memory_df.loc[memory_df['cache_result'] == "LVL L3 miss"]

In [None]:
# Counter of access types; nothing in this cell fills it.
mem_accesses_types = defaultdict(int)

# One list per parsed field, filled in lockstep while reading the log.
timestamp, cpuid, virtual_addr, physical_addr, cache_result = [], [], [], [], []

# Sample log parsed by this cell (the full log is "assets/.perf_mem_results.log").
filename = "assets/perf_mem_sample.log"
with open(filename) as f :
    for line in f :
        matched = line_regex.match(line)
        if matched is None :
            print("Not matched line : ", line)
            continue
        timestamp.append(float(matched.group(4)))
        cpuid.append(int(matched.group(3)))
        cache_result.append(matched.group(8))
        virtual_addr.append(int(matched.group(7), 16))
        physical_addr.append(int(matched.group(10), 16))

# CPU id -> node, for the 64 known CPU ids.
dahu_cpu_nodes_map = {cpu: int(cpu in DAHU_NODE_1_CPUID) for cpu in range(64)}
print(NODE_1_PHYS_ADDR_START)

memory_df = pd.DataFrame({
    "time": timestamp,
    "cpuid": cpuid,
    "virt": virtual_addr,
    "phys": physical_addr,
    "cache_result": cache_result,
})
memory_df['cpu_node'] = memory_df['cpuid'].map(dahu_cpu_nodes_map)
# 1 when the physical address is at or above the node-1 start address, else 0.
memory_df['memory_node'] = (memory_df['phys'] >= NODE_1_PHYS_ADDR_START).astype(int)
# memory_df.loc[memory_df['memory_node'] == 1]

# 14053033756
# 104152956928

In [None]:
# Alternative inspection: rows whose memory_node is 1.
# memory_df.loc[memory_df['memory_node'] == 1]

# Number of samples per cache_result value.
memory_df['cache_result'].value_counts()

In [None]:
# We want to plot for each core :
# Time in X coordinate
# For starters, node in Y coordinate, and color for type of access

In [None]:
# Sample perf script line kept for regex experiments. It has a single pid
# (no "pid/tid" pair) and no period column before the event name.
line = "            perf   78464 [034] 10439.195550:         cpu/mem-stores/P: fffffe67483a6e1c      1e05080144 |OP STORE|LVL L1 hit|SNP N/A|TLB N/A|LCK N/A|BLK  N/A                                     0"

In [None]:

# Matches lines shaped like "[cpu] seconds.fraction: word".
# Groups: 1: bracketed CPU id, 2: timestamp, 3: following word.
# The decimal point is escaped so that only a literal "." separates the two
# digit runs of the timestamp (an unescaped "." accepts any character).
regex = re.compile(r"^\[(\d+)\]\s+(\d+\.\d+):\s+(\w+)")
# matches = [regex.match(x) for x in a]

class MemoryAccesses :
    """Two parallel lists describing memory accesses: target node and timestamp."""

    def __init__(self) -> None:
        self.nodes, self.times = [], []

    def append(self, physical_address: int, timestamp: float) :
        """Record one access.

        The node is 1 when ``physical_address`` is at or above 0x1840000000
        and 0 otherwise.
        """
        target_node = 1 if physical_address >= 0x1840000000 else 0
        self.nodes.append(target_node)
        self.times.append(timestamp)

    def __repr__(self) -> str:
        return f"{len(self.nodes)} nodes, {len(self.times)} timestamps"

# memory_accesses_per_cpu = [MemoryAccesses() for _ in range(64)]
# One list per CPU id (0..63).
# NOTE(review): nothing in this cell appends to these lists, so they stay
# empty here — confirm where they are expected to be filled.
node_accesses_per_cpu = [list() for _ in range(64)]

# Count the lines of the perf script log that line_regex does not recognise.
not_matched = 0
with open(PERF_SCRIPT_RESULTS_FILEPATH) as f :
    for line in f :
        matched = line_regex.match(line)
        not_matched += int(matched is None)

print(not_matched)

In [None]:
# Scratch arithmetic: difference between two counts.
# NOTE(review): presumably total lines vs. matched lines — confirm.
6021476 - 6012724

In [None]:
# for i, accesses in enumerate(node_accesses_per_cpu) :
#     print("CPU", i, ":")

# Time range covered by the samples. node_accesses_per_cpu holds, per CPU,
# (timestamp, node_id) pairs. CPUs without any sample are skipped: calling
# min()/max() on their empty list would raise.
all_timestamps = [
    sample_time
    for accesses in node_accesses_per_cpu
    for sample_time, _node_id in accesses
]
if not all_timestamps :
    raise ValueError("node_accesses_per_cpu contains no sample: cannot compute the time range")

min_time = min(all_timestamps)
max_time = max(all_timestamps)
duration = max_time - min_time
print(duration)

# The range is cut into n_intervals buckets of equal width.
n_intervals = 1000
time_interval = duration / n_intervals
print(time_interval)



# pairs = sorted(node_accesses_per_cpu[0])
# duration = pairs[-1][1] - pairs[0][1]




In [None]:
# Even CPU ids (0..62) are listed for node 0, odd CPU ids (1..63) for node 1.
dahu_node_0_cpuids = list(range(0, 64, 2))
dahu_node_1_cpuids = list(range(1, 64, 2))

def finish_memory_graph(cpuid, local_node) :
    """Decorate and display the current matplotlib figure.

    Turns on minor ticks, draws the major grid on both axes and a dotted minor
    grid on the y axis, resizes the figure to 30x8 inches, sets a title that
    names ``cpuid`` and ``local_node``, then shows the figure.
    """
    plt.minorticks_on()
    for axis_name in ("x", "y") :
        plt.grid(axis=axis_name, which="major")
    plt.grid(axis="y", which="minor", linestyle=':', linewidth=0.5)

    figure = plt.gcf()
    figure.set_size_inches(30, 8)

    title = f"Memory access for CPU {cpuid}, local node is {local_node}"
    plt.title(title)
    plt.show()
    
    
def plot_accesses_for_cpu(cpuid) :
    """Plot, for one CPU, the number of accesses to each node over time.

    The (timestamp, node_id) pairs of ``node_accesses_per_cpu[cpuid]`` are
    counted into ``n_intervals`` buckets of width ``time_interval`` starting at
    ``min_time``. Two curves are drawn, the local node first and the remote
    node second, then the figure is finished by ``finish_memory_graph``.

    Args:
        cpuid: index of the CPU in ``node_accesses_per_cpu``; odd ids are
            treated as local to node 1, even ids as local to node 0.
    """
    buckets = [[0] * n_intervals for _ in range(2)]
    last_bucket = n_intervals - 1

    for timestamp, node_id in node_accesses_per_cpu[cpuid] :
        # The latest sample lies exactly n_intervals * time_interval after
        # min_time, which would index one past the end: clamp it into the
        # last bucket.
        bucket_idx = min(int((timestamp - min_time) / time_interval), last_bucket)
        buckets[node_id][bucket_idx] += 1

    local = int(cpuid % 2 != 0)
    remote = 1 - local

    x_values = [time_interval * i for i in range(n_intervals)]
    plt.plot(x_values, buckets[local])
    plt.plot(x_values, buckets[remote])
    finish_memory_graph(cpuid, local)
    
# One figure per CPU: every CPU of node 0 first, then every CPU of node 1.
for cpuid in dahu_node_0_cpuids + dahu_node_1_cpuids :
    plot_accesses_for_cpu(cpuid)
    

    

In [None]:
# Build aggregated graph : For agg node 0, local and remote accesses
# On same graph : for agg node 0, local and remote accesses

# aggregated_buckets[cpu_node][accessed_node][bucket] counts the accesses made
# by the CPUs of `cpu_node` to the memory of `accessed_node` in each interval.
aggregated_buckets = [
    [[0] * n_intervals for _accessed_node in range(2)]
    for _cpu_node in range(2)
]

def get_local_node(cpuid) :
    """Return 1 for an odd ``cpuid`` and 0 for an even one."""
    return 1 if cpuid % 2 else 0

# Count every (timestamp, node_id) sample into the bucket of its time interval,
# grouped by the node of the CPU that issued it and by the node accessed.
last_bucket = n_intervals - 1
for cpuid, node_accesses in enumerate(node_accesses_per_cpu) :
    cpu_node = get_local_node(cpuid)
    for timestamp, node_id in node_accesses :
        # The latest sample lies exactly n_intervals * time_interval after
        # min_time, which would index one past the end: clamp it into the
        # last bucket.
        bucket_idx = min(int((timestamp - min_time) / time_interval), last_bucket)
        aggregated_buckets[cpu_node][node_id][bucket_idx] += 1

x_values = [time_interval * i for i in range(n_intervals)]

plt.plot(x_values, aggregated_buckets[0][0], label="Local accesses - Node 0")
plt.plot(x_values, aggregated_buckets[0][1], label="Remote accesses - Node 0")
plt.plot(x_values, aggregated_buckets[1][1], label="Local accesses - Node 1")
plt.plot(x_values, aggregated_buckets[1][0], label="Remote accesses - Node 1")
plt.minorticks_on()
plt.grid(axis="x", which="major")
plt.grid(axis="y", which="major")
plt.grid(axis="y", which="minor", linestyle=':', linewidth=0.5)
plt.gcf().set_size_inches(30, 8)
# This figure aggregates all CPUs, so the title no longer names a single CPU.
plt.title("Aggregated memory accesses per CPU node (local vs. remote)")
plt.legend(loc = "best")
plt.show()


In [None]:
# Scratch check: bucket index obtained for a time offset of 0.017880841000000146.
int(0.017880841000000146 / time_interval)

In [None]:
# Check the "[cpu] seconds.fraction: word" pattern on a sample line and convert
# the captured timestamp (group 2) to a float.
# The decimal point is escaped so that only a literal "." is accepted inside
# the timestamp.
regex = re.compile(r"^\[(\d+)\]\s+(\d+\.\d+):\s+(\w+)")
matched = regex.match("[000] 10439.189717: 17e0a33288")
float(matched[2])

Filtering functions

In [None]:
# Scratch notes: two raw values kept for comparison with the result below.
# They were bare tokens in the cell, and `1aba068610` is not a valid Python
# literal, so the cell could not even be parsed. They are kept as comments.
#   1840000000
#   1aba068610
hex(25427968)

In [None]:
# Scratch: decimal value of the hexadecimal literal 0x1840000000.
0x1840000000

# Temporary save

In [None]:
def parse_numa_cpu_lists(lscpu_output: str) -> Dict[int, int]:
    """Build a CPU id -> NUMA node id mapping from the text printed by ``lscpu``.

    Only the ``NUMA nodeN CPU(s): <list>`` lines are used. The node id is read
    from the line itself, and the CPU list may mix single ids and inclusive
    ranges (for example ``0-3,8``).

    Args:
        lscpu_output: full standard output of ``lscpu``.

    Returns:
        A dict mapping each listed CPU id to the id of its node; empty when no
        such line is present.
    """
    mapping = {}
    node_lines = re.findall(r"NUMA +node(\d+) +CPU\(s\): +([\d,\-]+)", lscpu_output)
    for node_str, cpulist in node_lines:
        node = int(node_str)
        for chunk in cpulist.split(','):
            if not chunk:
                continue
            # "a-b" is an inclusive range; a lone "a" is a range of one CPU.
            first, _, last = chunk.partition('-')
            for cpu in range(int(first), int(last or first) + 1):
                mapping[cpu] = node
    return mapping


cpu_to_node = parse_numa_cpu_lists(
    subprocess.run("lscpu", stdout = subprocess.PIPE, universal_newlines = True).stdout
)

print(cpu_to_node)

In [None]:
class PerfResultsReader:
    """Convert a perf data file into a DataFrame of memory-access events.

    The data file is first dumped to a temporary text log with ``perf script``,
    the log is parsed line by line with a regular expression, and the log is
    removed afterwards.
    """

    def __init__(self, log_file_path = in_working_dir(".perf.log")) -> None:
        """Remember where the temporary ``perf script`` text log is written."""
        self.log_file_path = log_file_path

    def __get_initial_timestamp(self, data_file_path: str) :
        """Return the timestamp on the first line printed by ``perf script -F time``."""
        res = subprocess.run(
            f"perf script -i {data_file_path} -F time | head -1",
            shell=True,
            stdout = subprocess.PIPE,
            universal_newlines = True
        )
        # The line looks like "  <seconds>:"; drop the colon and the whitespace.
        return float(res.stdout.strip(':\n '))

    def __extract_perf_data_file_with_latency(self, data_file_path: str, executable: Optional[str], time_option: Optional[str]) -> float:
        """Dump ``data_file_path`` to ``self.log_file_path`` with ``perf script -L``.

        Args:
            data_file_path: perf data file to read.
            executable: when not None, passed to ``perf script`` as ``-c``.
            time_option: when not None, passed as ``--time``, for example
                ``"10%-20%"``.

        Returns:
            The initial timestamp of the data file, read before the dump.
        """
        initial_timestamp = self.__get_initial_timestamp(data_file_path)
        print(f"Retrieved initial timestamp : {initial_timestamp}")

        executable_filter = f"-c {executable}" if executable is not None else ""
        time_filter = f"--time {time_option}" if time_option is not None else ""
        command_str = f"perf script -i {data_file_path} -L {executable_filter} {time_filter} > {self.log_file_path}"
        print(f"Executing {command_str}")
        subprocess.run(
            command_str,
            shell=True,
            stdout = subprocess.PIPE,
            universal_newlines = True
        )
        return initial_timestamp

    def __parse_events_with_latency(self, filepath: str, initial_timestamp: float = 0.0) -> pd.DataFrame :
        """Parse the text log at ``filepath`` into a DataFrame of events.

        Lines that do not match the expected layout are printed and skipped.

        Args:
            filepath: text log produced by ``perf script``.
            initial_timestamp: value subtracted from every event time.

        Returns:
            A DataFrame with the columns ``cpuid``, ``time``, ``period``,
            ``event``, ``virt``, ``phys``, ``latency``, ``cache_result``,
            ``cpu_node`` (looked up in the module-level ``cpu_to_node``) and
            ``memory_node`` (1 when ``phys`` is at or above
            NODE_1_PHYS_ADDR_START, else 0).
        """
        # 1: pid, 2: cpuid, 3: timestamp, 4: period, 5: event, 6: virt_addr
        basic_info_regex_lat_str = r"^ *\S+ +(\d+) +(\d+) +(\d+\.\d+): +(\d+) +(\S+): +([0-9a-f]+)"

        # Continuing the numbering: 7: cache_result, 8: tlb_result, 9: latency, 10: phys_address
        data_src_regex_lat_str = r"[0-9a-f]+ \|OP (?:LOAD|STORE)\|([^\|]+)\|[^\|]+\|(TLB [^\|]+)\|[^\|]+\|[a-zA-Z\/\- ]+(\d+) +\d+ +[0-9a-f]+.+ ([0-9a-f]+)"
        line_lat_regex = re.compile(basic_info_regex_lat_str + r" +" + data_src_regex_lat_str)

        cpuid = []
        timestamp = []
        period = []
        event = []
        virtual_addr = []
        cache_result = []
        latency = []
        physical_addr = []

        with open(filepath) as f :
            for line in f :
                matched = line_lat_regex.match(line)
                if matched is None :
                    print("Not matched line : ", line)
                    continue
                cpuid.append(int(matched[2]))
                timestamp.append(float(matched[3]))
                period.append(int(matched[4]))
                event.append(matched[5])
                virtual_addr.append(int(matched[6], base=16))
                cache_result.append(matched[7])
                latency.append(int(matched[9]))
                physical_addr.append(int(matched[10], base=16))

        events_df = pd.DataFrame({
            "cpuid": cpuid,
            "time": timestamp,
            "period": period,
            "event": event,
            "virt": virtual_addr,
            "phys": physical_addr,
            "latency": latency,
            "cache_result": cache_result,
        })
        # Express the times relative to the initial timestamp.
        events_df['time'] = events_df['time'] - initial_timestamp
        events_df['cpu_node'] = events_df['cpuid'].map(cpu_to_node)
        events_df['memory_node'] = (events_df['phys'] >= NODE_1_PHYS_ADDR_START).astype(int)
        return events_df

    def read_perf_data_with_latency(self, data_file_path: str, executable: Optional[str] = None, time_option: Optional[str] = None) -> pd.DataFrame:
        """Dump ``data_file_path`` with ``perf script`` and return the parsed events.

        Args:
            data_file_path: perf data file to read.
            executable: optional command name used to filter the samples.
            time_option: optional ``--time`` filter, for example ``"10%-20%"``.

        Returns:
            The DataFrame built from the temporary text log.
        """
        initial_timestamp = self.__extract_perf_data_file_with_latency(data_file_path, executable, time_option)
        try:
            return self.__parse_events_with_latency(self.log_file_path, initial_timestamp)
        finally:
            # Remove the temporary log even when parsing fails. A log that was
            # never created must not hide the original error.
            try:
                os.remove(self.log_file_path)
            except FileNotFoundError:
                pass
    
    