In [None]:
import pathlib
import pandas as pd
from typing import List, Dict, Any, Tuple
import matplotlib.pyplot as plt

In [None]:
RANK = 4  # number of ranks; read_mem looks for mem-0.txt .. mem-<RANK-1>.txt


class ExpDir:
    """Results of one experiment, loaded from a single output directory.

    The directory name encodes the experiment settings as
    ``<dataset>-<config>-<seq_len>-<model>-<window>``. A window of ``-1``
    (directory name ending in ``--1``) is the "no window" special case and is
    stored as ``"0"``.

    Attributes set by the constructor:
        exp_dir: the directory that was passed in.
        name: directory name after the trailing ``--1`` -> ``-0`` rewrite.
        dataset, config, model, window: name fields, kept as strings.
        seq_len: the ``seq_len`` name field converted to ``int``.
        rank_mem: per-rank memory values, see ``read_mem``.
        throughput: value of the ``flops`` file, see ``read_throughput``.
        key: ``(dataset, config, seq_len, model)``.
    """

    def __init__(self, exp_dir: pathlib.Path):
        """Parse the directory name and load the result files.

        Raises:
            ValueError: if the name does not consist of exactly five
                ``-``-separated fields, or ``seq_len`` is not an integer.
        """
        self.exp_dir = exp_dir

        name = str(exp_dir.name)
        # Special case for "no window": only a *trailing* "--1" is rewritten.
        # A blanket str.replace would also turn e.g. "...--16" into "...-06"
        # and silently report a wrong window.
        no_window_suffix = "--1"
        if name.endswith(no_window_suffix):
            name = name[: -len(no_window_suffix)] + "-0"
        self.name = name

        fields = name.split("-")
        if len(fields) != 5:
            raise ValueError(
                f"cannot parse experiment dir name {exp_dir.name!r}: expected "
                "<dataset>-<config>-<seq_len>-<model>-<window>, "
                f"got {len(fields)} '-'-separated fields"
            )
        self.dataset, self.config, seq_len, self.model, self.window = fields
        self.seq_len = int(seq_len)

        self.rank_mem = self.read_mem()
        self.throughput = self.read_throughput()
        self.key = self._get_key()

    def __repr__(self):
        return f"ExpDir({self.exp_dir.name}), seq_len={self.seq_len}"

    @staticmethod
    def fmt_bytes2gb(res: str) -> float:
        """Convert a byte count given as text to GB (divides by 1024**3), rounded to 3 decimals."""
        return round(float(res) / 1024 / 1024 / 1024, 3)

    @staticmethod
    def _read_file(res_file, formatter) -> Any:
        """Read ``res_file`` as text and return ``formatter(<file content>)``."""
        with open(res_file, "r") as f:
            res = f.read()
        return formatter(res)

    def read_mem(self) -> List[float]:
        """Read ``mem-<rank>.txt`` for every rank in ``range(RANK)``.

        Returns:
            One value per rank, converted with ``fmt_bytes2gb``. If any rank's
            file is missing a warning is printed and an empty list is returned
            (values already read for lower ranks are discarded).
        """
        res_list = []
        for rank in range(RANK):
            res_file = self.exp_dir / f"mem-{rank}.txt"
            if not res_file.exists():
                print(f"[warning]: {res_file} not exists, return empty list")
                return []
            res_list.append(self._read_file(res_file, self.fmt_bytes2gb))
        return res_list

    def read_throughput(self) -> float:
        """Read the ``flops`` file as a float.

        Returns:
            The parsed value, or ``0.0`` (after printing a warning) when the
            file does not exist.
        """
        res_file = self.exp_dir / "flops"
        if not res_file.exists():
            print(f"[warning]: {res_file} not exists, return 0 as throughput")
            return 0.0
        return self._read_file(res_file, float)

    def _get_key(self) -> Tuple:
        """Return ``(dataset, config, seq_len, model)`` — every name field except the window."""
        return self.dataset, self.config, self.seq_len, self.model

In [None]:
class ExpGroup:
    """All experiments found directly under one group directory.

    Attributes:
        exp_list: one ``ExpDir`` per sub-directory of the group directory.
        df: summary table with one row per experiment, see ``_get_df``.
    """

    def __init__(self, exp_group_dir):
        """Load every sub-directory of ``exp_group_dir`` as an ``ExpDir``; plain files are skipped."""
        self.exp_list = [
            ExpDir(child) for child in exp_group_dir.iterdir() if child.is_dir()
        ]
        self.df = self._get_df()

    def _get_df(self) -> pd.DataFrame:
        """Build the summary table from ``self.exp_list``.

        Columns, in order: ``rank-0`` .. ``rank-<RANK-1>``, ``throughput``,
        ``seq_len``, ``window``, ``model``, ``dataset``, ``config``.
        ``window`` and ``seq_len`` are cast to int; ``throughput`` and the
        rank columns to float.

        An experiment whose ``rank_mem`` does not hold exactly ``RANK`` values
        (it is empty when a mem file was missing) gets NaN in every rank
        column. Rows are assembled by column name, so a short memory list can
        no longer shift ``throughput`` into a rank column, and an empty group
        yields an empty frame with the full column set instead of raising.
        """
        rank_cols = [f"rank-{i}" for i in range(RANK)]
        meta_cols = ["throughput", "seq_len", "window", "model", "dataset", "config"]
        nan = float("nan")

        records = []
        for exp in self.exp_list:
            mem = list(exp.rank_mem)
            if len(mem) != RANK:
                # Incomplete memory data: mark all ranks as missing rather
                # than guessing which ranks the values belong to.
                mem = [nan] * RANK
            row = dict(zip(rank_cols, mem))
            row["throughput"] = exp.throughput
            row["seq_len"] = exp.seq_len
            row["window"] = exp.window
            row["model"] = exp.model
            row["dataset"] = exp.dataset
            row["config"] = exp.config
            records.append(row)

        df = pd.DataFrame(records, columns=rank_cols + meta_cols)

        # window arrives as a string parsed from the directory name.
        dtypes = {col: float for col in rank_cols}
        dtypes.update({"window": int, "seq_len": int, "throughput": float})
        return df.astype(dtypes)

In [None]:
# Load every experiment of this run and order the rows by sequence length,
# then by window within each sequence length.
exp_group = ExpGroup(pathlib.Path("../output/20231115_1"))
df = exp_group.df.sort_values(by=["seq_len", "window"])
# Display just the columns relevant to the throughput comparison.
df.loc[:, ["throughput", "seq_len", "window"]]

In [None]:
# Throughput as a function of window size, one line per sequence length.
fig, ax = plt.subplots()

for seq_len, group in df.groupby("seq_len"):
    ax.plot(
        group.loc[:, "window"],
        group.loc[:, "throughput"],
        marker="o",
        label=f"Seq Len {seq_len}",
    )

# Axis labels and title in a single call.
ax.set(
    xlabel="Window",
    ylabel="Throughput",
    title="Throughput vs. Window for Different Seq Lengths",
)

ax.grid()
ax.legend()

# Render the figure.
plt.show()