In [1]:
import os
import pandas
import pathlib
import numpy
import collections
import abc
import numpy.typing

# Generate data (not needed if loading data)

In [None]:
os.chdir("..")

In [536]:
import pandas
import functools
from typing import Mapping, Callable
from util import flatten1
from prov_collectors import PROV_COLLECTORS
from workloads import WORKLOADS
import operator

rel_qois = ["cputime", "walltime", "memory"]
abs_qois = ["storage", "n_ops", "n_unique_files"]
output = pathlib.Path("output")
output.mkdir(exist_ok=True)

In [450]:
from experiment import get_results
from workloads import WORKLOAD_GROUPS
from prov_collectors import PROV_COLLECTOR_GROUPS
from util import flatten1

collectors = list(flatten1([
    PROV_COLLECTOR_GROUPS[collector_name]
    for collector_name in ["fast"]
]))
workloads = list(flatten1([
    WORKLOAD_GROUPS[workload_name]
    for workload_name in ["working"]
]))
iterations = 4
ignore_failures = True
rerun = False
df = get_results(
    collectors,
    workloads,
    iterations=iterations,
    seed=0,
    ignore_failures=ignore_failures,
    rerun=rerun,
)

  0%|                                                                                                                               | 0/5000 [00:00<?, ?it/s]

 > Construct DataFrame: running


 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 4920/5000 [03:46<00:02, 32.56it/s]

 > Construct DataFrame > setup postmark: running
 > Construct DataFrame > setup postmark: 0.0s
 > Construct DataFrame > setup fsatrace: running
 > Construct DataFrame > setup fsatrace: 0.0s
 > Construct DataFrame > run postmark in fsatrace: running
 > Construct DataFrame > run postmark in fsatrace: 0.0s (err)


DEBUG:charmonium.logger: > Construct DataFrame > run postmark in fsatrace: 0.0s (err)


 > Construct DataFrame: 226.9s (err)


DEBUG:charmonium.logger: > Construct DataFrame: 226.9s (err)


DBusBaseError: [err -123]: Could not open a bus to DBus
This is DBusBaseError, a base error for DBus (i bet you did not see that coming) if you need a special error, enhance pystemd.sysdexc module!.

 98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 4920/5000 [04:01<00:02, 32.56it/s]

In [97]:
agged = (
    df
    .groupby(["collector", "workload"], observed=True, as_index=True)
    .agg(**{
        **{
            f"{qoi}_std": pandas.NamedAgg(qoi, "std")
            for qoi in abs_qois + rel_qois
        },
        **{
            f"{qoi}_mean": pandas.NamedAgg(qoi, "mean")
            for qoi in abs_qois + rel_qois
        },
        **{
            f"{qoi}_low": pandas.NamedAgg(qoi, lambda data: numpy.percentile(data, 5))
            for qoi in abs_qois + rel_qois
        },
        **{
            f"{qoi}_high": pandas.NamedAgg(qoi, lambda data: numpy.percentile(data, 95))
            for qoi in abs_qois + rel_qois
        },
        **{
            f"{qoi}_sorted": pandas.NamedAgg(qoi, lambda data: list(sorted(data)))
            for qoi in abs_qois + rel_qois
        },
        "op_type_counts_sum": pandas.NamedAgg("op_type_counts", lambda op_type_freqs: functools.reduce(operator.add, op_type_freqs, collections.Counter())),
        "count": pandas.NamedAgg("walltime", lambda walltimes: len(walltimes)),
    })
    .assign(**{
        **{
            f"{qoi}_rel": lambda df, qoi=qoi: df[f"{qoi}_std"] / df[f"{qoi}_mean"]
            for qoi in abs_qois + rel_qois
        },
        "rel_slowdown": lambda df: df["walltime_mean"] / df.loc["noprov"]["walltime_mean"],
    })
    .assign(**{
        "log_rel_slowdown": lambda df: numpy.log(df["rel_slowdown"]),
    })
)

## Feature engineering

In [98]:
import collections
all_syscalls = collections.Counter()
for counter in df[df["collector"] == "strace"]["op_type_counts"]:
    all_syscalls += counter
all_syscalls

Counter({'newfstatat': 16650554,
         'close': 10038172,
         'openat': 9253868,
         'unlink': 2820321,
         'creat': 1267063,
         'readlink': 1092232,
         'utimensat': 789797,
         'exit_group': 261135,
         'clone': 256629,
         'access': 185192,
         'execve': 133982,
         'connect': 48001,
         'mkdirat': 45008,
         'dup2': 21375,
         'mkdir': 19360,
         'rmdir': 16841,
         'clone3': 14702,
         'pipe2': 12540,
         'rename': 12121,
         'exit': 11301,
         'chdir': 8444,
         'vfork': 4944,
         'accept': 2675,
         'chmod': 2641,
         'fchmod': 2311,
         'dup': 2238,
         'bind': 2153,
         'symlink': 513,
         'accept4': 22,
         'link': 9,
         'ftruncate': 7})

In [515]:
syscall_groups = {
    "socket": {"accept4", "connect", "bind", "accept"},
    "file": {"newfstatat", "readlink", "access", "chmod", "fchmod", "mkdir", "rmdir", "mkdirat", "rename", "unlink", "link", "symlink"},
    "fd": {"creat", "open", "openat"},
    "clone": {"clone", "clone3"},
    "fork": {"vfork"},
    "exec": {"clone", "clone3"},
}
syscall_groups["other"] = {
    syscall
    for syscall in all_syscalls
    if not any(syscall in group for group in syscall_groups.values())
}
syscall_groups

{'socket': {'accept', 'accept4', 'bind', 'connect'},
 'file': {'access',
  'chmod',
  'fchmod',
  'link',
  'mkdir',
  'mkdirat',
  'newfstatat',
  'readlink',
  'rename',
  'rmdir',
  'symlink',
  'unlink'},
 'fd': {'creat', 'open', 'openat'},
 'clone': {'clone', 'clone3'},
 'fork': {'vfork'},
 'exec': {'clone', 'clone3'},
 'other': {'chdir',
  'close',
  'dup',
  'dup2',
  'execve',
  'exit',
  'exit_group',
  'ftruncate',
  'pipe2',
  'utimensat'}}

In [516]:
syscall_groups = {
    "socket": {"accept4", "connect", "bind", "accept"},
    "metadata": {"newfstatat", "access"},
    "chmod": {"chmod", "fchmod"},
    "dir": {"mkdir", "rmdir", "mkdirat", "rename", "unlink", "link", "readlink", "symlink"},
    "file": {"creat", "open", "openat"},
    "exec": {"execve", "vfork"},
    "clone": {"clone", "clone3"},
    "exits": {"exit", "exit_group"},
    "dups": {"dup", "dup2"},
}
syscall_groups["other"] = {
    syscall
    for syscall in all_syscalls
    if not any(syscall in group for group in syscall_groups.values())
}
syscall_groups

{'socket': {'accept', 'accept4', 'bind', 'connect'},
 'metadata': {'access', 'newfstatat'},
 'chmod': {'chmod', 'fchmod'},
 'dir': {'link',
  'mkdir',
  'mkdirat',
  'readlink',
  'rename',
  'rmdir',
  'symlink',
  'unlink'},
 'file': {'creat', 'open', 'openat'},
 'exec': {'execve', 'vfork'},
 'clone': {'clone', 'clone3'},
 'exits': {'exit', 'exit_group'},
 'dups': {'dup', 'dup2'},
 'other': {'chdir', 'close', 'ftruncate', 'pipe2', 'utimensat'}}

In [517]:
noprov = agged.loc["noprov"]
strace = agged.loc["strace"]
features_df = pandas.DataFrame({
    "cputime_per_sec": noprov["cputime_mean"] / noprov["walltime_mean"],
    "memory_mean": noprov["memory_mean"],
    **{
        group_name + "_syscalls_per_sec": strace["op_type_counts_sum"].map(lambda op_type_counts: sum(
            op_type_counts[syscall_name]
            for syscall_name in syscall_names
        )) / (noprov["walltime_mean"] * noprov["count"])
        for group_name, syscall_names in syscall_groups.items()
    },
    "n_ops_per_sec": strace["n_ops_mean"] / noprov["walltime_mean"],
})
features_df

Unnamed: 0_level_0,cputime_per_sec,memory_mean,socket_syscalls_per_sec,metadata_syscalls_per_sec,chmod_syscalls_per_sec,dir_syscalls_per_sec,file_syscalls_per_sec,exec_syscalls_per_sec,clone_syscalls_per_sec,exits_syscalls_per_sec,dups_syscalls_per_sec,other_syscalls_per_sec,n_ops_per_sec
workload,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
a-data-sci,1.846697,5.019648e+08,1.505072,1822.089894,0.082848,1.491264,722.116985,4.473792,1.242720,1.132256,1.118448,474.014838,3029.268116
archive,0.641795,8.645291e+06,299.254091,122776.062947,0.000000,0.000000,41635.028198,75.930143,75.185729,76.178280,0.744413,40435.778592,205374.162394
archive bzip2,1.012189,9.310208e+06,18.078526,3815.702057,0.000000,0.000000,1370.741457,8.965373,4.507317,4.605837,7.241263,1267.073177,6496.915007
archive gzip,1.026436,3.751936e+06,85.001375,14739.397643,0.000000,0.000000,5254.341122,40.895011,17.165106,17.278782,36.149030,4924.566478,25114.794549
archive pbzip2,5.782112,5.392111e+07,53.239038,10807.663452,0.000000,0.000000,3970.368904,25.335396,81.420327,84.890930,19.851845,3614.632180,18657.402070
...,...,...,...,...,...,...,...,...,...,...,...,...,...
unarchive bzip2,1.085411,1.425408e+07,26.959380,3829.953241,0.000000,201.189403,3604.576401,13.569107,13.479690,13.703234,13.502044,7011.249493,14728.181994
unarchive gzip,1.000682,1.138278e+07,116.492802,11320.507291,0.000000,580.528912,10481.126988,58.439910,38.895437,39.475966,58.246401,20418.685404,43112.399111
unarchive pbzip2,3.238676,4.018859e+07,52.587740,6911.684236,0.000000,356.709607,6608.789943,23.997116,154.555684,198.471198,23.917918,12540.948330,26871.661772
unarchive pigz,0.933855,1.131042e+07,86.947520,12339.482427,0.000000,648.862088,11706.481413,43.690047,108.359969,154.140794,43.473760,22659.633941,47791.071958


## Actually creating Numpy arrays

In [518]:
tmp_df = agged.reset_index().pivot(index="collector", columns="workload", values="log_rel_slowdown")
noprov_index = list(agged.index.levels[0]).index("noprov")
strace_index = list(agged.index.levels[0]).index("strace")

assert all(
    workload0 == workload1
    for workload0, workload1 in zip(tmp_df.columns, features_df.index)
)

systems_by_benchmarks = tmp_df.values
benchmarks_by_features = features_df.values


collector_names = agged.index.levels[0]
benchmark_names = agged.index.levels[1]
feature_names = features_df.columns


n_systems, n_benchmarks = systems_by_benchmarks.shape
_, n_features = benchmarks_by_features.shape

In [None]:
systems_by_benchmarks = numpy.load("output/systems_by_benchmarks.npy")
benchmarks_by_features = numpy.load("output/benchmarks_by_features.npy")
collector_names = pathlib.Path("output/collectors.txt").read_text().split("\n")
benchmark_names = pathlib.Path("output/benchmark_names.txt").read_text().split("\n")
feature_names = pathlib.Path("output/feature_names.txt").read_text().split("\n")


n_systems, n_benchmarks = systems_by_benchmarks.shape
_, n_features = benchmarks_by_features.shape

# Let's play the new system game

It is more of a dialogue.

- **Given** integer N and workloads x benchmark matrix
- **Select** N workloads
- **Given** new system's log slowdown ratio on N selected workloads
- **Predict** new system's log slowdown ratio on all other workloads

I initially scored this game by cross-validated root-mean-squared-error. However, I've found that even with cross-validation, more complex models are not "punished" enough. So I decided to also include Akaike Information Criterion (modified for small sample size). But in order to compute the AIC, one has to know the likelihood function. If you're lazy, your model has uninformitave priors, and your errors are normally distributed (although with unknown variance), you can use `naive_log_likelihood`, which computes the likelihood-maximizing variance, and returns the likelihood of the data based on that.

In [520]:
class NewSystemPredictor:
    @abc.abstractmethod
    def select_benchmarks(
        self,
        k: int,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[list[int], numpy.float64]:
        """
        k: number of benchmarks to select
        systems_by_benchmarks: array where the (i,j)th element is the log of the ith system's slowdown on the jth benchmark
        benchmarks_by_features: array where the (j,m)th element is the mth feature of the jth benchmark

        returns a tuple containing:
          - k benchmarks to select
          - the log-liklihood
        
        Liklihood is the probability of observing this data given the parameters you inferred
        Used to compute the Akaike Information Criterion.
        Return numpy.NaN if you just don't care.
        """

    @abc.abstractmethod
    def predict_new_system(
        new_systems_by_selected_benchmarks: numpy.typing.NDArray,
    ) -> numpy.typing.NDArray:
        """
        new_system_by_selected_benchmarks: array where the (g,p)th element is the log of the gth new system's slowdown on the pth *selected* benchmark

        returns an array where the (g,q)th element is the log of the gth new system's slowdown on the qth *unselected* benchmark
        """

    @abc.abstractmethod
    def n_parameters(self) -> int:
        """
        Returns the number of parameters used to make this estimation.
        Used to calculate the Akaike Information Criterion.
        """

In [523]:
import sklearn.model_selection


mean_absolute_error = lambda a, b: numpy.mean(numpy.fabs(a - b))

root_mean_squared_error = lambda a, b: numpy.sqrt(numpy.mean((a-b)**2))


def aicc(k: int, log_likelihood: float, n_points: int) -> float:
    aic = 2 * k - 2 * log_likelihood
    return aic + (2 * k**2 + 2 * k) / (n_points - k - 1)


def test_system_predictors(
    predictors: list[NewSystemPredictor]
) -> None:
    systems = list(range(n_systems))
    cv_splitter = sklearn.model_selection.LeaveOneOut()
    print("RMSE (lower is better), stddev RMSE (lower is better), AICc (higher is better)")
    for predictor in predictors:
        results = []
        selected = collections.Counter()
        for train_systems, test_systems in cv_splitter.split(systems):
            selected_benchmarks, _ = predictor.select_benchmarks(
                systems_by_benchmarks[train_systems],
                benchmarks_by_features,
            )
            unselected_benchmarks = [
                benchmark
                for benchmark in range(n_benchmarks)
                if benchmark not in selected_benchmarks
            ]
            predicted = predictor.predict_new_systems(
                systems_by_benchmarks[test_systems, :][:, selected_benchmarks],
            )
            for benchmark in selected_benchmarks:
                selected[benchmark] += 1
            actual = systems_by_benchmarks[test_systems, :][:, unselected_benchmarks]
            results.append(root_mean_squared_error(actual, predicted))
        result_mean = numpy.mean(results)
        result_std = numpy.std(results)
        _, log_likelihood = predictor.select_benchmarks(systems_by_benchmarks, benchmarks_by_features)
        n_datapoints = len(systems_by_benchmarks.flatten()) + len(benchmarks_by_features.flatten())
        my_aicc = aicc(predictor.n_parameters(), log_likelihood, n_datapoints)
        print(
            f"{result_mean:.2f}",
            f"{result_std:.2f}",
            f"{numpy.log(my_aicc):.2f}",
            predictor,
            {benchmark_names[benchmark]: count for benchmark, count in selected.items()},
        )

Assume a model predicts $\hat{r}_i = f(r_i) + \eta_i$ where $\eta_i$ is normally distributed around 0 with unknown variance.

Let's find the variance which maximizes likelihood.

First, I'll write down the PDF (which is likelihood function) for the Normal distribution, where $\mu$ is the prediction and $x$ is the observation:

$$f(x | \mu, \sigma) = \frac{1}{\sigma * \sqrt{2\pi}) * \exp(-\frac{1}{2} \left( \frac{x-\mu}{\sigma} \right^2 $$

Then log both sides.

$$\log f(x | \mu, \sigma) = -\log(\sigma \sqrt{2\pi}) - \frac{1}{2} \left( \frac{x-\mu}{\sigma} \right)^2 $$

In order to maximize, take a derivative with $\sigma$.

$$\frac{d}{d\sigma} \log f(x | \mu, \sigma) = -\frac{1}{\sigma} + \frac{(x - \mu)^2}{\sigma^3} $$

Set that to zero.

$$0 = \frac{d}{d\sigma} \log f(x | \mu, \sigma) \implies \frac{1}{\sigma} = \frac{(x - \mu)^2}{\sigma^3} \implies \sigma = |x - \mu|$$

Therefore $\log f(x | \mu, \sigma)$ has a maximum at $\sigma = x - \mu|$.

We can plug that back in to the log-likelihood function.

In [540]:
def naive_log_likelihood(actual, predicted) -> float:
    std = numpy.clip(actual - predicted, 1e-2, None)
    # Some of the values we hit "dead-on"
    # This predicts the sigma should be 0, which is wrong
    # It should actually be a small positive number.
    
    # Plugging this in to the log PDF above
    return numpy.sum(-1/2*((actual - predicted)/std)**2 - numpy.log(std) + 1/2*numpy.log(2*numpy.pi))

In [541]:
import scipy.linalg.interpolative

class InitialSystemPredictor(NewSystemPredictor):
    """
    This is a simple predictor just to test the mechanics.

    It simply selects self.benchmarks.
    Then it runs a regression to all the unselected benchmarks based on the selected benchmarks.
    That's it.
    """
    def __init__(self, benchmarks: list[int]) -> None:
        self.benchmarks = benchmarks

    def select_benchmarks(
        self,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[list[int], numpy.float64]:
        unselected_benchmarks = [
            benchmark
            for benchmark in range(systems_by_benchmarks.shape[1])
            if benchmark not in self.benchmarks
        ]
        self.coeffs = numpy.linalg.pinv(systems_by_benchmarks[:, self.benchmarks]) @ systems_by_benchmarks[:, unselected_benchmarks]
        log_likelihood = naive_log_likelihood(
            systems_by_benchmarks[:, unselected_benchmarks],
            systems_by_benchmarks[:, self.benchmarks] @ self.coeffs,
        )
        # the selected benchmarks will get probability = 1, log likelihood = 0, so you can imagine I wrote ... + 0 + 0 + 0 to the end
        return self.benchmarks, log_likelihood

    def predict_new_systems(
        self,
        new_systems_by_selected_benchmarks: numpy.typing.NDArray,
    ) -> numpy.typing.NDArray:
        return new_systems_by_selected_benchmarks @ self.coeffs

    def n_parameters(self) -> int:
        return len(self.coeffs)

    def __str__(self) -> str:
        return f"{self.__class__.__name__}({self.benchmarks!r})"

In [538]:
import scipy.linalg.interpolative

class InterpolativeDecompositionSystemPredictor(NewSystemPredictor):
    """
    This method uses Interpolative Decomposition (ID).

    ID factors a matrix A into B @ C.
    It selects k columns of A, and puts those in B.
    It puts the identity matrix in the corresponding columns of C.
    The remaining N - k columns of A are predicted from a linear regression on the k columns of A (equivalently, all the columns of B).

    This method should be pretty good.
    """
    def __init__(self, k: int, use_features: bool) -> None:
        self.k = k
        self.use_features = use_features

    def select_benchmarks(
        self,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[list[int], numpy.float64]:
        if self.use_features:
            data = numpy.vstack([
                systems_by_benchmarks,
                benchmarks_by_features.T,
            ])
        else:
            data = systems_by_benchmarks
        idx, proj = scipy.linalg.interpolative.interp_decomp(data, self.k, rand=False)
        self.idx = idx
        self.proj = proj
        # skel = scipy.linalg.interpolative.reconstruct_skel_matrix(data, self.k, idx)
        # data_est = scipy.linalg.interpolative.reconstruct_matrix_from_id(skel, idx, proj)[:len(systems_by_benchmarks), :]
        log_likelihood = naive_log_likelihood(
            systems_by_benchmarks[:, self.idx[self.k:]],
            systems_by_benchmarks[:, self.idx[:self.k]] @ self.proj,
        )
        return idx[:self.k], log_likelihood

    def predict_new_systems(
        self,
        new_systems_by_selected_benchmarks: numpy.typing.NDArray,
    ) -> numpy.typing.NDArray:
        return new_systems_by_selected_benchmarks @ self.proj

    def n_parameters(self) -> int:
        return len(self.proj.flatten())

    def __str__(self) -> str:
        return f"{self.__class__.__name__}({self.k}, {self.use_features})"

In [535]:
import scipy.linalg.interpolative

class ClusteringSystemPredictor(NewSystemPredictor):
    def __init__(self, k: int, use_features: bool, whiten: bool) -> None:
        self.k = k
        self.use_features = use_features
        self.whiten = whiten

    def select_benchmarks(
        self,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[list[int], numpy.float64]:
        if self.use_features:
            data = numpy.vstack([
                systems_by_benchmarks,
                benchmarks_by_features.T,
            ])
        else:
            data = systems_by_benchmarks
        self.pca = sklearn.decomposition.PCA(
            n_components=self.k,
            whiten=self.whiten,
            random_state=0,
        )
        self.rotated_data = self.pca.fit_transform(data)
        self.kmeans = sklearn.cluster.KMeans(n_clusters=self.k, random_state=0)
        self.kmeans.fit(small_data)
        self.selected_benchmarks = [
            min(
                self.rotated_data.T,
                key=lambda benchmark: scipy.spatial.distance.euclidean(benchmark, cluster_center),
            )
            for cluster_center in self.kmeans.cluster_centers_
        ]
        unselected_benchmarks = [
            benchmark
            for benchmark in range(systems_by_benchmarks.shape[1])
            if benchmark not in self.selected_benchmarks
        ]
        log_likelihood = naive_log_likelihood(
            systems_by_benchmarks[:, unselected_benchmarks],
            systems_by_benchmarks[:, selected_benchmarks],
        )
        return idx[:self.k], log_likelihood

    def plot(self, ax: "maptloltib.axes.Axes") -> None:
        import matplotlib.cm
        colors = sorted(matplotlib.cm.Dark2.colors + matplotlib.cm.Set2.colors)
        for i in range(len(self.rotated_data)):
            ax.plot(
                self.rotated_data[i, 0],
                self.rotated_data[i, 1], 
                color=colors[self.kmeans.labels_[i]],
                marker="o",
            )

    def predict_new_systems(
        self,
        new_systems_by_selected_benchmarks: numpy.typing.NDArray,
    ) -> numpy.typing.NDArray:
        return new_systems_by_selected_benchmarks @ self.proj

    def n_parameters(self) -> int:
        return len(self.proj.flatten())

    def __str__(self) -> str:
        return f"{self.__class__.__name__}({self.k}, {self.use_features})"

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 21)

In [542]:
%%html
<!-- Disable line wrapping in cell outputs to make the output more readable -->
<style>
div.jp-OutputArea-output pre {
    white-space: pre;
}
</style>

In [543]:
test_system_predictors([
    InitialSystemPredictor([0]),
    InitialSystemPredictor([0, 1]),
    InitialSystemPredictor([10, 20, 30]),
    *[
        InterpolativeDecompositionSystemPredictor(k, True)
        for k in range(1, n_systems + n_features)
    ],
    *[
        InterpolativeDecompositionSystemPredictor(k, False)
        for k in range(1, n_systems)
    ],
])

1.06 1.21 11.62 InitialSystemPredictor([0]) {'a-data-sci': 5}
1.12 0.90 14.09 InitialSystemPredictor([0, 1]) {'a-data-sci': 5, 'archive': 5}
1.26 1.06 11.68 InitialSystemPredictor([10, 20, 30]) {'blastn-NM_003949': 5, 'blastn-NM_024506': 5, 'blastn-NM_068205': 5}
0.56 0.51 11.97 InterpolativeDecompositionSystemPredictor(1, True) {'megablast-NG_008953': 5}
0.53 0.50 11.71 InterpolativeDecompositionSystemPredictor(2, True) {'megablast-NG_008953': 5, 'postmark': 5}
0.52 0.50 11.65 InterpolativeDecompositionSystemPredictor(3, True) {'megablast-NG_008953': 5, 'postmark': 5, 'archive': 5}
0.53 0.50 11.81 InterpolativeDecompositionSystemPredictor(4, True) {'megablast-NG_008953': 5, 'postmark': 5, 'archive': 5, 'gcc-hello-world': 5}
0.53 0.51 11.71 InterpolativeDecompositionSystemPredictor(5, True) {'megablast-NG_008953': 5, 'postmark': 5, 'archive': 5, 'gcc-hello-world': 5, 'true': 5}
0.54 0.51 11.70 InterpolativeDecompositionSystemPredictor(6, True) {'megablast-NG_008953': 5, 'postmark': 5, 

  f"{numpy.log(my_aicc):.2f}",


In [529]:
def test_benchmark_predictors(
    predictors: list[NewBenchmarkPredictor],
) -> None:
    benchmarks = list(range(n_benchmarks))
    test_size = 0.1
    cv_splitter = sklearn.model_selection.ShuffleSplit(n_splits=10, test_size=test_size, random_state=0)
    for predictor in predictors:
        results = []
        for train_benchmarks, test_benchmarks in cv_splitter.split(benchmarks):
            predicted, _ = predictor.predict_new_benchmark(
                systems_by_benchmarks[:, train_benchmarks],
                benchmarks_by_features[train_benchmarks, :],
                benchmarks_by_features[test_benchmarks, :],
            )
            actual = systems_by_benchmarks[:, test_benchmarks]
            results.append(root_mean_squared_error(actual, predicted))
        result_mean = numpy.mean(results)
        result_std = numpy.std(results)
        _, log_likelihood = predictor.predict_new_benchmark(systems_by_benchmarks, benchmarks_by_features, benchmarks_by_features)
        n_datapoints = len(systems_by_benchmarks.flatten()) + len(benchmarks_by_features.flatten())
        my_aicc = aicc(predictor.n_parameters(), log_likelihood, n_datapoints)
        print(
            f"{result_mean:.2f}",
            f"{result_std:.2f}",
            f"{numpy.log(my_aicc):.2f}",
            predictor,
        )

In [546]:
import scipy.linalg.interpolative

class Regression(NewBenchmarkPredictor):
    """
    This method simply regresses performance on the full set of featuers.

    No linear method should be able to do better in RMSE, but dimensionality reduction may help with AIC.
    """
    def n_parameters(self) -> int:
        return len(self.systems_by_features.flatten())

    def predict_new_benchmark(
        self,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
        new_benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[numpy.typing.NDArray, numpy.float64]:
        self.systems_by_features = systems_by_benchmarks @ numpy.linalg.pinv(benchmarks_by_features.T)
        log_likelihood = naive_log_likelihood(
            self.systems_by_features @ benchmarks_by_features.T,
            systems_by_benchmarks,
        )
        return self.systems_by_features @ new_benchmarks_by_features.T, log_likelihood

    def __str__(self) -> str:
        return f"{self.__class__.__name__}()"

In [547]:
import scipy.linalg.interpolative

class LowRankMatrixFactorization(NewBenchmarkPredictor):
    """
    Like Regression, but use a low-rank compression
    """
    def __init__(self, dim: int) -> None:
        self.dim = dim

    def n_parameters(self) -> int:
        return len(self.a.flatten()) + len(self.b.flatten())

    def predict_new_benchmark(
        self,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
        new_benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[numpy.typing.NDArray, numpy.float64]:
        tmp = systems_by_benchmarks @ numpy.linalg.pinv(benchmarks_by_features.T)
        u, s, vh = numpy.linalg.svd(tmp, full_matrices=False)
        self.a = (u[:, :self.dim] * s[:self.dim])
        self.b = vh[:self.dim, :]
        log_likelihood = naive_log_likelihood(
            self.a @ self.b @ benchmarks_by_features.T,
            systems_by_benchmarks,
        )
        return self.a @ self.b @ new_benchmarks_by_features.T, log_likelihood

    def __str__(self) -> str:
        return f"{self.__class__.__name__}({self.dim})"

In [548]:
import scipy.linalg.interpolative

class GreedySubsetMatrixFactorization(NewBenchmarkPredictor):
    """
    This method tries to select only dim features.

    This is subtly different from "compressing to a matrix of rank dim".

    Using only dim features, means the other coefficients **have to be** zero.

    It's greedy because it picks the best feature, and adds next best given the current set, etc.
    """
    def __init__(self, dim: int) -> None:
        self.dim = dim

    def n_parameters(self) -> int:
        return len(self.systems_by_features.flatten())

    def predict_new_benchmark(
        self,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
        new_benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[numpy.typing.NDArray, numpy.float64]:
        def test_goodness(features: list[int]) -> numpy.float64:
            systems_by_features = systems_by_benchmarks @ numpy.linalg.pinv(benchmarks_by_features[:, features].T)
            return numpy.sum((systems_by_features @ benchmarks_by_features[:, features].T - systems_by_benchmarks)**2)
        selected_features = []
        while len(selected_features) < self.dim:
            unselected_features = [
                feature
                for feature in range(benchmarks_by_features.shape[1])
                if feature not in selected_features
            ]
            selected_features = max([
                selected_features + [candidate_feature]
                for candidate_feature in unselected_features
            ], key=test_goodness)
        self.features = selected_features
        self.systems_by_features = systems_by_benchmarks @ numpy.linalg.pinv(benchmarks_by_features[:, self.features].T)
        log_likelihood = naive_log_likelihood(
            self.systems_by_features @ benchmarks_by_features[:, self.features].T,
            systems_by_benchmarks
        )
        return self.systems_by_features @ new_benchmarks_by_features[:, self.features].T, log_likelihood

    def __str__(self) -> str:
        features = ", ".join(feature_names[i] for i in self.features)
        return f"{self.__class__.__name__}({self.dim}): {features}"

In [533]:
test_benchmark_predictors([
    Regression(),
    *[
        LowRankMatrixFactorization(i)
        for i in range(1, n_features + 1)
    ],
    *[
        GreedySubsetMatrixFactorization(i)
        for i in range(1, n_features + 1)
    ],
])

0.59 0.33 13.99 MatrixFactorization()
0.60 0.31 14.25 LowRankMatrixFactorization(1)
0.59 0.31 14.01 LowRankMatrixFactorization(2)
0.59 0.33 14.04 LowRankMatrixFactorization(3)
0.59 0.33 13.99 LowRankMatrixFactorization(4)
0.59 0.33 13.99 LowRankMatrixFactorization(5)
0.59 0.33 13.99 LowRankMatrixFactorization(6)
0.59 0.33 13.99 LowRankMatrixFactorization(7)
0.59 0.33 13.99 LowRankMatrixFactorization(8)
0.59 0.33 13.99 LowRankMatrixFactorization(9)
0.59 0.33 13.99 LowRankMatrixFactorization(10)
0.59 0.33 13.99 LowRankMatrixFactorization(11)
0.59 0.33 13.99 LowRankMatrixFactorization(12)
0.59 0.33 13.99 LowRankMatrixFactorization(13)
0.74 0.09 15.73 GreedySubsetMatrixFactorization(1): chmod_syscalls_per_sec
0.73 0.08 15.69 GreedySubsetMatrixFactorization(2): chmod_syscalls_per_sec, memory_mean
0.71 0.07 15.64 GreedySubsetMatrixFactorization(3): chmod_syscalls_per_sec, memory_mean, clone_syscalls_per_sec
0.71 0.07 15.63 GreedySubsetMatrixFactorization(4): chmod_syscalls_per_sec, memory_me

# What about new-system-and-benchmark?

- **Given** integer N and workloads x benchmark matrix
- **Select** N workloads
- **Given** new system's log slowdown ratio on N selected workloads and features of new workload
- **Predict** new system's log slowdown ratio on new workload

Not implemented yet. Maybe won't be ever. Who would be picking a new system and new benchmark at the same time?

In [None]:
# Ignore this clase

class NewSystemAndBenchmarkProblem(abc.ABC):

    @abc.abstractmethod
    def select_benchmarks(
        self,
        k: int,
        systems_by_benchmarks: numpy.typing.NDArray,
        benchmarks_by_features: numpy.typing.NDArray,
    ) -> tuple[list[int], numpy.float64]:
        """
        k: number of benchmarks to select
        systems_by_benchmarks: array where the (i,j)th element is the log of the ith system's slowdown on the jth benchmark
        benchmarks_by_features: array where the (j,m)th element is the mth feature of the jth benchmark


        returns a tuple containing:
          - k benchmarks to select
          - the log-liklihood
        
        Liklihood is the probability of observing this data given the parameters you inferred
        Used to compute the Akaike Information Criterion.
        Return numpy.NaN if you just don't care.     
        """
        pass

    @abc.abstractmethod
    def n_parameters(self) -> int:
        """
        Returns the number of parameters used to make this estimation.
        Used to calculate the Akaike Information Criterion.
        """

    @abc.abstractmethod
    def predict_new_system_on_new_benchmarks(
        self,
        new_system_by_selected_benchmarks: numpy.typing.NDArray,
        selected_benchmarks_by_features: numpy.typing.NDArray,
        new_benchmarks_by_features: numpy.typing.NDArray,
    ) -> numpy.typing.NDArray:
        """
        new_system_by_selected_benchmarks: array where the qth element is the log of the new system's slowdown on the qth selected benchmark
        selected_benchmarks_by_features: array where the (q,m)th element is the mth feature of the qth selected benchmark
        new_benchmarks_by_features: array where the (p,m)th element is the mth feature of the pth new benchmark

        returns an array where the pth element is the log slowdown of the new system on the pth new benchmark
        """
        pass