# The Language Model

## 1. Demonstration

### Source Rewriter

Given the following example code (the `memset_kernel` from the Rodinia streamcluster benchmark):

In [1]:
code = """//#define Elements
__kernel void memset_kernel(__global char * mem_d, short val, int number_bytes){
    const int thread_id = get_global_id(0);
    mem_d[thread_id] = val;
}"""

print(code)

//#define Elements
__kernel void memset_kernel(__global char * mem_d, short val, int number_bytes){
    const int thread_id = get_global_id(0);
    mem_d[thread_id] = val;
}


We apply the rewriter. Variable and function names are normalized, comments removed, and code style enforced:

In [2]:
from clgen import preprocess

rewritten = preprocess(code)
print(rewritten)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


__kernel void A(__global char* a, short b, int c) {
  const int d = get_global_id(0);
  a[d] = b;
}


### Source Encoder

Derive a 1-of-$k$ vocabulary for a piece of code using a hybrid character- and token-based approach:

In [3]:
from clgen._atomizer import GreedyAtomizer
from clgen._langs import Language


atomizer = GreedyAtomizer.from_text(lang=Language.from_str("opencl"), text=rewritten)
print(atomizer)

GreedyAtomizer[27 tokens]


The derived vocabulary maps tokens to indices:

In [4]:
import pandas as pd

pd.DataFrame(sorted([f"'{k}'" for k in atomizer.vocab]), columns=["token"])

Unnamed: 0,token
0,'\n'
1,' '
2,' '
3,'('
4,')'
5,'*'
6,"','"
7,'0'
8,';'
9,'='


Encoding the source using this vocabulary yields:

In [5]:
encoded = atomizer.atomize(rewritten)
print(encoded)

[14  1 24  1 10  3 13  1 18  5  1 15  6  1 23  1 16  6  1 22  1 17  4  1
 25  0  2 19  1 22  1 20  1  9  1 21  3  7  4  8  0  2 15 11 20 12  1  9
  1 16  8  0 26]


Reversing the process:

In [6]:
for i in encoded:
    t = atomizer.deatomize([i])
    if t == '\n': t = '\\n'
    print(f"<{t}>", end="")

<__kernel>< ><void>< ><A><(><__global>< ><char><*>< ><a><,>< ><short>< ><b><,>< ><int>< ><c><)>< ><{><\n><  ><const>< ><int>< ><d>< ><=>< ><get_global_id><(><0><)><;><\n><  ><a><[><d><]>< ><=>< ><b><;><\n><}>

### Padding

Sequences are padded to a fixed length using an out-of-vocabulary token:

In [7]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

pad_val = atomizer.vocab_size
print(pad_sequences([encoded], maxlen=len(encoded) + 22, value=pad_val)[0])

[27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 14  1
 24  1 10  3 13  1 18  5  1 15  6  1 23  1 16  6  1 22  1 17  4  1 25  0
  2 19  1 22  1 20  1  9  1 21  3  7  4  8  0  2 15 11 20 12  1  9  1 16
  8  0 26]


### Setup

In [8]:
def encode_srcs(srcs, maxlen: int = 1024):
    """
    Encode and pad source code for learning.

    Every source string is tokenized with the module-level ``atomizer`` and
    the resulting index sequences are brought to a common length with Keras'
    ``pad_sequences``.

    Parameters
    ----------
    srcs : iterable of str
        Source code strings to encode.
    maxlen : int, optional
        Length of every returned sequence. Defaults to 1024, the input
        length the DeepTune model in this file is built with.

    Returns
    -------
    np.ndarray
        2-D array with one row of token indices per source. An empty
        ``srcs`` yields an array with zero rows instead of raising.
    """
    from keras.preprocessing.sequence import pad_sequences

    # turn each source string into a sequence of vocabulary indices
    seqs = [atomizer.atomize(src) for src in srcs]
    # the vocabulary size itself is used as the out-of-vocabulary padding token
    pad_val = atomizer.vocab_size
    # pad_sequences already returns one 2-D array, so no re-stacking is needed
    return np.asarray(pad_sequences(seqs, maxlen=maxlen, value=pad_val))

def grewe_features(df: pd.DataFrame) -> np.array:
    """
    Build the Grewe et al. feature matrix from runtime data.

    Reads the "transfer", "comp", "mem", "coalesced", "localmem" and "wgsize"
    columns of ``df`` and returns an array with one row per dataframe row and
    four columns:

    * F1 = transfer / (comp + mem)
    * F2 = coalesced / mem
    * F3 = (localmem / mem) * wgsize
    * F4 = comp / mem
    """
    transfer = df["transfer"].values
    comp = df["comp"].values
    mem = df["mem"].values
    coalesced = df["coalesced"].values
    localmem = df["localmem"].values
    wgsize = df["wgsize"].values

    f1 = transfer / (comp + mem)
    f2 = coalesced / mem
    f3 = (localmem / mem) * wgsize
    f4 = comp / mem

    # stack as (4, n), then transpose so that each row is one feature vector
    return np.array([f1, f2, f3, f4]).T

def auxiliary_inputs(df: pd.DataFrame) -> np.array:
    """
    Collect the auxiliary model inputs.

    Returns an array with one row per dataframe row and two columns: the
    "transfer" column (used as the data size) followed by "wgsize".
    """
    transfer = df["transfer"].values
    wgsize = df["wgsize"].values
    # stack as (2, n), then transpose to get one (transfer, wgsize) pair per row
    return np.array([transfer, wgsize]).T

from typing import List

def platform2str(platform: str) -> str:
    """
    Get the full platform name.

    Parameters
    ----------
    platform : str
        Short platform key, either "amd" or "nvidia".

    Returns
    -------
    str
        The display name of the device for that platform.

    Raises
    ------
    LookupError
        If ``platform`` is not a known key.
    """
    if platform == "amd":
        return "AMD Tahiti 7970"
    elif platform == "nvidia":
        return "NVIDIA GTX 970"
    else:
        # LookupError is the builtin; the name used previously did not exist
        # and surfaced as a NameError instead.
        raise LookupError(f"unknown platform: {platform!r}")

def encode_1hot(y: np.array) -> np.array:
    """
    1-hot encode binary labels.

    Each label becomes a pair ``(not label, label)``, stored as int32, so a
    label of 0 maps to ``[1, 0]`` and a label of 1 maps to ``[0, 1]``.
    """
    # give every label its own row so the first column can be read uniformly
    stacked = np.vstack([np.expand_dims(label, axis=0) for label in y])

    pairs = []
    for row in stacked:
        positive = row[0]
        pairs.append((not positive, positive))
    return np.array(pairs, dtype=np.int32)

def escape_benchmark_name(g: str) -> str:
    """
    Format a benchmark name for display.

    The name is split on '-'. The result is the first word of the formatted
    suite name, a dot, and the second-to-last dash-separated field.
    """
    parts = g.split('-')
    suite_label = escape_suite_name(parts[0]).split()[0]
    return f"{suite_label}.{parts[-2]}"

def escape_suite_name(g: str) -> str:
    """
    Format a benchmark suite name for display.

    Only the text before the first '-' is considered. "amd" and "nvidia" are
    upper-cased and suffixed with " SDK"; "npb" and "shoc" are upper-cased;
    "parboil", "polybench" and "rodinia" are capitalized. Anything else
    raises LookupError.
    """
    suite = g.split('-')[0]

    if suite in ("amd", "nvidia"):
        return suite.upper() + " SDK"
    if suite in ("npb", "shoc"):
        return suite.upper()
    if suite in ("parboil", "polybench", "rodinia"):
        return suite.capitalize()
    raise LookupError

In [9]:
import numpy as np
from clgen import _atomizer as clgen

class HeterogemeousMappingModel:
    """
    Base interface for models that predict OpenCL heterogeneous device
    mappings.

    Concrete models subclass this and override the methods below. Apart from
    init(), every method raises NotImplementedError until overridden.

    NOTE: the class name is misspelled ("Heterogemeous"); it is kept as-is
    because other cells refer to it by this name.

    Attributes
    ----------
    __name__ : str
        Human-readable model name.
    __basename__ : str
        Short form of the name, used when building file names.
    """
    __name__ = None
    __basename__ = None

    def init(self, seed: int) -> None:
        """
        Prepare a fresh, untrained model.

        Intended to run before training and predicting. Subclasses that need
        no setup can leave this default in place; it does nothing.

        Parameters
        ----------
        seed : int
            Seed value for reproducible results. May be 'None' to indicate
            that no seed is to be used.
        """
        return None

    def save(self, outpath: str) -> None:
        """
        Write the model state to disk.

        Everything needed to use the model again later must be captured.
        How the state is stored is up to the implementing class.

        Parameters
        ----------
        outpath : str
            Destination path for the saved state.
        """
        raise NotImplementedError

    def restore(self, inpath: str) -> None:
        """
        Load a previously trained model from disk.

        Meant as the alternative to init() when a saved model file is
        available; it must bring back all required model state.

        Parameters
        ----------
        inpath : str
            Path to read from; the same path that was given to save().
        """
        raise NotImplementedError

    def train(self, df: pd.DataFrame, features: np.array, sequences: np.array,
              y: np.array, y_1hot: np.array, verbose: bool = False) -> None:
        """
        Fit the model to training data.

        Parameters
        ----------
        df : pd.DataFrame
            The platform dataframe.

        features : np.array
            Feature vectors, shape (n,4).

        sequences : np.array
            Encoded source code sequences, shape (n,seq_length).

        y : np.array
            Optimal device mappings, shape (n,1).

        y_1hot : np.array
            Optimal device mappings in 1-hot encoding, shape (n,2).

        verbose : bool, optional
            Whether to print verbose status messages during training.
        """
        raise NotImplementedError

    def predict(self, features: np.array, sequences: np.array, y: np.array,
                y_1hot: np.array, verbose: bool = False) -> np.array:
        """
        Predict device mappings for programs.

        Parameters
        ----------
        features : np.array
            Feature vectors, shape (n,4).

        sequences : np.array
            Encoded source code sequences, shape (n,seq_length).

        y : np.array
            Optimal device mappings, shape (n,1).

        y_1hot : np.array
            Optimal device mappings in 1-hot encoding, shape (n,2).

        verbose : bool, optional
            Whether to print verbose status messages.

        Returns
        -------
        np.array
            Predicted 'y' values (optimal device mappings), shape (n,1).
        """
        raise NotImplementedError

In [10]:
class DeepTune(HeterogemeousMappingModel):
    """
    DeepTune model built from a convolutional language model over encoded
    source sequences plus two auxiliary inputs.

    The Keras model has two outputs: the heuristic (device mapping) output
    and the language model's own output. Both are trained against the same
    1-hot labels; predictions are read from the heuristic output only.
    """
    __name__ = "DeepTune"
    __basename__ = "deeptune"

    def init(self, seed: int):
        """
        Build and compile the Keras model, storing it in ``self.model``.

        Reads the module-level ``atomizer`` to size the embedding layer.

        Parameters
        ----------
        seed : int
            Passed to ``np.random.seed`` before any layer is created.

        Returns
        -------
        DeepTune
            ``self``.
        """
        from keras.layers import Input, Embedding, LSTM, Dense, Conv1D, GlobalMaxPooling1D
        from keras.layers.merge import Concatenate
        from keras.layers.normalization import BatchNormalization
        from keras.models import Model

        np.random.seed(seed)

        seq_length = 1024

        # Language model. Takes as inputs source code sequences.
        # The embedding has one extra entry for the padding token.
        code_in = Input(shape=(seq_length,), dtype="int32", name="code_in")
        embedded = Embedding(input_dim=atomizer.vocab_size + 1,
                             input_length=seq_length,
                             output_dim=64,
                             name="embedding")(code_in)
        # Disabled alternatives to the convolutional stack:
        # x = LSTM(64, implementation=1, return_sequences=True, name="lstm_1")(x)
        # x = LSTM(64, implementation=1, name="lstm_2")(x)
        conv = Conv1D(32, 9, name="conv_1")(embedded)
        conv = Conv1D(32, 9, name="conv_2")(conv)
        # x = Conv1D(32, 9, name="conv_3")(x)
        pooled = GlobalMaxPooling1D()(conv)
        langmodel_out = Dense(2, activation="sigmoid")(pooled)

        # Auxiliary inputs. wgsize and dsize.
        aux_in = Input(shape=(2,))

        # Heuristic model. Combines the auxiliary inputs with the pooled
        # language-model features and outputs a 1-hot encoded device mapping.
        merged = Concatenate()([aux_in, pooled])
        merged = BatchNormalization()(merged)
        hidden = Dense(32, activation="relu")(merged)
        out = Dense(2, activation="sigmoid")(hidden)

        self.model = Model(inputs=[aux_in, code_in],
                           outputs=[out, langmodel_out])
        self.model.compile(
            optimizer="adam",
            metrics=['accuracy'],
            loss=["categorical_crossentropy", "categorical_crossentropy"],
            loss_weights=[1., .2],
        )

        return self

    def save(self, outpath):
        """Save the Keras model to ``outpath``."""
        self.model.save(outpath)

    def restore(self, inpath):
        """Replace ``self.model`` with the Keras model loaded from ``inpath``."""
        from keras.models import load_model
        self.model = load_model(inpath)

    def train(self, **train):
        """
        Fit the model for 50 epochs with a batch size of 64.

        Reads the "aux_in", "sequences", "y_1hot" and "verbose" keyword
        arguments; any other keyword arguments are ignored.
        """
        inputs = [train["aux_in"], train["sequences"]]
        # both model outputs are trained against the same labels
        targets = [train["y_1hot"], train["y_1hot"]]
        self.model.fit(inputs, targets,
                       epochs=50, batch_size=64,
                       verbose=train["verbose"], shuffle=True)

    def _predicted_mappings(self, batch_size, inputs):
        """Run the model and return the argmax of the heuristic output per sample."""
        outputs = np.array(self.model.predict(
            [inputs["aux_in"], inputs["sequences"]],
            batch_size=batch_size, verbose=inputs["verbose"]))
        # outputs[0] is the heuristic output, the first entry of the model's outputs
        return [np.argmax(scores) for scores in outputs[0]]

    def predict(self, **test):
        """
        Predict device mappings using a batch size of 64.

        Reads the "aux_in", "sequences" and "verbose" keyword arguments and
        returns a list with one predicted class index per sample.
        """
        return self._predicted_mappings(64, test)

    def predict_demo(self, **test):
        """Same as predict(), but with a batch size of 1."""
        return self._predicted_mappings(1, test)

### Prediction

In [11]:
# Rebuild the vocabulary from every kernel source in the AMD dataset.
# This rebinds the module-level `atomizer`, which encode_srcs() and
# DeepTune.init() read. `clgen` here is the alias for `clgen._atomizer`
# imported in the cell above.
srcs = '\n'.join(pd.read_csv("../data/case-study-a/cgo17-amd.csv")['src'].values)
atomizer = clgen.GreedyAtomizer.from_text(lang=Language.from_str("opencl"), text=srcs)
atomizer

GreedyAtomizer[128 tokens]

In [12]:
seed = 204
deeptune_model = DeepTune()
deeptune_model.init(seed);





Using TensorFlow backend.



Instructions for updating:
keep_dims is deprecated, use keepdims instead

Instructions for updating:
keep_dims is deprecated, use keepdims instead



In [13]:
from labm8 import fs
import pickle

def evaluate(model: HeterogemeousMappingModel) -> pd.DataFrame:
    """
    Evaluate a model on one held-out split per platform.

    For each platform ("amd", "nvidia") the runtime data is split once into
    shuffled training rows and 25% test rows, using the module-level ``seed``.
    The model is re-initialised, trained on the training rows and saved; its
    predictions for the test rows are pickled to disk and compared against
    the oracle mapping.

    Reading back saved models or predictions is currently switched off, so
    every call retrains from scratch and overwrites the files on disk.

    Parameters
    ----------
    model : HeterogemeousMappingModel
        The predictive model to evaluate.

    Returns
    -------
    pd.DataFrame
        One row per test program, indexed from 1, with the columns "Model",
        "Platform", "Benchmark", "Benchmark Suite", "Oracle Mapping",
        "Predicted Mapping", "Correct?" and "Speedup". "Speedup" is the
        runtime of the per-platform baseline device divided by the runtime
        of the predicted device.
    """
    from sklearn.model_selection import train_test_split
    from progressbar import ProgressBar

    platforms = ["amd", "nvidia"]

    # Cache reads are disabled; set these to True to reuse files from a
    # previous run instead of retraining.
    load_cached_predictions = False
    load_cached_model = False

    # one progress step per platform
    progress = ProgressBar(max_value=len(platforms))

    data = []
    for step, platform in enumerate(platforms, start=1):
        platform_name = platform2str(platform)

        # load runtime data
        df = pd.read_csv(f"../data/case-study-a/cgo17-{platform}.csv")

        # values used for training & predictions
        features = grewe_features(df)
        aux_in = auxiliary_inputs(df)

        # optimal mappings: 1 for GPU, 0 for anything else
        y = np.array([1 if x == "GPU" else 0 for x in df["oracle"].values])
        y_1hot = encode_1hot(y)

        # A single shuffled train/test split is used here in place of the
        # stratified 10-fold cross-validation this code was derived from.
        indices = np.arange(len(features))
        train_index, test_index = train_test_split(
            indices, shuffle=True, random_state=seed, test_size=0.25)

        model_path = f"../data/case-study-a/models/{model.__basename__}-{platform}-demo.model"
        predictions_path = f"../data/case-study-a/predictions/{model.__basename__}-{platform}-demo.result"

        if load_cached_predictions and fs.exists(predictions_path):
            # load result from cache
            # NOTE: pickle is only safe on files written by this notebook.
            with open(predictions_path, 'rb') as infile:
                p = pickle.load(infile)
        else:
            # encoding is expensive, so it only happens when a model must run
            sequences = encode_srcs(df["src"].values)

            if load_cached_model and fs.exists(model_path):
                # restore trained model from cache
                model.restore(model_path)
            else:
                # train and cache a model
                model.init(seed=seed)
                model.train(df=df,
                            features=features[train_index],
                            aux_in=aux_in[train_index],
                            sequences=sequences[train_index],
                            y=y[train_index],
                            y_1hot=y_1hot[train_index],
                            verbose=True)
                fs.mkdir(fs.dirname(model_path))
                model.save(model_path)

            # test model
            p = model.predict(
                features=features[test_index],
                aux_in=aux_in[test_index],
                sequences=sequences[test_index],
                y=y[test_index],
                y_1hot=y_1hot[test_index],
                verbose=True)

        # cache results
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(p, outfile)

        # benchmarks
        benchmarks = df['benchmark'].values[test_index]
        # oracle device mappings
        o = y[test_index]
        # whether predictions were correct or not
        correct = np.asarray(p) == o
        # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA);
        # indexed by position, like every other column here
        zero_r_dev = "runtime_cpu" if platform == "amd" else "runtime_gpu"
        zero_r_runtimes = df[zero_r_dev].values[test_index]
        # speedups of predictions: column 0 is the CPU runtime, column 1 the
        # GPU runtime, so the predicted class index selects the runtime
        runtimes = df[['runtime_cpu', 'runtime_gpu']].values[test_index]
        p_runtimes = np.array([r[p_] for p_, r in zip(p, runtimes)])
        p_speedup = zero_r_runtimes / p_runtimes

        # sanity check
        assert len(benchmarks) == len(o) == len(correct) == len(p) == len(p_speedup)

        # record results
        for benchmark_, o_, p_, correct_, p_speedup_ in zip(benchmarks, o, p, correct, p_speedup):
            data.append({
                "Model": model.__name__,
                "Platform": platform_name,
                'Benchmark': escape_benchmark_name(benchmark_),
                'Benchmark Suite': escape_suite_name(benchmark_),
                "Oracle Mapping": o_,
                "Predicted Mapping": p_,
                "Correct?": correct_,
                "Speedup": p_speedup_,
            })

        # update progress bar
        progress.update(step)

    return pd.DataFrame(
        data, index=range(1, len(data)+1), columns=[
            "Model",
            "Platform",
            "Benchmark",
            "Benchmark Suite",
            "Oracle Mapping",
            "Predicted Mapping",
            "Correct?",
            "Speedup"
        ])

In [14]:
import sys
print("Evaluating DeepTune ...", file=sys.stderr)
deeptune = evaluate(deeptune_model)
# Select the aggregated columns with a list: indexing a GroupBy with several
# bare keys is deprecated in pandas, and 'Platform' is already a grouping key.
deeptune.groupby(['Platform', 'Benchmark Suite'])[['Correct?', 'Speedup']].mean()

Evaluating DeepTune ...


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Epoch 1/50


2022-12-14 14:18:33.048664: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-12-14 14:18:33.070232: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2808000000 Hz
2022-12-14 14:18:33.077378: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x66c4a00 executing computations on platform Host. Devices:
2022-12-14 14:18:33.077545: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  0% (0 of 20) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  after removing the cwd from sys.path.


Unnamed: 0_level_0,Unnamed: 1_level_0,Correct?,Speedup
Platform,Benchmark Suite,Unnamed: 2_level_1,Unnamed: 3_level_1
AMD Tahiti 7970,AMD SDK,0.714286,1.037145
AMD Tahiti 7970,NPB,0.787879,3.095299
AMD Tahiti 7970,NVIDIA SDK,0.75,3.736255
AMD Tahiti 7970,Parboil,0.833333,13.612841
AMD Tahiti 7970,Polybench,1.0,2.408824
AMD Tahiti 7970,Rodinia,0.285714,5.66123
AMD Tahiti 7970,SHOC,0.888889,0.979293
NVIDIA GTX 970,AMD SDK,0.571429,0.742213
NVIDIA GTX 970,NPB,0.833333,1.462055
NVIDIA GTX 970,NVIDIA SDK,0.75,1.107079


In [15]:
# def predict(self, **test):
#         p = np.array(self.model.predict(
#             [test["aux_in"], test["sequences"]], batch_size=64, verbose=test["verbose"]))
#         indices = [np.argmax(x) for x in p[0]]
#         return indices

In [21]:
# Encode the rewritten example kernel from the demonstration section as a
# one-element batch. `srcs` is first a list of source strings and is then
# rebound to the padded array returned by encode_srcs().
srcs = [rewritten]
srcs = encode_srcs(srcs)

# Take the auxiliary inputs from row 630 of the AMD runtime data; the double
# brackets keep a one-row DataFrame, which the helpers below index by column.
df = pd.read_csv(f"../data/case-study-a/cgo17-amd.csv")
df_row = df.iloc[[630]]

# NOTE(review): `features` is computed but not passed to predict_demo() below.
features = grewe_features(df_row)
aux_in = auxiliary_inputs(df_row)

# Predict with the model trained by evaluate(); p holds one class index per sample.
p = deeptune_model.predict_demo(aux_in=aux_in, sequences=srcs, verbose=True)
# class index -> device name (evaluate() encodes a "GPU" oracle as 1, anything else as 0)
mapping = {
    0: "CPU",
    1: "GPU"
}
print("Predicted:", mapping[p[0]])

Predicted: CPU


In [22]:
print(df_row)

     Unnamed: 0                                benchmark  dataset  comp  \
630         630  rodinia-3.1-streamcluster-memset_kernel  default     0   

     rational  mem  localmem  coalesced  atomic  transfer  wgsize oracle  \
630         0    1         0          1       0  80543744     256    CPU   

     runtime_cpu  runtime_gpu  \
630   777.894567  1141.490884   

                                                   src  \
630  __kernel void A(__global char* a, short b, int...   

                                seq  
630  [129 129 129 ...,  26   0 127]  
