# The Language Model

## 1. Demonstration

### Source Rewriter

Given the following example code (taken from Nvidia’s streamcluster benchmark):

In [None]:
# OpenCL kernel source that the following cells rewrite and encode.
code = """//#define Elements
__kernel void memset_kernel(__global char * mem_d, short val, int number_bytes){
    const int thread_id = get_global_id(0);
    mem_d[thread_id] = val;
}"""

print(code)

We apply the rewriter. Variable and function names are normalized, comments removed, and code style enforced:

In [None]:
from clgen import preprocess

# Run the clgen rewriter over the example kernel; later cells encode `rewritten`.
rewritten = preprocess(code)
print(rewritten)

### Source Encoder

Deriving a 1-of-$k$ vocabulary for a piece of code, using a hybrid character- and token-based approach:

In [None]:
from clgen._atomizer import GreedyAtomizer
from clgen._langs import Language


# Build an atomizer whose vocabulary is derived from the rewritten example alone.
atomizer = GreedyAtomizer.from_text(lang=Language.from_str("opencl"), text=rewritten)
print(atomizer)

The derived vocabulary maps tokens to indices:

In [None]:
import pandas as pd

# Wrap every vocabulary entry in single quotes, sort the quoted strings, and
# show them as a one-column table (the final expression is the cell output).
quoted_tokens = sorted(f"'{tok}'" for tok in atomizer.vocab)
pd.DataFrame(quoted_tokens, columns=["token"])

Encoding the source using this vocabulary yields:

In [None]:
# Encode the rewritten source with the vocabulary derived above.
encoded = atomizer.atomize(rewritten)
print(encoded)

Reversing the process:

In [None]:
# Decode one encoded element at a time and bracket each decoded piece so the
# boundaries between tokens are visible in the printed output.
for index in encoded:
    token = atomizer.deatomize([index])
    # Show a newline token as the two characters "\n" instead of breaking the line.
    shown = '\\n' if token == '\n' else token
    print(f"<{shown}>", end="")

### Padding

Sequences are padded to a fixed length using an out-of-vocabulary token:

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The vocabulary size is used as the padding value.
pad_val = atomizer.vocab_size
# Pad the single encoded sequence out to 22 elements beyond its own length.
padded_length = len(encoded) + 22
padded = pad_sequences([encoded], maxlen=padded_length, value=pad_val)
print(padded[0])

## 2. Deriving Vocabulary from Handwritten GPGPU Benchmarks

For the experiments in the paper, we derived a vocabulary from 45k lines of real-world handwritten GPGPU benchmarks:

In [None]:
import pandas as pd

# Join the `src` column of the benchmark table into one newline-separated text.
benchmark_sources = pd.read_csv("../data/case-study-a/cgo17-amd.csv")['src'].values
srcs = '\n'.join(benchmark_sources)
# One more line than there are newline characters.
print("lines of code:", srcs.count('\n') + 1)

# Derive the vocabulary from the combined benchmark sources.
derived_atomizer = GreedyAtomizer.from_text(text=srcs, lang=Language.from_str("opencl"))
print("derived vocabulary:", derived_atomizer)

In [None]:
# Limit the rendered table to 10 rows.
pd.set_option('display.max_rows', 10)
# Quote, sort and tabulate the derived vocabulary (the final expression is the
# cell output).
derived_tokens = sorted(f"'{tok}'" for tok in derived_atomizer.vocab)
pd.DataFrame(derived_tokens, columns=["token"])

Using this derived vocabulary, let's plot the first 80 tokens of 12 real-world GPU benchmarks:

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from labm8 import viz

natoms = 80  # number of leading atoms plotted per program
nprog = 12   # number of programs plotted

# Parse the benchmark table once and reuse it for both the sources and the
# row labels, instead of reading the same CSV file twice.
benchmarks = pd.read_csv("../data/case-study-a/cgo17-amd.csv")

# One row per program: the first `natoms` encoded atoms of its source.
data = [derived_atomizer.atomize(src)[:natoms]
        for src in benchmarks['src'].values[:nprog]]
# Row labels: the part of each benchmark name after its last "-".
kernels = [name.split("-")[-1]
           for name in benchmarks['benchmark'].values[:nprog]]
# The reshape needs exactly nprog * natoms values, i.e. at least `nprog` rows in
# the table and at least `natoms` atoms in every selected source.
data = np.reshape(data, (nprog, natoms))

ax = sns.heatmap(data, vmin=0, vmax=derived_atomizer.vocab_size, square=True,
                 cbar=False, yticklabels=kernels,
                 xticklabels=[str(x) for x in range(1, natoms + 1)])
# Derive the count in the title from `natoms` so the two cannot drift apart.
plt.title(f"Encoded GPU benchmark sources (first {natoms} tokens)")
viz.finalise(figsize=(15, 5))

Notice that, as a result of the rewriting process, each of the kernels starts in an identical manner, i.e. `__kernel void A(...`

## 3. Reproduce LaTeX Tables from the Paper

Order the atoms by their appearance in the example code:

In [None]:
# Collect the distinct atom indices of the example in order of first appearance
# (dict keys keep insertion order and ignore repeats).
first_seen = dict.fromkeys(derived_atomizer.atomize(rewritten))

# Each entry is (vocabulary index, decoded token, 1-based order of appearance).
ordered = [
    (vocab_idx, derived_atomizer.decoder[vocab_idx], rank)
    for rank, vocab_idx in enumerate(first_seen, start=1)
]

Table 3c:

In [None]:
# Open the LaTeX table: a six-column tabular, i.e. three (idx, token) column pairs.
print("""
\\footnotesize
\\begin{tabular}{l l | l l | l l}
  \\toprule\
""")
# Heading row: the "idx & token" heading repeated once per column pair.
print(" ", " & ".join(["\\textbf{idx} & \\textbf{token}"] * 3), "\\\\")
print("  \\midrule")

def escape(x):
    """Escape a token so it can be typeset inside a LaTeX table cell.

    Every LaTeX-special character is replaced wherever it occurs in the token,
    not only when the token consists of that single character. In particular
    ``&`` is escaped, because a raw ``&`` would start a new column in the
    surrounding tabular row.

    Args:
        x: The token text.

    Returns:
        The escaped text. A newline character is rendered as a visible
        backslash followed by the letter n.
    """
    replacements = {
        '\n': '\\textbackslash n',
        '\\': '\\textbackslash{}',
        '{': '\\{',
        '}': '\\}',
        '_': '\\_',
        '&': '\\&',
        '%': '\\%',
        '$': '\\$',
        '#': '\\#',
        '^': '\\textasciicircum{}',
        '~': '\\textasciitilde{}',
    }
    # translate() maps each input character exactly once, so the backslashes
    # and braces introduced by one replacement are never escaped again.
    return x.translate(str.maketrans(replacements))

# Lay the ordered vocabulary out column-major across three (idx, token) column
# pairs, matching the six-column tabular opened by this cell. The row count is
# derived from len(ordered), so no entry is silently dropped when the
# vocabulary does not have exactly 27 entries.
ncols = 3
nrows = -(-len(ordered) // ncols)  # ceiling division
for row in range(nrows):
    cells = []
    for col in range(ncols):
        pos = col * nrows + row
        if pos < len(ordered):
            entry = ordered[pos]
            # entry is (vocabulary index, token, order of appearance); print the
            # stored order of appearance rather than recomputing it.
            token, rank = entry[1], entry[2]
            cells.append(f"\\texttt{{{rank}}} & \\texttt{{`{escape(token)}'}}")
        else:
            # Empty (idx, token) pair keeps the column count constant when the
            # last column runs short.
            cells.append("&")
    print("  " + " & ".join(cells) + " \\\\")

print("""\
  \\bottomrule
\\end{tabular}""")

Table 3d:

In [None]:
# Map each vocabulary index to its order of appearance in the example code.
translator = {vocab_idx: rank for vocab_idx, _, rank in ordered}

# Open an 11-column LaTeX table; the trailing indent positions the first row.
print("""\
\\rowcolors{2}{white}{gray!25}
\\footnotesize
\\begin{tabular}{l l l l l l l l l l l}
  \\toprule
  """, end="")

# Emit the encoded example as zero-padded two-digit cells, 11 per table row.
row_width = 11
for position, idx in enumerate(derived_atomizer.atomize(rewritten), start=1):
    # After every 11th cell end the table row and indent the next one;
    # otherwise move on to the next column.
    separator = " \\\\\n  " if position % row_width == 0 else " & "
    print(f"\\texttt{{{translator[idx]:02d}}}", end=separator)

# Final cell standing in for the padding, then close the table.
print("""\
\\multicolumn{2}{l}{\\texttt{<pad\\ldots}>} \\\\""")
print("""\
  \\bottomrule
\\end{tabular}""")


End of experiments.