# Find tokens in training data

This script finds the origin of tokens in training data. It is set up for the StarCoder2 model

In [1]:
import datasets
from ipywidgets import interact

In [2]:
ds = datasets.load_dataset("bigcode/the-stack-march-sample-special-tokens-stripped")

In [3]:

phrases = ['ittrLoremipum','lcsStatusWlan','ucMZQg','mJNUZLIEMNV','GQGantt','BjKPZFq','INCEXPRS','ELABSCOPES',' cKVisitor','tableOBJECT',' simpleIndexQueryParserTests', 'trimBalanco', 'awsAwsjson', 'Baseldytsch', 'Ostschwizertütsch',' BibleDownload', 'ProrrateoImpor','PoliticaProducto']
finds = {p: [] for p in phrases}
for r in ds['train']:
    for p in phrases:
        if p in r['content']:
            finds[p].append(r)


In [4]:
hashes = {p: hex(abs(sum(hash(s['content']) for s in finds[p])))[2:8] for p in phrases}
phrases = sorted(phrases, key=lambda p: (len(finds[p]),hashes[p]))
# group phrases derived from the same docs
for p in phrases:
    count = sum(s['content'].count(p) for s in finds[p])
    print(f"Phrase {repr(p):<40} found in {len(finds[p])}/{len(ds['train'])} samples\t  hash {hashes[p]}\t  {count} occurrences")


Phrase 'Baseldytsch'                            found in 1/746856 samples	  hash 130ed4	  8826 occurrences
Phrase 'Ostschwizertütsch'                      found in 1/746856 samples	  hash 130ed4	  2864 occurrences
Phrase 'GQGantt'                                found in 1/746856 samples	  hash 13874b	  1101 occurrences
Phrase 'PoliticaProducto'                       found in 1/746856 samples	  hash 20bbf3	  1048 occurrences
Phrase 'INCEXPRS'                               found in 1/746856 samples	  hash 342c42	  1024 occurrences
Phrase 'ELABSCOPES'                             found in 1/746856 samples	  hash 342c42	  1640 occurrences
Phrase ' cKVisitor'                             found in 1/746856 samples	  hash 3781c4	  884 occurrences
Phrase ' BibleDownload'                         found in 1/746856 samples	  hash 37e3f4	  1318 occurrences
Phrase 'ittrLoremipum'                          found in 1/746856 samples	  hash 48a2db	  965 occurrences
Phrase 'ucMZQg'                        

In [5]:
@interact(i=(1,11), phrase=phrases)
def show_samples(phrase, i=1):
    if i > len(finds[phrase]):
        print("No more samples")
        return
    content = finds[phrase][i-1]['content']
    lines = content.count('\n') + 1
    print(f"Showing sample #{i}/{len(finds[phrase])} for {phrase}, which appears {content.count(phrase)} times in {lines} lines\n-----------------------\n{content}")


interactive(children=(Dropdown(description='phrase', options=('Baseldytsch', 'Ostschwizertütsch', 'GQGantt', '…