In [11]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Download data

In [2]:
# Load the "python" configuration of the `code_search_net` dataset.
raw_datasets = load_dataset("code_search_net", "python")

Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [3]:
# Inspect the training split: its column names and number of rows.
raw_datasets["train"]

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})

In [5]:
# Print the full function source stored for one arbitrarily chosen training row.
print(raw_datasets["train"][21345]["whole_func_string"])

def _send(self, line):
        """
        Write a line of data to the server.

        Args:
        line -- A single line of data to write to the socket.

        """
        if not line.endswith('\r\n'):
            if line.endswith('\n'):
                logger.debug('Fixing bare LF before sending data to socket')
                line = line[0:-1] + '\r\n'
            else:
                logger.debug(
                    'Fixing missing CRLF before sending data to socket')
                line = line + '\r\n'
        logger.debug('Client sent: ' + line.rstrip())
        self._socket.send(line)


# Prepare data

In [8]:
# Lazily walk the training split in slices of up to 1000 rows, yielding the
# "whole_func_string" column of each slice. Nothing is read until iteration,
# and the generator can only be iterated once.
training_corpus = (
    raw_datasets["train"][start : start + 1000]["whole_func_string"]
    for start in range(0, len(raw_datasets["train"]), 1000)
)

In [9]:
# A generator is exhausted after a single pass, so its construction is wrapped
# in a function that hands out a fresh one on every call.
def get_training_corpus(batch_size=1000, dataset=None):
    """Return a fresh generator over batches of training function strings.

    Args:
        batch_size: Maximum number of rows per yielded batch. Defaults to 1000.
        dataset: Sliceable, sized split to read from; every slice must be
            indexable by ``"whole_func_string"``. Defaults to
            ``raw_datasets["train"]``.

    Returns:
        A generator that yields
        ``dataset[start : start + batch_size]["whole_func_string"]`` for each
        consecutive ``start`` offset; the final batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail early and clearly instead of silently yielding nothing (negative
    # step) or surfacing an opaque ``range()`` error (zero step).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        dataset = raw_datasets["train"]
    return (
        dataset[start : start + batch_size]["whole_func_string"]
        for start in range(0, len(dataset), batch_size)
    )

In [10]:
# Build a fresh, not-yet-consumed generator over the training corpus.
training_corpus = get_training_corpus()

# Train a new tokenizer

In [12]:
# Load the tokenizer of the "gpt2" checkpoint; it is the baseline for the
# comparisons below and the starting point for training the new tokenizer.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [13]:
# Small Python snippet used to compare the old and the new tokenizer.
example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

In [15]:
# Show the snippet as plain text, i.e. exactly what will be tokenized.
print(example)

def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b


In [28]:
# Tokenize the snippet with the original "gpt2" tokenizer.
tokens1 = old_tokenizer.tokenize(example)

In [29]:
# In the output, each indentation space shows up as its own 'Ġ' token.
print(tokens1)

['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [19]:
# Train a new tokenizer with a target vocabulary of 52,000 entries, starting
# from the old tokenizer. A fresh corpus generator is built on every run of
# this cell: a generator is exhausted after one pass, so reusing the
# module-level `training_corpus` on a re-run would train on no data at all.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(), vocab_size=52000
)






In [30]:
# Tokenize the same snippet with the newly trained tokenizer.
tokens2 = tokenizer.tokenize(example)

In [31]:
# In the output, a newline plus indentation is now a single token such as 'ĊĠĠĠ'.
print(tokens2)

['def', 'Ġadd', '_', 'numbers', '(', 'a', ',', 'Ġb', '):', 'ĊĠĠĠ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`."""', 'ĊĠĠĠ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']


In [35]:
# Token count for the snippet with the original tokenizer.
len(tokens1)

36

In [36]:
# Token count for the same snippet with the newly trained tokenizer.
len(tokens2)

27

In [38]:
# How does the code-trained tokenizer handle ordinary English prose?
# A dedicated name is used so the code snippet held in `example` is not
# clobbered if the earlier tokenization cells are re-run after this one.
english_example = "How often do you eat breakfast in the morning?"
print(tokenizer.tokenize(english_example))

['How', 'Ġoften', 'Ġdo', 'Ġyou', 'Ġeat', 'Ġbreak', 'fast', 'Ġin', 'Ġthe', 'Ġmor', 'ning', '?']


In [39]:
# Vocabulary size of the newly trained tokenizer.
tokenizer.vocab_size

52000

In [40]:
# Vocabulary size of the original "gpt2" tokenizer, for comparison.
old_tokenizer.vocab_size

50257

In [42]:
# A second, class-based snippet to tokenize. In the cells shown, the text is
# only passed to the tokenizer and never executed.
# NOTE(review): `self.weights` in __call__ does not match the `self.weight`
# attribute assigned in __init__ — looks like a typo in the sample text. The
# recorded tokenizer output reflects the string as written.
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """

In [43]:
# Tokenize the class snippet with the newly trained tokenizer.
print(tokenizer.tokenize(example))

['class', 'ĠLinear', 'Layer', '():', 'ĊĠĠĠ', 'Ġdef', 'Ġ__', 'init', '__(', 'self', ',', 'Ġinput', '_', 'size', ',', 'Ġoutput', '_', 'size', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'randn', '(', 'input', '_', 'size', ',', 'Ġoutput', '_', 'size', ')', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'bias', 'Ġ=', 'Ġtorch', '.', 'zeros', '(', 'output', '_', 'size', ')', 'ĊĊĠĠĠ', 'Ġdef', 'Ġ__', 'call', '__(', 'self', ',', 'Ġx', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'bias', 'ĊĠĠĠĠ']


In [44]:
# Write the trained tokenizer's files to a directory given relative to the
# current working directory; the call returns the paths of the files written.
tokenizer.save_pretrained("../temp/code-search-net-tokenizer")

('../temp/code-search-net-tokenizer/tokenizer_config.json',
 '../temp/code-search-net-tokenizer/special_tokens_map.json',
 '../temp/code-search-net-tokenizer/vocab.json',
 '../temp/code-search-net-tokenizer/merges.txt',
 '../temp/code-search-net-tokenizer/added_tokens.json',
 '../temp/code-search-net-tokenizer/tokenizer.json')