In [1]:
import multiprocessing
import os
from glob import glob
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    AutoConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

In [2]:
# Worker budget: every core but one, and never fewer than one.
num_cores_avail = max(multiprocessing.cpu_count() - 1, 1)

# Data

In [3]:
def any_keyword_in_string(string, keywords):
    """Report whether at least one keyword occurs as a substring of ``string``.

    Args:
        string: Text to search.
        keywords: Iterable of substrings to look for.

    Returns:
        True as soon as one keyword is found in ``string``; False when none
        match (including when ``keywords`` is empty).
    """
    return any(candidate in string for candidate in keywords)

In [4]:
# Keywords to look for in source files, plus two sample strings:
# example_1 contains none of the keywords, example_2 contains "pandas".
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

In [5]:
# Sanity-check the helper on both sample strings.
print(
    any_keyword_in_string(example_1, filters),
    any_keyword_in_string(example_2, filters)
)

False True


In [6]:
def filter_streaming_dataset(dataset, filters):
    """Keep only the samples whose ``content`` mentions one of the keywords.

    The dataset is consumed once, front to back. Matching samples are
    accumulated column-wise and the share of samples kept is printed.

    Args:
        dataset: Iterable of dict-like samples; each must have a ``"content"``
            entry holding the text to search.
        filters: Keywords forwarded to ``any_keyword_in_string``.

    Returns:
        A ``Dataset`` built from the columns of the matching samples.
    """
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(dataset):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    # Indexing the defaultdict also guarantees a "content" column exists even
    # when nothing matched, as in the original implementation.
    kept = len(filtered_dict["content"])
    if total:
        # Format the ratio directly as a whole percentage; multiplying a
        # rounded float by 100 can print artifacts like 7.000000000000001.
        print(f"{kept / total:.0%} of data after filtering")
    else:
        # An empty input used to raise ZeroDivisionError here.
        print("0 samples seen; nothing to filter")
    return Dataset.from_dict(filtered_dict)

In [7]:
# Hub repo prefix for the raw data; the split name is appended when loading.
dataset_checkpoint = "transformersbook/codeparrot"
# Commit hash passed as `revision` so the same dataset files are requested on every run.
dataset_commit_id = "0933803eb0f5956b2da9d2d7b6805fa31b18a6c8"

In [8]:
# Open the raw split in streaming mode, pinned to the commit above.
# The repo id is built as "<dataset_checkpoint>-<split>".
split = "train"
data = load_dataset(f"{dataset_checkpoint}-{split}", revision=dataset_commit_id, split=split, streaming=True)

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/183 [00:00<?, ?it/s]

In [9]:
# Skip this because it takes quite a while
# filtered_data = filter_streaming_dataset(data, filters)

In [10]:
# Hub repo prefix of the prepared dataset; "-train" / "-valid" are appended when loading.
prepped_dataset_checkpoint = "huggingface-course/codeparrot-ds"

In [11]:
# Load the prepared train and validation repos (no streaming requested here).
ds_train = load_dataset(f"{prepped_dataset_checkpoint}-train", split="train")
ds_valid = load_dataset(f"{prepped_dataset_checkpoint}-valid", split="validation")

In [12]:
# Bundle both splits in one container. Note the validation split is stored
# under the key "valid", not "validation".
raw_datasets = DatasetDict({
    "train": ds_train,
    "valid": ds_valid
})

In [13]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [14]:
# Subsample sizes for a quick run: 1% of the training rows and 20% of the
# validation rows (truncated to whole rows by int()).
n_train_samp = int(0.01 * raw_datasets["train"].num_rows)
n_valid_samp = int(0.2 * raw_datasets["valid"].num_rows)

In [15]:
# Build the mini dataset: shuffle each split with a fixed seed, then keep the
# first n rows of the shuffled order.
raw_datasets_mini = DatasetDict()
raw_datasets_mini["train"] = raw_datasets["train"].shuffle(seed=42).select(range(n_train_samp))
raw_datasets_mini["valid"] = raw_datasets["valid"].shuffle(seed=42).select(range(n_valid_samp))

In [16]:
raw_datasets_mini

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 6067
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 664
    })
})

In [17]:
# Preview the first training sample, one field per line, showing at most 200
# characters of each value. Values go through str() before slicing so a
# non-string field cannot raise TypeError; string fields print exactly as before.
for k, v in raw_datasets_mini["train"][0].items():
    print(f"{k.upper()}: {str(v)[:200]}")

REPO_NAME: ThomasMiconi/htmresearch
PATH: projects/feedback/feedback_sequences.py
COPIES: 2
SIZE: 26875
CONTENT: 
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2016, Numenta, Inc.  Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# follo
LICENSE: agpl-3.0


# Tokenization

In [18]:
# Tokenizer repo on the Hub and the commit hash passed as `revision` when loading it.
tokenizer_checkpoint = "huggingface-course/code-search-net-tokenizer"
tokenizer_commit_id = "2a84d6753fdeb105c5e2e9a6be952f119216a991"

In [19]:
# Tokens per chunk: used as max_length when tokenizing and passed as n_ctx
# to the model config further down.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, revision=tokenizer_commit_id)

In [20]:
# Demo on the first two training files: cap each chunk at context_length
# tokens, request the overflowing tokens as additional chunks, and request
# each chunk's length.
outputs = tokenizer(
    raw_datasets_mini["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True
)

In [21]:
# Summarize the demo tokenization: number of chunks, the length of each chunk,
# and which input sample each chunk came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 86
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 17, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 2]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [22]:
def tokenize(element):
    """Split a batch of source files into fixed-size token chunks.

    Every text in ``element["content"]`` is tokenized with the module-level
    ``tokenizer``, with overflow chunks requested. Only chunks whose reported
    length equals ``context_length`` are kept; all others are dropped.

    Args:
        element: Batch mapping with a ``"content"`` entry of texts.

    Returns:
        ``{"input_ids": [...]}`` holding the kept chunks. The number of chunks
        is generally different from the number of input texts.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for chunk_ids, n_tokens in zip(encoded["input_ids"], encoded["length"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

In [23]:
# On-disk location where the tokenized mini dataset is saved and reloaded from.
dataset_save_dir = "../temp/07/codeparrot-ds-tokenized-dataset"

In [24]:
# Tokenize once and cache the result on disk; later runs reload the cache.
# NOTE: only the directory's existence is checked, so a cache written with a
# different context_length or sample size is reused as-is. Delete the
# directory to force re-tokenization.
if not os.path.isdir(dataset_save_dir):
    tokenized_datasets = raw_datasets_mini.map(
        tokenize,
        batched=True,
        # Drop every original column so only the new "input_ids" remains.
        remove_columns=raw_datasets_mini["train"].column_names,
    )
    tokenized_datasets.save_to_disk(dataset_save_dir)
else:
    tokenized_datasets = load_from_disk(dataset_save_dir)

Map:   0%|          | 0/6067 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/165469 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19726 [00:00<?, ? examples/s]

In [25]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 165469
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 19726
    })
})

# Model

In [26]:
# Start from the "gpt2" configuration and override the vocabulary size and
# BOS/EOS token ids with the values of the tokenizer loaded above.
# NOTE(review): the chunk size is passed as n_ctx. Newer GPT2Config releases
# appear to size the position embeddings from n_positions instead — TODO
# confirm n_ctx has the intended effect in the installed transformers version.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [27]:
# Build the model from the config object alone (no from_pretrained call here),
# then count the elements across all of its parameters.
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())

In [28]:
print(f"GPT-2 size: {round(model_size / (1e6), 1)}M params")

GPT-2 size: 124.2M params


In [29]:
# Reuse the EOS token as the padding token, then build the batch collator
# with masked-language-modeling turned off (mlm=False).
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [30]:
# Collate the first five tokenized rows and print the shape of every entry
# the collator returns.
outputs = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for k, v in outputs.items():
    print(f"{k} shape: {v.shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


## Training

In [31]:
# Directory handed to TrainingArguments as output_dir.
model_output_dir = "../temp/07/codeparrot-ds-finetuned"

In [32]:
# Derive the step-based intervals from the amount of data actually trained on.
# The previous fixed values (evaluate/log/save every 5_000 steps, 1_000 warmup
# steps) fit the full dataset. One epoch over the mini dataset is far shorter
# (165,469 rows / (32 * 8) is about 646 optimizer steps), so nothing was ever
# evaluated, logged or checkpointed and the warmup never completed.
train_batch_size = 32
grad_accum_steps = 8
# Optimizer updates in one epoch. Assumes a single device (TODO: also divide
# by the device count when training on several GPUs).
steps_per_epoch = max(1, len(tokenized_datasets["train"]) // (train_batch_size * grad_accum_steps))
# About ten reports per epoch, capped at the original 5_000 / 1_000 so a
# full-size run keeps its previous schedule.
report_interval = max(1, min(5_000, steps_per_epoch // 10))
n_warmup_steps = max(1, min(1_000, steps_per_epoch // 10))

args = TrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=report_interval,
    logging_steps=report_interval,
    gradient_accumulation_steps=grad_accum_steps,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=n_warmup_steps,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=report_interval,
    # Only the three most recent checkpoints are kept on disk.
    save_total_limit=3,
    fp16=True,
    push_to_hub=False,
)

In [33]:
# Wire the model, tokenizer, training arguments, collator and the two
# tokenized splits ("train" for training, "valid" for evaluation) into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"]
)

In [34]:
trainer.train()



Step,Training Loss,Validation Loss


KeyboardInterrupt: 