In [39]:
import multiprocessing
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer

In [44]:
num_cores_avail = max(1, multiprocessing.cpu_count() - 1)

# Data

In [2]:
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [3]:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

In [4]:
print(
    any_keyword_in_string(example_1, filters),
    any_keyword_in_string(example_2, filters)
)

False True


In [5]:
def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    post_filt_prop = round(len(filtered_dict['content']) / total, ndigits=2)
    print(f"{post_filt_prop * 100}% of data after filtering")
    return Dataset.from_dict(filtered_dict)

In [6]:
dataset_checkpoint = f"transformersbook/codeparrot"
dataset_commit_id = "0933803eb0f5956b2da9d2d7b6805fa31b18a6c8"

In [7]:
split = "train"
data = load_dataset(f"{dataset_checkpoint}-{split}", revision=dataset_commit_id, split=split, streaming=True)

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/183 [00:00<?, ?it/s]

In [8]:
# Skip this because it takes quite a while
# filtered_data = filter_streaming_dataset(data, filters)

In [9]:
prepped_dataset_checkpoint = "huggingface-course/codeparrot-ds"

In [10]:
ds_train = load_dataset(f"{prepped_dataset_checkpoint}-train", split="train")
ds_valid = load_dataset(f"{prepped_dataset_checkpoint}-valid", split="validation")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [11]:
raw_datasets = DatasetDict({
    "train": ds_train,
    "valid": ds_valid
})

In [17]:
n_train_samp = int(0.1 * raw_datasets["train"].num_rows)
n_valid_samp = int(0.2 * raw_datasets["valid"].num_rows)

In [18]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [20]:
raw_datasets_mini = DatasetDict()
raw_datasets_mini["train"] = raw_datasets["train"].shuffle(seed=42).select(range(n_train_samp))
raw_datasets_mini["valid"] = raw_datasets["valid"].shuffle(seed=42).select(range(n_valid_samp))

In [21]:
raw_datasets_mini

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 60672
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 664
    })
})

In [24]:
for k, v in raw_datasets_mini["train"][0].items():
    print(f"{k.upper()}: {v[:200]}")

REPO_NAME: ThomasMiconi/htmresearch
PATH: projects/feedback/feedback_sequences.py
COPIES: 2
SIZE: 26875
CONTENT: 
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2016, Numenta, Inc.  Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# follo
LICENSE: agpl-3.0


# Tokenization

In [27]:
tokenizer_checkpoint = "huggingface-course/code-search-net-tokenizer"
tokenizer_commit_id = "2a84d6753fdeb105c5e2e9a6be952f119216a991"

In [28]:
context_length = 128
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, revision=tokenizer_commit_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading (…)19216a991/vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

Downloading (…)19216a991/merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

Downloading (…)6a991/tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [33]:
outputs = tokenizer(
    raw_datasets_mini["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True
)

In [34]:
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 86
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 17, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 2]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [47]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [48]:
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets_mini["train"].column_names
)

Map:   0%|          | 0/606720 [00:00<?, ? examples/s]

KeyboardInterrupt: 