In [1]:
import multiprocessing
import os
from glob import glob
from collections import defaultdict
from tqdm import tqdm

import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from accelerate import Accelerator
from datasets import Dataset, load_dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    AutoConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
    get_scheduler
)

In [2]:
num_cores_avail = max(1, multiprocessing.cpu_count() - 1)

# Data

In [3]:
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [4]:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

In [5]:
print(
    any_keyword_in_string(example_1, filters),
    any_keyword_in_string(example_2, filters)
)

False True


In [6]:
def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    post_filt_prop = round(len(filtered_dict['content']) / total, ndigits=2)
    print(f"{post_filt_prop * 100}% of data after filtering")
    return Dataset.from_dict(filtered_dict)

In [7]:
dataset_checkpoint = f"transformersbook/codeparrot"
dataset_commit_id = "0933803eb0f5956b2da9d2d7b6805fa31b18a6c8"

In [8]:
split = "train"
data = load_dataset(f"{dataset_checkpoint}-{split}", revision=dataset_commit_id, split=split, streaming=True)

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/183 [00:00<?, ?it/s]

In [9]:
# Skip this because it takes quite a while
# filtered_data = filter_streaming_dataset(data, filters)

In [10]:
prepped_dataset_checkpoint = "huggingface-course/codeparrot-ds"

In [11]:
ds_train = load_dataset(f"{prepped_dataset_checkpoint}-train", split="train")
ds_valid = load_dataset(f"{prepped_dataset_checkpoint}-valid", split="validation")

In [12]:
raw_datasets = DatasetDict({
    "train": ds_train,
    "valid": ds_valid
})

In [13]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [14]:
n_train_samp = int(0.0025 * raw_datasets["train"].num_rows)
n_valid_samp = int(0.1 * raw_datasets["valid"].num_rows)

In [15]:
raw_datasets_mini = DatasetDict()
raw_datasets_mini["train"] = raw_datasets["train"].shuffle(seed=42).select(range(n_train_samp))
raw_datasets_mini["valid"] = raw_datasets["valid"].shuffle(seed=42).select(range(n_valid_samp))

In [16]:
raw_datasets_mini

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 1516
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 332
    })
})

In [17]:
for k, v in raw_datasets_mini["train"][0].items():
    print(f"{k.upper()}: {v[:200]}")

REPO_NAME: ThomasMiconi/htmresearch
PATH: projects/feedback/feedback_sequences.py
COPIES: 2
SIZE: 26875
CONTENT: 
# Numenta Platform for Intelligent Computing (NuPIC)
# Copyright (C) 2016, Numenta, Inc.  Unless you have an agreement
# with Numenta, Inc., for a separate license for this software code, the
# follo
LICENSE: agpl-3.0


# Tokenization

In [18]:
tokenizer_checkpoint = "huggingface-course/code-search-net-tokenizer"
tokenizer_commit_id = "2a84d6753fdeb105c5e2e9a6be952f119216a991"

In [19]:
context_length = 128
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, revision=tokenizer_commit_id)

In [20]:
outputs = tokenizer(
    raw_datasets_mini["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True
)

In [21]:
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 86
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 17, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 2]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [22]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

In [23]:
dataset_save_dir = "../temp/07/codeparrot-ds-tokenized-dataset"

In [24]:
if os.path.isdir(f"{dataset_save_dir}"):
    tokenized_datasets = load_from_disk(dataset_save_dir)
else:
    tokenized_datasets = raw_datasets_mini.map(
        tokenize, batched=True, remove_columns=raw_datasets_mini["train"].column_names
    )
    tokenized_datasets.save_to_disk(dataset_save_dir)

In [25]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 39588
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 8898
    })
})

# Model

In [26]:
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

In [27]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())

In [28]:
print(f"GPT-2 size: {round(model_size / (1e6), 1)}M params")

GPT-2 size: 124.2M params


In [29]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [30]:
outputs = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for k, v in outputs.items():
    print(f"{k} shape: {v.shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


## Training

In [31]:
model_output_dir = "../temp/07/codeparrot-ds-finetuned"

In [32]:
args = TrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=50,
    save_total_limit=3,
    fp16=True,
    push_to_hub=False,
)

In [33]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"]
)

In [34]:
# trainer.train()

## Pipeline

In [35]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [36]:
model_checkpoints = [d for d in os.listdir(model_output_dir) if "checkpoint" in d]
model_checkpoints = sorted(model_checkpoints, key=lambda x: int(x.split("-")[-1]))
latest_model_checkpoint = model_checkpoints[-1]

In [37]:
latest_model_checkpoint

'checkpoint-150'

In [38]:
gen_pipeline = pipeline(
    "text-generation", model=f"{model_output_dir}/{latest_model_checkpoint}", device=device
)

In [39]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""

In [40]:
print(gen_pipeline(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
################################	)

#
# for the X [1


In [41]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(gen_pipeline(txt, num_return_sequences=1)[0]["generated_text"])


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
# -1.0.k_2..c = np


In [42]:
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(gen_pipeline(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
#
# the test


In [43]:
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(gen_pipeline(txt, num_return_sequences=1)[0]["generated_text"])


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:





import matplotlib.0

# and the :


# Using Accelerate for custom training loop

In [44]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword does not have single token: {keyword}")


Keyword does not have single token: testtest


In [45]:
def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift tokens, < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()

    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1)
    )
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([
        (inputs == kt).float() for kt in keytoken_ids
    ]).sum(axis=[0, 2])
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [46]:
tokenized_datasets.set_format("torch")
# train_dataloader = DataLoader(tokenized_datasets["train"].select(range(100)), batch_size=32, shuffle=True)
# valid_dataloader = DataLoader(tokenized_datasets["valid"].select(range(100)), batch_size=32)
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
valid_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [47]:
weight_decay = 0.1

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0}
    ]

In [48]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(valid_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
            if outputs.loss.dim() == 0:
                outputs.loss = outputs.loss.unsqueeze(0)

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [49]:
# Rebuild model from scratch
model = GPT2LMHeadModel(config)
optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [50]:
accelerator = Accelerator(mixed_precision="fp16")
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, valid_dataloader
)

In [51]:
num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps
)

In [52]:
model_accel_output_dir = "../temp/07/codeparrot-ds-accelerate-finetuned/"

In [53]:
evaluate()

(11.030363082885742, 61719.98828125)

In [54]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 100

model.train()
completed_steps = 0

for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print({
                "lr": lr_scheduler.get_lr(),
                "steps": completed_steps,
                "loss/train": loss.item() * gradient_accumulation_steps,
            })
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % eval_steps * gradient_accumulation_steps) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(model_accel_output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(model_accel_output_dir)

  0%|          | 0/1238 [00:00<?, ?it/s]



{'lr': [6e-06, 6e-06], 'steps': 12, 'loss/train': 131.64871215820312}
{'loss/eval': 9.841143608093262, 'perplexity': 18791.193359375}
{'lr': [1.2e-05, 1.2e-05], 'steps': 24, 'loss/train': 106.70005798339844}
{'loss/eval': 9.283154487609863, 'perplexity': 10755.3056640625}
{'lr': [1.85e-05, 1.85e-05], 'steps': 37, 'loss/train': 126.12361145019531}
{'loss/eval': 9.041620254516602, 'perplexity': 8447.453125}
{'lr': [2.4500000000000003e-05, 2.4500000000000003e-05], 'steps': 49, 'loss/train': 94.83934020996094}
{'loss/eval': 8.54878044128418, 'perplexity': 5160.45703125}
{'lr': [3.1e-05, 3.1e-05], 'steps': 62, 'loss/train': 89.09877014160156}
{'loss/eval': 8.021772384643555, 'perplexity': 3046.572265625}
{'lr': [3.7e-05, 3.7e-05], 'steps': 74, 'loss/train': 113.5152359008789}
{'loss/eval': 7.552396774291992, 'perplexity': 1905.3038330078125}
{'lr': [4.35e-05, 4.35e-05], 'steps': 87, 'loss/train': 86.49349975585938}
{'loss/eval': 7.156615734100342, 'perplexity': 1282.56298828125}
{'lr': [4.9