<a href="https://colab.research.google.com/github/dasneelay/360Project/blob/main/HfPractice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import datasets
import torch
import transformers
import accelerate
from transformers import GPT2LMHeadModel

In [3]:
# Load the pre-split CodeParrot training and validation corpora.
ds_train = datasets.load_dataset(
    "huggingface-course/codeparrot-ds-train",
    split="train",
)
ds_valid = datasets.load_dataset(
    "huggingface-course/codeparrot-ds-valid",
    split="validation",
)

In [56]:
# Bundle both splits under the "train" / "valid" keys used by the later cells.
raw_datasets = datasets.DatasetDict({"train": ds_train, "valid": ds_valid})

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [57]:
# Preview the first training record, showing only the first 200 characters of
# each field so the file content does not flood the output.
first_example = raw_datasets["train"][0]
for feature, value in first_example.items():
    print(f"{feature}: {value[:200]}")

repo_name: kmike/scikit-learn
path: sklearn/utils/__init__.py
copies: 3
size: 10094
content: """
The :mod:`sklearn.utils` module includes various utilites.
"""

from collections import Sequence

import numpy as np
from scipy.sparse import issparse

from .murmurhash import murm
license: bsd-3-clause


In [58]:
# Maximum number of tokens per training chunk.
context_length = 128

tokenizer = transformers.AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")

# Tokenize the first two files to see how long inputs are split:
# with overflow enabled, each file yields several chunks of at most
# `context_length` tokens, and `overflow_to_sample_mapping` records which
# file every chunk came from.
sample_texts = raw_datasets["train"][:2]["content"]
outputs = tokenizer(
    sample_texts,
    max_length=context_length,
    truncation=True,
    return_length=True,
    return_overflowing_tokens=True,
)

chunk_count = len(outputs["input_ids"])
print(f"Input IDs length: {chunk_count}")
print(f"Input chunk lengths: {outputs['length']}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [59]:
def tokenize(element):
    """Split a batch of source files into fixed-size token chunks.

    ``element["content"]`` is run through the notebook-level ``tokenizer``
    with truncation and overflow enabled, so every file is cut into chunks
    of at most ``context_length`` tokens. Chunks whose reported length is
    not exactly ``context_length`` are discarded.

    Args:
        element: Batch mapping with a ``"content"`` entry holding the texts.

    Returns:
        A dict with a single ``"input_ids"`` key listing the kept chunks.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Keep only full-length chunks; shorter remainders are dropped.
    full_chunks = [
        ids
        for ids, size in zip(encoded["input_ids"], encoded["length"])
        if size == context_length
    ]
    return {"input_ids": full_chunks}

# Chunk every split in batches. The original columns are removed because the
# chunked output has a different number of rows than the input.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=original_columns,
)
tokenized_datasets

Map:   0%|          | 0/606720 [00:00<?, ? examples/s]

Map:   0%|          | 0/3322 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 16702061
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 93164
    })
})

In [60]:
# Start from the stock GPT-2 configuration and override the pieces that depend
# on this notebook's tokenizer: vocabulary size, context length, and the
# special-token ids. The keyword must be spelled `bos_token_id`; a misspelled
# keyword would not override the BOS id inherited from the "gpt2" defaults.
config = transformers.AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [61]:
# Build the model from `config` directly (not via `from_pretrained`).
model = GPT2LMHeadModel(config)

# Use the Apple MPS backend when it is available, otherwise fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model.to(device)

# Total number of parameter elements, reported in millions.
model_size = sum(p.numel() for p in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [62]:
# Reuse the end-of-sequence token as the padding token for the collator.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [63]:
# Sanity-check the collator on the first five training examples and report
# the shape of every tensor it returns.
sample_batch = [tokenized_datasets["train"][i] for i in range(5)]
res = data_collator(sample_batch)
for key, tensor in res.items():
    print(f"{key} shape: {tensor.shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [64]:
# Training hyperparameters, grouped by concern.
args = transformers.TrainingArguments(
    # Where checkpoints go, and whether they are pushed to the Hub.
    output_dir="codeparrot-ds",
    push_to_hub=True,
    # Batching: 32 sequences per device, gradients accumulated over 8 steps.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # Optimisation: one epoch, cosine schedule with 1,000 warmup steps.
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=False,
    # Evaluate, log and checkpoint on the same 5,000-step cadence.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    save_steps=5_000,
)

# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` here — confirm against the installed version.
trainer = transformers.Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [None]:
# Start training with the Trainer built in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss


In [21]:
# Upload the trained model to the Hugging Face Hub; the call returns the
# CommitInfo describing the resulting commit.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dneelay/codeparrot-ds/commit/9874d2b17766ca085352f04429b0581317c8510c', commit_message='End of training', commit_description='', oid='9874d2b17766ca085352f04429b0581317c8510c', pr_url=None, pr_revision=None, pr_num=None)

In [44]:
# Select MPS only when it is actually usable at runtime. `is_built()` merely
# reports that this PyTorch binary was compiled with MPS support and can be
# True on a machine without a working MPS device, so `is_available()` is the
# correct guard (and matches the device check used when building the model).
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
pipe = transformers.pipeline("text-generation", model="huggingface-course/codeparrot-ds", device=device)

In [45]:
# Prompt: a short pandas snippet ending in a comment for the model to complete.
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""

# Ask for a single completion and keep only its generated text.
generations = pipe(txt, num_return_sequences=1)
response = generations[0]["generated_text"]
print(response, "\n", "response length: ", len(response))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
exp = pd.Series 
 response length:  164


In [43]:
# Diagnostic: reports whether this PyTorch build was compiled with MPS support.
# This alone does not guarantee an MPS device is usable; see
# torch.backends.mps.is_available() for the runtime check.
torch.backends.mps.is_built()

True