In [1]:
!pip install wandb huggingface_hub transformers datasets accelerate bitsandbytes

Collecting wandb
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.12.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metad

In [2]:
from huggingface_hub import notebook_login
import os
import wandb

# Authenticate with Weights & Biases (stores the API key in the netrc file).
wandb.login()
# Render the interactive Hugging Face Hub login widget for this notebook.
notebook_login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch

# Length (in tokens) of every training sequence; reused by the tokenization
# step and passed to the model configuration.
context_length = 256

# Seed PyTorch's global RNG so weight initialisation is repeatable.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x7dcd51073a50>

In [4]:
from datasets import load_dataset

# Hub identifier of the Python research-code corpus used for training.
DATASET_REPO = "ArtifactAI/arxiv_python_research_code"

raw_dataset = load_dataset(DATASET_REPO)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/1415924 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

In [5]:
# Display the DatasetDict summary: available splits, column names and row counts.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['repo', 'file', 'code', 'file_length', 'avg_line_length', 'max_line_length', 'extension_type'],
        num_rows: 1415924
    })
})

In [6]:
from datasets import load_dataset, DatasetDict

ds_train = raw_dataset["train"]

# Sub-sample sizes for the experiment.
N_TRAIN = 50_000
N_VALID = 100

# Shuffle exactly once, with a fixed seed, and then take two DISJOINT index
# ranges. Calling shuffle() separately for each split (as before) draws two
# independent permutations, so validation rows could also be present in the
# training subset, and without a seed the split changed on every run.
ds_shuffled = ds_train.shuffle(seed=42)

raw_datasets = DatasetDict(
    {
        "train": ds_shuffled.select(range(N_TRAIN)),
        "valid": ds_shuffled.select(range(N_TRAIN, N_TRAIN + N_VALID)),
    }
)

In [7]:
from transformers import AutoTokenizer

# Hub identifier of the tokenizer used to encode the source code.
TOKENIZER_REPO = "huggingface-course/code-search-net-tokenizer"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO)

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [8]:
def tokenize(element):
    """Split a batch of source files into fixed-size token chunks.

    The ``code`` field of ``element`` is passed to the module-level
    ``tokenizer`` with truncation at ``context_length`` and overflowing
    tokens returned as additional chunks. Only chunks whose reported length
    equals ``context_length`` are kept; shorter chunks are discarded.

    Args:
        element: Batch mapping that provides a ``"code"`` entry.

    Returns:
        A dict with a single key ``"input_ids"`` holding the list of kept
        token-id chunks.
    """
    encoded = tokenizer(
        element["code"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only the chunks that fill the whole context window.
    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}


# Every original column is dropped so that only the `input_ids` produced by
# `tokenize` remain in the mapped splits.
source_columns = raw_datasets["train"].column_names

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=source_columns,
)
tokenized_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 455147
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 698
    })
})

### Model Training (GPT-2 architecture, initialised from config)

In [9]:
import os

# Name of the Weights & Biases project, exported through the environment.
WANDB_PROJECT_NAME = "python_code"
os.environ["WANDB_PROJECT"] = WANDB_PROJECT_NAME

In [10]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and override the vocabulary size
# and BOS/EOS ids so they match the code tokenizer loaded earlier.
# NOTE(review): recent transformers releases appear to size GPT-2 position
# embeddings from `n_positions`, in which case `n_ctx` has no effect on the
# architecture — confirm against the installed version before relying on it.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size = len(tokenizer),
    n_ctx = context_length,
    bos_token_id = tokenizer.bos_token_id,
    eos_token_id = tokenizer.eos_token_id
)

# The model is constructed from the config only; no pretrained checkpoint is
# loaded here.
model = GPT2LMHeadModel(config)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [11]:
from transformers import DataCollatorForLanguageModeling

# The tokenizer's end-of-sequence token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects causal (next-token) language modelling rather than
# masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters.
# - Effective batch size per device = 32 sequences * 8 accumulation steps.
# - Evaluation runs once at the end of each epoch. The former `eval_steps=25`
#   was removed: it only applies when eval_strategy="steps", so it had no
#   effect here and suggested a step-wise evaluation that never happened.
# - Training loss is logged every 25 optimizer steps and a checkpoint is
#   written every 100 steps.
args = TrainingArguments(
    output_dir="/content/python_code",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    logging_steps=25,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=1e-3,
    save_steps=100,
    fp16=True,
    report_to="wandb",
)

# Wire the model, tokenizer, collator and the tokenized splits together.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [13]:
# Run the training loop; returns a TrainOutput with the global step count,
# average training loss and runtime metrics.
trainer.train()



[34m[1mwandb[0m: Currently logged in as: [33msingh-ayushk1128[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.105,2.339702


TrainOutput(global_step=1778, training_loss=3.187846030373675, metrics={'train_runtime': 7478.1447, 'train_samples_per_second': 60.864, 'train_steps_per_second': 0.238, 'total_flos': 5.9463142244352e+16, 'train_loss': 3.187846030373675, 'epoch': 1.0})

In [19]:
import os
from getpass import getpass

from huggingface_hub import HfApi

# SECURITY: never commit an access token to a notebook. The token that was
# previously hardcoded in this cell must be considered leaked and should be
# revoked in the Hugging Face account settings.
# The token is taken from the HF_TOKEN environment variable when present,
# otherwise it is prompted for without echoing. (The old code exported it as
# HUGGINGFACE_HUB_TOKEN, which huggingface_hub does not read.)
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token (write access): ")

# Export under the name the Hub client looks up, so later Hub calls in this
# session authenticate with the same token.
os.environ["HF_TOKEN"] = hf_token

# Initialize the API with the token passed explicitly.
api = HfApi(token=hf_token)

# Create the repository; exist_ok=True keeps the cell re-runnable once the
# repository already exists.
repo_url = api.create_repo(
    repo_id="phoenix28/python_code_gen",
    private=True,
    exist_ok=True,
)
print(f"Repository created at: {repo_url}")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66b39ca4-383ac3844f82bdf24cf1f0c4;86f4b7b4-70b0-47c6-84ef-b40e7426e09e)

Invalid username or password.

In [14]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): the recorded run failed with "401 Unauthorized" on repo
# creation — presumably the session token lacks write access; confirm the
# login before re-running.
trainer.push_to_hub()

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-66b399af-69750e333f50b1983e0db019;a661e924-0735-4225-841e-b12c2a44bb92)

Invalid username or password.

In [None]:
import torch
from transformers import pipeline

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Text-generation pipeline backed by the model published on the Hub.
pipe = pipeline("text-generation", model="phoenix28/python_code", device=device)

In [None]:
# Close the active Weights & Biases run. The module-level helper is used
# instead of `wandb.run.finish()` because `wandb.run` is None when no run is
# active (fresh kernel, or the run was already finished), which would raise
# AttributeError.
wandb.finish()