In [None]:
!pip install wandb huggingface_hub transformers datasets accelerate bitsandbytes

Collecting wandb
  Downloading wandb-0.17.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.12.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.

In [None]:
# Mount Google Drive at /content/drive; the training output directory configured
# further down in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from huggingface_hub import notebook_login
import wandb
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
import os

In [None]:
# Authenticate with Weights & Biases and the Hugging Face Hub.
# Both calls are interactive: W&B prompts for an API key and `notebook_login`
# renders a login widget in the cell output.
wandb.login()
notebook_login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Seed PyTorch's global RNG (the model weights are randomly initialised later on)
torch.manual_seed(42)

# Load the dataset from the Hugging Face Hub and keep a handle on its "train" split
dataset = load_dataset("ArtifactAI/arxiv_python_research_code")
dataset_train = dataset["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/1415924 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

In [None]:
# Prepare datasets for training and validation.
# Shuffle once and carve out two *disjoint* index ranges. Shuffling twice with the
# same seed produces the same order, so selecting range(500) for validation would
# reuse the first 500 training examples and leak training data into evaluation.
shuffled_train = dataset_train.shuffle(seed=42)
split_datasets = DatasetDict({
    "train": shuffled_train.select(range(50000)),
    "valid": shuffled_train.select(range(50000, 50500)),
})

In [None]:
# Load a tokenizer from the Hub (judging by the repository name, one built for
# source code — TODO confirm)
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
# Number of tokens per training sequence; used when chunking the data and passed to the model config
context_length = 256

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [None]:
# Tokenization function
def tokenize_data(batch):
    """Turn a batch of code samples into fixed-length token sequences.

    The ``"code"`` column of ``batch`` is tokenized with truncation at
    ``context_length``, asking the tokenizer for overflowing tokens and for
    the length of every returned sequence. Only sequences whose reported
    length equals ``context_length`` are kept; shorter ones are discarded.

    Args:
        batch: Mapping with a ``"code"`` entry holding the texts to tokenize.

    Returns:
        A dict with a single key ``"input_ids"`` listing the kept sequences.
    """
    encoded = tokenizer(
        batch["code"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    full_chunks = []
    for chunk_ids, chunk_len in zip(encoded["input_ids"], encoded["length"]):
        # Skip anything that does not fill the whole context window.
        if chunk_len != context_length:
            continue
        full_chunks.append(chunk_ids)

    return {"input_ids": full_chunks}

# Run tokenize_data over every split in batches. The raw columns are dropped so
# that only the "input_ids" produced by tokenize_data remain in the result.
tokenized_datasets = split_datasets.map(
    tokenize_data,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Start from the stock "gpt2" configuration and override the tokenizer-dependent fields.
# NOTE(review): recent transformers releases appear to size GPT-2 position embeddings
# from `n_positions` rather than `n_ctx` — confirm `n_ctx` has the intended effect here.
model_config = AutoConfig.from_pretrained(
    "gpt2",
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_ctx=context_length,
    vocab_size=len(tokenizer),
)

# Build the network from the configuration alone; no pretrained checkpoint is loaded here.
model = GPT2LMHeadModel(model_config)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Prepare data collator
# The tokenizer's EOS token is reused as its padding token before the collator is built.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: no masked-language-modelling objective (presumably plain causal LM batches — confirm)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Hyper-parameters and bookkeeping for the Trainer, grouped by concern.
training_args = TrainingArguments(
    # Checkpoints: written under the mounted Drive folder
    output_dir="/content/drive/MyDrive/Quack/python_code_gen",
    save_steps=100,
    # Batching: 32 sequences per device, gradients accumulated over 8 batches
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # Optimisation
    num_train_epochs=1,
    learning_rate=1e-3,
    weight_decay=0.1,
    fp16=True,
    # Logging and evaluation
    # NOTE(review): with the "epoch" evaluation strategy `eval_steps` is presumably
    # ignored — confirm whether step-based evaluation was intended.
    logging_steps=25,
    evaluation_strategy="epoch",
    eval_steps=25,
    report_to="wandb",
)



In [None]:
# Wire the model, tokenizer, data splits, collator and arguments into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
)

In [None]:
# Train the model. It was built from a configuration only, so this is training
# from scratch rather than fine-tuning. Afterwards upload the result to the
# Hugging Face Hub.
trainer.train()
trainer.push_to_hub()



Epoch,Training Loss,Validation Loss
0,2.1157,2.183318


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/phoenix28/python_code_gen/commit/0ddfc5ec988287b35b70cda0b74a6c95612f9294', commit_message='End of training', commit_description='', oid='0ddfc5ec988287b35b70cda0b74a6c95612f9294', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Reload the trained model from the Drive output directory.
# Dynamic quantization is only sketched in the commented-out lines and is currently disabled.
# NOTE(review): the inference pipeline later in this notebook loads the model from the
# Hub by name, so this `model` object does not appear to be used afterwards — confirm.
from transformers import GPT2LMHeadModel
#from torch.quantization import quantize_dynamic

model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/Quack/python_code_gen")
#quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

In [None]:
# Build a text-generation pipeline around the model published on the Hub,
# running on the GPU when one is available and on the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator = pipeline(
    task="text-generation",
    model='phoenix28/python_code_gen',
    tokenizer=tokenizer,
    device=device,
)

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/497M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Finish the Weights & Biases run now that training is done
wandb.finish()

In [None]:
# Function to generate output
def generate_output(prompt, **generate_kwargs):
    """Generate a completion for ``prompt`` with the module-level ``generator``.

    Args:
        prompt: Text the generated output should start from.
        **generate_kwargs: Optional keyword arguments forwarded unchanged to
            ``generator`` (for example ``max_new_tokens``), so callers can
            control generation without editing this function. When
            ``num_return_sequences`` is not given it defaults to 1.

    Returns:
        The ``"generated_text"`` field of the first returned sequence.
    """
    # Keep the original single-sequence default while still allowing an override.
    generate_kwargs.setdefault("num_return_sequences", 1)
    return generator(prompt, **generate_kwargs)[0]["generated_text"]

In [None]:
# Example prompt: a short Python snippet that ends in a comment describing the
# code the model is expected to continue with.
prompt = """
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(generate_output(prompt))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
y = np.linspace(-1.0, 1.


In [None]:
# Generate again from the same prompt; the recorded outputs differ between the two
# runs, so generation is presumably sampling-based — confirm.
print(generate_output(prompt))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
x = np.random.randn(100)
x
