In [1]:
!pip install transformers peft datasets wandb huggingface_hub

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting wandb
  Downloading wandb-0.17.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model
from huggingface_hub import notebook_login
import wandb
import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import pipeline
import os

In [4]:
# Login to Weights and Biases and Hugging Face Hub
wandb.login()
notebook_login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Set seed for reproducibility
torch.manual_seed(42)

# Load dataset
dataset = load_dataset("ArtifactAI/arxiv_python_research_code")
dataset_train = dataset["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.98k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/1415924 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/26 [00:00<?, ?it/s]

In [6]:
# Prepare datasets for training and validation
split_datasets = DatasetDict({
    "train": dataset_train.shuffle(seed=42).select(range(50000)),
    "valid": dataset_train.shuffle(seed=42).select(range(500))
})

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
context_length = 256

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [8]:
# Tokenization function
def tokenize_data(batch):
    outputs = tokenizer(
        batch["code"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True
    )
    batch_input_ids = [
        input_ids for length, input_ids in zip(outputs["length"], outputs["input_ids"]) if length == context_length
    ]
    return {"input_ids": batch_input_ids}

# Tokenize datasets
tokenized_datasets = split_datasets.map(tokenize_data, batched=True, remove_columns=split_datasets["train"].column_names)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
# Load model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],  # Targeting the Conv1D layers in GPT-2
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [10]:
from transformers import DataCollatorForLanguageModeling

# Prepare data collator with labels
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, return_tensors="pt"
)

In [11]:
from transformers import Trainer, TrainingArguments

# Adjust the training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Quack/python_code_lora",
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    logging_steps=25,
    num_train_epochs=1,
    learning_rate=1e-3,
    save_steps=100,
    report_to="wandb",
    gradient_accumulation_steps=4,  # Gradient accumulation
    fp16=True  # Mixed precision training
)

# Initialize Trainer with the updated arguments
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"]
)

# Fine-tune the model
torch.cuda.empty_cache()  # Clear cache before training
trainer.train()
trainer.push_to_hub()

[34m[1mwandb[0m: Currently logged in as: [33msingh-ayushk1128[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112736711111716, max=1.0…

Epoch,Training Loss,Validation Loss
0,3.7582,No log


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/3.25M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/phoenix28/python_code_lora/commit/1aef72972aed8caa67c300bbb89da0a1719ebe65', commit_message='End of training', commit_description='', oid='1aef72972aed8caa67c300bbb89da0a1719ebe65', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
generator = pipeline(
    "text-generation", model='phoenix28/python_code_lora', tokenizer=tokenizer, device=device
)

adapter_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

In [14]:
wandb.finish()

VBox(children=(Label(value='0.026 MB of 0.026 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███▁
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███▁
train/grad_norm,▅▇▅▂▄▄▄▅▃▂▂▃▅▁▄█▃▄▄▄▂▅▇▂▃▃▄▇▇▂▆▇▄▃▃▂▃▁▃▇
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁█
train/loss,█▇▅▅▄▄▄▄▃▄▃▄▃▃▃▃▂▃▂▃▂▂▃▁▂▂▁▂▂▂▃▂▂▁▂▂▂▁▂▃

0,1
eval/runtime,27.3821
eval/samples_per_second,141.845
eval/steps_per_second,35.461
total_flos,6.05921132717015e+16
train/epoch,0.01219
train/global_step,350.0
train/grad_norm,0.87677
train/learning_rate,0.00099
train/loss,3.818
train_loss,3.85869


In [15]:
# Function to generate output
def generate_output(prompt):
    return generator(prompt, num_return_sequences=1)[0]["generated_text"]

In [16]:
# Example prompt
prompt = """
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(generate_output(prompt))


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
�en np.random.randn(100, y


### Quantization:

GPTQ is a post-training quantization (PTQ) method for 4-bit quantization that focuses primarily on GPU inference and performance.

The idea behind the method is that it will try to compress all weights to a 4-bit quantization by minimizing the mean squared error to that weight. During inference, it will dynamically dequantize its weights to float16 for improved performance whilst keeping memory low.

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Load LLM and Tokenizer
model_id = "phoenix28/python_code_lora"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=False,
    revision="main"
)

# Create a pipeline
pipe = pipeline(model=model, tokenizer=tokenizer, task='text-generation')

tokenizer_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/448k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.25M [00:00<?, ?B/s]

In [21]:
prompt = """
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(prompt)


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y



In [27]:
outputs = pipe(
    prompt,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    repetition_penalty=2.0
)
print(outputs[0]["generated_text"])


# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
def test_error():
    if self._get('X', not '2graph':
        m1d----0[j] += 1

        # If return True: for a-n in 0 and b
                w/k
		
