In [1]:
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from accelerate import Accelerator

# Accelerator configured for fp16 mixed precision; later code reads its device
accelerator = Accelerator(mixed_precision="fp16")

# Read the raw snippet records from disk
with open('data/gptCodeSnippets.json', 'r', encoding='utf-8') as source_file:
    data = json.load(source_file)

# Keep only the two fields used below, exposing 'specifications' as 'comment'
dataset = [
    {'code': record['code'], 'comment': record['specifications']}
    for record in data
]

# Wrap the records in a Hugging Face dataset
hf_dataset = Dataset.from_list(dataset)

# Tokenizer for the checkpoint that gets fine-tuned
model_id = "openchat/openchat-3.6-8b-20240522"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token for padding
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of code/comment pairs for causal-LM fine-tuning.

    Each example is rendered as ONE sequence: the "Code: ... Comments:"
    prompt immediately followed by the reference comment and the EOS token.
    A causal LM is trained to predict the next token of its own input, and
    the language-modeling collator used by this script derives the labels
    from ``input_ids``, so the comment has to be part of the input text;
    passing it separately via ``text_target`` would leave it out of training.

    Args:
        examples: Batched mapping with parallel ``"code"`` and ``"comment"``
            lists (assumes every entry is a plain string -- TODO confirm
            against the JSON schema).

    Returns:
        The tokenizer's encoding of the combined texts, every sequence
        truncated/padded to exactly 512 tokens.
    """
    # NOTE(review): pad_token is set to eos_token in this script, so a collator
    # that masks padding positions in the labels will mask this terminator too.
    texts = [
        f"Code:\n{code}\n\nComments:\n{comment}{tokenizer.eos_token}"
        for code, comment in zip(examples["code"], examples["comment"])
    ]
    return tokenizer(texts, max_length=512, truncation=True, padding="max_length")

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

# Hold out a small evaluation split: evaluation_strategy="epoch" below needs
# an eval_dataset, otherwise the Trainer has nothing to evaluate on.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Fine-tune the model
model = AutoModelForCausalLM.from_pretrained(model_id)

# The model and optimizer are deliberately NOT passed through
# accelerator.prepare(): Trainer performs its own Accelerate preparation
# (device placement, fp16 autocast and gradient scaling), and preparing them a
# second time by hand would wrap them twice.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=1,  # Reduce batch size to save memory
    per_device_eval_batch_size=1,  # Keep evaluation as memory-frugal as training
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Use mixed precision
)

# Define data collator (mlm=False -> causal language modeling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize Trainer; the scheduler slot is None so Trainer builds its default
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
    optimizers=(optimizer, None)
)

# Train the model
accelerator.wait_for_everyone()
trainer.train()

# Test the model with a new code snippet
def generate_comment(code_snippet):
    """Ask the model to annotate a Python snippet line by line.

    Args:
        code_snippet: Python source code as a single string.

    Returns:
        The text generated after the prompt, decoded with special tokens
        removed.
    """
    messages = [
        {"role": "user", "content": f"For each line in this Python code add a comment explaining what it does: {code_snippet}"},
    ]
    # A list of role/content dicts is chat input: it must be rendered through
    # the chat template, because calling the tokenizer directly only accepts
    # strings. return_dict=True also yields the attention mask for generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            do_sample=True,  # temperature only takes effect when sampling
            temperature=0.5,
            pad_token_id=tokenizer.pad_token_id,
        )
    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    response = outputs[0][prompt_length:]
    return tokenizer.decode(response, skip_special_tokens=True)

# Example usage: annotate a three-line Celsius-to-Fahrenheit script
code_snippet = (
    "celsius = 37.5\n"
    "fahrenheit = (celsius * 1.8) + 32\n"
    "print('%0.1f degree Celsius is equal to %0.1f degree Fahrenheit' %(celsius,fahrenheit))"
)
print(generate_comment(code_snippet))


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/133 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
