In [16]:
! pip install transformers datasets
! pip install transformers[torch] accelerate -U
! pip install python-dotenv



In [17]:
! pip install PyGithub datasets



In [18]:
from github import Github
import re 
from datasets import Dataset
import os
from dotenv import load_dotenv

load_dotenv()

# load the GitHub token from the environment variables
api_token = os.getenv("api_token")

# initialize PyGithub with the GitHub token
g = Github(api_token)

# specify the repository
repo = g.get_repo("openai/gym")

# function to extract Python functions from a script
def extract_functions_from_code(code):
    pattern = re.compile(r"def\s+(\w+)\s*\(.*\):")
    functions = pattern.findall(code)
    return functions

# fetch Python files from the repository 
python_files = []
contents = repo.get_contents("")
while contents:
    file_content = contents.pop(0)
    if file_content.type == "dir":
        contents.extend(repo.get_contents(file_content.path))
    else:
        if file_content.path.endswith(".py"):
            python_files.append(file_content)

# extract functions and create dataset
data = {"code": [], "function_name": []}
for file in python_files:
    code = file.decoded_content.decode("utf-8")
    functions = extract_functions_from_code(code)
    for function in functions:
        data["code"].append(code)
        data["function_name"].append(function)

# create a Hugging Face dataset
dataset = Dataset.from_dict(data)

# save the dataset to disk
dataset.save_to_disk("code_generation_dataset")

print("Dataset created and saved to disk.")


Saving the dataset (1/1 shards): 100%|██████████| 974/974 [00:00<00:00, 120501.80 examples/s]

Dataset created and saved to disk.





Now, we will use a pre-trained LLM model from Salesforce to fine-tune the model on our dataset for the task of code generation:

In [19]:
from datasets import load_from_disk
from transformers import AutoTokenizer , AutoModelForCausalLM , Trainer , TrainingArguments

# load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

# set the pad_token to eos_token or add a new pad token
tokenizer.pad_token = tokenizer.eos_token

# load the dataset
dataset = load_from_disk("code_generation_dataset")

# split the dataset into training and test sets
dataset = dataset.train_test_split(test_size=0.1)

# preprocess the dataset
def preprocess_function(examples):
    inputs = tokenizer(examples["code"], padding="max_length", truncation=True)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs


The above step prepares the data for efficient fine-tuning of the code generation model. And now, here’s how to fine-tune the model:

In [20]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# fine-tune the model
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(model = model , 
                    args = training_args , 
                    train_dataset = tokenized_datasets["train"] , 
                    eval_dataset = tokenized_datasets["test"])

trainer.train()

Map: 100%|██████████| 876/876 [00:01<00:00, 739.42 examples/s]
Map: 100%|██████████| 98/98 [00:00<00:00, 358.55 examples/s]
  0%|          | 0/438 [23:38<?, ?it/s]
100%|██████████| 438/438 [6:49:50<00:00, 56.14s/it]   

{'train_runtime': 24590.3223, 'train_samples_per_second': 0.036, 'train_steps_per_second': 0.018, 'train_loss': 0.1876330092617366, 'epoch': 1.0}





TrainOutput(global_step=438, training_loss=0.1876330092617366, metrics={'train_runtime': 24590.3223, 'train_samples_per_second': 0.036, 'train_steps_per_second': 0.018, 'total_flos': 3275396820762624.0, 'train_loss': 0.1876330092617366, 'epoch': 1.0})

This step will take time, depending on the computing power of your system. After this step, here’s how we can test our code generation model:

In [21]:
# define a function to generate code using the fine-tuned model
def generate_code(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(inputs['input_ids'], max_length=max_length)
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code

# test the model with a code generation prompt
prompt = "def merge_sort(arr):"
generated_code = generate_code(prompt)

print("Generated Code:")
print(generated_code)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Code:
def merge_sort(arr):
    if len(arr) > 1:
        mid = len(arr) // 2
        left = arr[:mid]
        right = arr[mid:]

        merge_sort(left)
        merge_sort(right)

        i = 0
        j = 0
        k = 0
        while i < len(left) and j < len(right):
            if left[i]
