In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Step 1: Load your dataset
data = pd.read_csv("code.csv", delimiter=";")  # Adjust delimiter if necessary

# Step 2: Split the dataset into train and test sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Save the train and test sets to CSV files
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

# Step 4: Load the datasets in Hugging Face format
train_dataset = load_dataset("csv", data_files={"train": "train.csv"})["train"]
test_dataset = load_dataset("csv", data_files={"test": "test.csv"})["test"]

print(train_dataset)
print(test_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['def add_numbers(a, b):   print(a + b)', 'add_numbers(5, 3)', '8', 'Add two numbers', 'num', '2'],
    num_rows: 4039
})
Dataset({
    features: ['def add_numbers(a, b):   print(a + b)', 'add_numbers(5, 3)', '8', 'Add two numbers', 'num', '2'],
    num_rows: 1010
})


In [3]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

# Set pad token if missing
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_data(example):
    # Combine the columns into one string as input
    inputs = (
        f"### Python Code:\n{example['python_code']}\n"
        f"### Reasoning:\n{example['human_reasoning']}\n"
        f"### Execution:\n{example['execution_example']}->{example['execution_result']}\n"
    )
    
    # Tokenize with truncation and padding
    tokenized = tokenizer(inputs, truncation=True, padding='max_length', max_length=40)
    return tokenized


# Load CSV files for train and test datasets separately
train_dataset = load_dataset('csv', data_files='train.csv', header=None)
test_dataset = load_dataset('csv', data_files='test.csv', header=None)

# Manually assign column names for both train and test datasets
train_dataset = train_dataset['train'].rename_column('0', 'python_code')
train_dataset = train_dataset.rename_column('3', 'human_reasoning')
train_dataset = train_dataset.rename_column('1', 'execution_example')
train_dataset = train_dataset.rename_column('2', 'execution_result')


test_dataset = test_dataset['train'].rename_column('0', 'python_code')
test_dataset = test_dataset.rename_column('3', 'human_reasoning')
test_dataset = test_dataset.rename_column('1', 'execution_example')
test_dataset = test_dataset.rename_column('2', 'execution_result')


# Apply tokenization to both datasets
train_dataset = train_dataset.map(tokenize_data, batched=False)
test_dataset = test_dataset.map(tokenize_data, batched=False)

# Print out a sample to verify tokenization
print(train_dataset[0])




tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4040 [00:00<?, ? examples/s]

Map:   0%|          | 0/1011 [00:00<?, ? examples/s]

{'python_code': 'def add_numbers(a, b):   print(a + b)', 'execution_example': 'add_numbers(5, 3)', 'execution_result': '8', 'human_reasoning': 'Add two numbers', '4': 'num', '5': 2, 'input_ids': [128000, 14711, 13325, 6247, 512, 755, 923, 34064, 2948, 11, 293, 1680, 256, 1194, 2948, 489, 293, 340, 14711, 27857, 287, 512, 2261, 1403, 5219, 198, 14711, 32028, 512, 723, 34064, 7, 20, 11, 220, 18, 4085, 23, 198, 128001], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]}


In [4]:
# fine tuning the model

from transformers import LlamaForCausalLM, Trainer, TrainingArguments

# Load the pre-trained Llama model
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # Adjust for the specific version

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    fp16=True,  # Enable FP16
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save at the end of each epoch
    logging_dir="./logs",
    save_total_limit=2,
    warmup_steps=100,
    weight_decay=0.01,
    optim="adamw_torch",
    load_best_model_at_end=True,  # Works since strategies match
    report_to="tensorboard"
)


# Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Start fine-tuning
trainer.train()

  trainer = Trainer(


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
# monitoring training with tensorboard 
# to be launched in the terminal*
#tensorboard --logdir=./logs

In [None]:
# evaluating model performance
eval_results = trainer.evaluate()

# Print evaluation results
print(eval_results)

In [None]:
# saving the fine-tuned model
model.save_pretrained("./finetuned_llama_model") # the path is to be adjusted to the correct location
tokenizer.save_pretrained("./finetuned_llama_model")

In [None]:
# generating code with the fine tuned model

from transformers import pipeline

# Load the fine-tuned model
model = LlamaForCausalLM.from_pretrained("./finetuned_llama_model")
tokenizer = LlamaTokenizer.from_pretrained("./finetuned_llama_model")

# Use a pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate output
input_text = "### Python Code:\ndef factorial(n):\n    return n * factorial(n-1) if n > 1 else 1"
generated = generator(input_text, max_length=200)

print(generated[0]['generated_text'])
