In [11]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

# Select the compute device: the GPU when torch reports CUDA, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Read the raw samples; the CSV must provide 'dart' and 'javascript' columns
data_path = './data/raw/samples.csv'
df = pd.read_csv(data_path)

# Pair each row's two code snippets into one dict under a 'translation' column,
# then hand only that column to the Hugging Face `datasets` library
df['translation'] = [
    {'dart': dart_src, 'javascript': js_src}
    for dart_src, js_src in zip(df['dart'], df['javascript'])
]
dataset = Dataset.from_pandas(df[['translation']])

# Function to preprocess the data
def preprocess_function(examples):
    """Tokenize a batch of Dart -> JavaScript pairs for seq2seq fine-tuning.

    Reads the module-level ``tokenizer``, which must be defined before this
    function is called (it is looked up at call time, not definition time).

    Args:
        examples: Batch dict as supplied by ``Dataset.map(..., batched=True)``.
            ``examples['translation']`` is a list of dicts, each with a
            ``'dart'`` and a ``'javascript'`` entry.

    Returns:
        The tokenizer output for the prompted Dart inputs, with an added
        ``'labels'`` entry holding the JavaScript token ids. Both sides are
        truncated/padded to 512 tokens; padding positions in the labels are
        replaced by -100.
    """
    inputs = [f"translate Dart to Javascript: {ex['dart']}" for ex in examples['translation']]
    targets = [ex['javascript'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    target_ids = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids

    # Targets are padded to a fixed length, so most label positions are padding.
    # Label value -100 is the index the model's loss ignores; without this
    # masking the loss would also be computed over the pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# Load the tokenizer and model
model_name = "Salesforce/codet5-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# Preprocess the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out a slice of the data for evaluation. Evaluating on the very examples
# the model is trained on only measures memorisation, so the per-epoch eval
# loss would say nothing about generalisation. The split is seeded so that a
# Restart & Run All reproduces the same partition.
EVAL_FRACTION = 0.1
SEED = 42
split_dataset = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SEED)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    seed=SEED,
)

# Initialize the Trainer with disjoint train / eval splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)

# Check if the Trainer is using the right device
print(f"Trainer device: {trainer.args.device}")

Using device: cuda


Map:   0%|          | 0/212 [00:00<?, ? examples/s]



Trainer device: cuda:0


In [None]:
# Fine-tune the model
trainer.train()

# Save the model and tokenizer side by side so the directory can be reloaded
# with from_pretrained
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Test the fine-tuned model with a sample translation.
# The snippet is fed through the same prompt prefix used during training.
dart_code = "print('Hello, world!')"
input_text = f"translate Dart to Javascript: {dart_code}"
# Truncate to the same 512-token limit applied to the training inputs
input_ids = tokenizer.encode(
    input_text, return_tensors="pt", truncation=True, max_length=512
).to(device)

# Generate the translation.
# Switch to eval mode explicitly so dropout is disabled regardless of the
# mode the Trainer left the model in.
model.eval()
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=50)

# Decode the output
translated_code = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Translated code:\n{translated_code}")