1. Preparation for Training (Part 1/2)
    --> Performing Data Split    

In [None]:
# Preparation for Training

import json
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load the generated dataset (JSON Lines: one JSON object per line).
# Point `dataset_path` at your own file if it lives somewhere else.
dataset_path = 'data/analyzed_dataset_3_lines/analyzed_dataset_700.jsonl'

# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default encoding, and skip blank lines (e.g. a trailing empty line), which
# would otherwise make json.loads raise.
with open(dataset_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Extract input (code containing the smell) and target (refactored code) values
magic_number_smells = [item['magic_number_smell'] for item in data]
refactored_codes = [item['refactored_code'] for item in data]

# Split the dataset into training and testing sets.
# 80/20 split; the fixed random_state keeps the split reproducible across runs.
train_magic_number_smells, test_magic_number_smells, train_refactored_codes, test_refactored_codes = train_test_split(
    magic_number_smells, refactored_codes, test_size=0.2, random_state=42
)

# Re-pair inputs and targets into one record (dict) per example for each split
train_dataset = [
    {'magic_number_smell': smell, 'refactored_code': refactored}
    for smell, refactored in zip(train_magic_number_smells, train_refactored_codes)
]
test_dataset = [
    {'magic_number_smell': smell, 'refactored_code': refactored}
    for smell, refactored in zip(test_magic_number_smells, test_refactored_codes)
]

# Output paths for the JSONL splits (this cell only defines the paths;
# nothing is written to disk here)
train_file_path = 'data/train_dataset.jsonl'
test_file_path = 'data/test_dataset.jsonl'

2. Preparation for Training (Part 2/2)
    --> Initializing Tokenizer - T5Tokenizer (t5-small)
    --> Initializing Optimizer - AdamW

In [None]:

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW

# Persist both splits as JSON Lines: one serialized record per line,
# training split first, then the test split.
for split_path, split_records in (
    (train_file_path, train_dataset),
    (test_file_path, test_dataset),
):
    with open(split_path, 'w') as out_file:
        out_file.writelines(json.dumps(record) + '\n' for record in split_records)

# Define a custom dataset class
class CodeDataset(Dataset):
    """Map-style dataset that tokenizes (smelly code, refactored code) pairs.

    Args:
        data: Indexable collection of dicts, each providing the keys
            'magic_number_smell' (model input) and 'refactored_code' (target).
        tokenizer: Tokenizer exposing ``encode_plus`` that returns
            'input_ids' and 'attention_mask' tensors.
        max_length: Length every sequence is padded/truncated to
            (defaults to 512, the value previously hard-coded).
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            dict with 1-D tensors 'input_ids', 'attention_mask' and 'labels'.
            Padded positions of 'labels' are set to -100.
        """
        item = self.data[idx]
        magic_number_smell = item['magic_number_smell']
        refactored_code = item['refactored_code']

        # Tokenize and convert to PyTorch tensors
        inputs = self.tokenizer.encode_plus(magic_number_smell, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        targets = self.tokenizer.encode_plus(refactored_code, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)

        # Drop only the leading batch dimension added by return_tensors='pt'.
        labels = targets['input_ids'].squeeze(0)
        target_mask = targets['attention_mask'].squeeze(0)

        # Mask padded target positions with -100 so the cross-entropy loss
        # skips them; otherwise, with targets padded to max_length, most of
        # the loss would come from predicting padding tokens.
        labels = labels.masked_fill(target_mask == 0, -100)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels,
        }

# Pretrained 't5-small' tokenizer and sequence-to-sequence model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Wrap the raw record lists in CodeDataset objects.
# NOTE: this rebinds train_dataset / test_dataset from lists of dicts to
# CodeDataset instances, so this cell is not safe to run twice in a row.
train_dataset, test_dataset = (
    CodeDataset(train_dataset, tokenizer),
    CodeDataset(test_dataset, tokenizer),
)

# Batches of 4; only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=4)

# Optimizer over all model parameters
optimizer = AdamW(params=model.parameters(), lr=1e-4)

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

3. Print Expected Refactored Code

In [None]:
# Print Expected Refactored Code
# Show a readable preview instead of dumping the repr of the whole list on a
# single line (where every newline in the code appears as an escaped '\n').
# Set preview_limit to None to print every snippet.
preview_limit = 5
print(f"{len(refactored_codes)} refactored snippets in total")
for snippet_number, snippet in enumerate(refactored_codes[:preview_limit], start=1):
    print(f"--- [{snippet_number}] ---")
    print(snippet)

4. Training Loop

In [None]:
# Training Loop


train_losses = []  # Average training loss per epoch
test_losses = []  # Average test loss per epoch
last_3_test_losses = []  # Sliding window of the 3 most recent test losses (plateau check)
max_overfit_epochs = 3  # Maximum consecutive epochs for which test loss can increase before stopping
overfit_epochs = 0  # Consecutive epochs in which the test loss went up

# Training loop
num_epochs = 40
stop_training = False  # Set to True when an early-stopping condition ended training

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    epoch_train_losses = []
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs} (Training)'):
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_train_losses.append(loss.item())

    # Calculate average training loss for the epoch
    train_loss = sum(epoch_train_losses) / len(epoch_train_losses)
    train_losses.append(train_loss)

    # ---- Evaluation pass on the test dataset (no gradient tracking) ----
    model.eval()
    epoch_test_losses = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs} (Testing)'):
            inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss

            epoch_test_losses.append(loss.item())

    # Calculate average testing loss for the epoch
    test_loss = sum(epoch_test_losses) / len(epoch_test_losses)
    test_losses.append(test_loss)

    # Print and/or log the training and testing losses for monitoring
    print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss}, Test Loss: {test_loss}")

    # Save checkpoint after each epoch (before any early-stopping break, so
    # the final epoch always has a checkpoint on disk)
    checkpoint_path = f'magic_smell_model_s_3lines_700_e40_b4_epoch_{epoch + 1}.pth'
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'test_loss': test_loss
    }, checkpoint_path)

    # Early stopping condition for same test losses.
    # The window is refreshed *before* the check so that it includes the
    # current epoch; checking first and appending afterwards would detect a
    # plateau one epoch late.
    last_3_test_losses = test_losses[-3:]
    if len(last_3_test_losses) == 3 and all(
        recent_loss == last_3_test_losses[0] for recent_loss in last_3_test_losses
    ):
        print("Early stopping: Test losses remained the same for 3 epochs.")
        stop_training = True
        break

    # Early stopping condition for overfitting: count consecutive epochs in
    # which the test loss is higher than in the previous epoch.
    if epoch > 0 and test_loss > test_losses[-2]:
        overfit_epochs += 1
        if overfit_epochs >= max_overfit_epochs:
            print(f"Early stopping: Test loss increased continuously for {max_overfit_epochs} epochs.")
            stop_training = True
            break
    else:
        overfit_epochs = 0

# Save the trained model (weights as they are after the last epoch that ran)
model.save_pretrained('magic_smell_model_s_3lines_700_e40_b4')