In [2]:
import pandas as pd
import numpy as np
import json
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset 
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, RobertaTokenizer

from transformers import AdamW
from tqdm import tqdm

In [3]:
from accelerate import infer_auto_device_map, init_empty_weights
from torch.cuda.amp import autocast, GradScaler
from transformers import AutoConfig, AutoModelForSeq2SeqLM

In [4]:
# Release cached, currently unused GPU memory held by PyTorch's caching allocator.
torch.cuda.empty_cache()

## Preprocessing data

In [None]:
# Read the error-summary table from disk and preview its first rows.
error_summary_df = pd.read_csv(
    "./Error Generation/results/error_summary.csv",
)
error_summary_df.head()  # head() shows 5 rows by default

In [None]:
# Index the rows by problem id. The guard keeps this cell idempotent: once the
# column has been moved into the index, calling set_index("problem_id") again
# would raise KeyError when the cell is re-run.
if error_summary_df.index.name != "problem_id":
    error_summary_df = error_summary_df.set_index("problem_id")

In [None]:
# Preview the frame again after the index change.
error_summary_df.head()  # head() shows 5 rows by default

In [None]:
def _literal_or_passthrough(cell):
    """Evaluate a string cell as a Python literal; return any other value unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# Turn stringified solution literals in the "solution" column into Python objects.
error_summary_df["solution"] = error_summary_df["solution"].apply(_literal_or_passthrough)

In [None]:
import random
import ast

def generate_input_output_pairs(df, seed=None):
    """Build (prompt, target) text pairs from the error-summary frame.

    For every row, one entry of the row's ``solution`` list is sampled at
    random and combined with the row's ``generated_output`` into a prompt;
    the row's ``summary`` is used as the target text.

    Args:
        df: DataFrame with 'generated_output', 'solution' and 'summary'
            columns. A 'solution' cell may be a list or the string form of
            a Python literal (parsed with ``ast.literal_eval``).
        seed: Optional seed for a private random generator so the sampled
            solutions are reproducible. When None (the default) the global
            ``random`` module state is used.

    Returns:
        A list of ``(input_text, output_text)`` tuples, one per row, in row
        order.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when a
            string 'solution' cell is not a valid Python literal.
    """
    # A private generator keeps seeded runs independent of global random state.
    rng = random.Random(seed) if seed is not None else random
    input_output_pairs = []

    for _, row in df.iterrows():
        solutions = row['solution']
        if isinstance(solutions, str):
            solutions = ast.literal_eval(solutions)  # string form -> Python object

        # Check the type first: truth-testing array-like or pd.NA cells raises.
        if isinstance(solutions, list) and solutions:
            expected_output = rng.choice(solutions)
        else:
            expected_output = "No valid solution"

        input_text = (
            f"""Generated_output: {row['generated_output']}, expected_output: {expected_output} """
            f"""What is the error or difference?"""
        )

        # The target text is the row's summary.
        input_output_pairs.append((input_text, row['summary']))

    return input_output_pairs


In [None]:
# Build one (input_text, output_text) pair per row and print the first one as a sanity check.
training_data = generate_input_output_pairs(error_summary_df)
print(training_data[0])

In [None]:
# Number of pairs available before the train/test split.
len(training_data)

In [None]:
# Hold out 9% of the pairs for testing; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(
    training_data,
    test_size=0.09,
    random_state=42,
)

In [None]:
def preprocess_data(data):
    """Convert (input_text, output_text) tuples into a list of dict records.

    Each record has the keys 'input_text' and 'output_text'; order is preserved.
    """
    return [
        {'input_text': source, 'output_text': target}
        for source, target in data
    ]

In [None]:
# Convert the training tuples into {'input_text', 'output_text'} dict records.
train_data_processed = preprocess_data(train_data)

In [None]:
# Convert the held-out tuples into {'input_text', 'output_text'} dict records.
test_data_processed = preprocess_data(test_data)

In [None]:
# Write both splits to the working directory as indented JSON, train first.
for json_path, records in (
    ('train_data.json', train_data_processed),
    ('test_data.json', test_data_processed),
):
    with open(json_path, 'w') as handle:
        json.dump(records, handle, indent=4)

## Fine-tuning CodeT5

In [None]:
# One checkpoint identifier shared by the tokenizer and the weight-less skeleton,
# so the device map is inferred for the same architecture that is loaded for training.
model_name = "Salesforce/codet5-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Build the skeleton from the config only: under init_empty_weights no real
# weights are allocated, which is all infer_auto_device_map needs.
model_config = AutoConfig.from_pretrained(model_name)
with init_empty_weights():
    model = AutoModelForSeq2SeqLM.from_config(model_config)
# Tie shared weights so they are counted once when the device map is inferred.
model.tie_weights()

In [None]:
# Per-device memory budget handed to Accelerate: GPU 0 and the CPU.
memory_budget = {0: "4GiB", "cpu": "8GiB"}
device_map = infer_auto_device_map(model, max_memory=memory_budget)

In [None]:
# Load the real pretrained weights, placed according to the inferred device map.
# Reusing `model_name` keeps the weights tied to the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map=device_map)

In [None]:
class CodeT5Dataset(Dataset):
    """Map-style dataset that tokenizes text pairs for seq2seq training.

    Args:
        data: Sequence of dicts with 'input_text' and 'output_text' keys.
        tokenizer: Callable that, given a text plus ``max_length``,
            ``truncation``, ``padding`` and ``return_tensors="pt"``, returns
            a mapping with 'input_ids' and 'attention_mask' tensors whose
            leading dimension is 1.
        max_length: Length that both source and target are truncated/padded to.
    """

    # Label value that the cross-entropy loss ignores.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one text, truncated and padded to ``max_length``."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return 'input_ids', 'attention_mask' and 'labels' tensors for record ``idx``.

        Padded label positions are set to ``IGNORE_INDEX`` so padding does not
        contribute to the training loss.
        """
        item = self.data[idx]
        inputs = self._encode(item['input_text'])
        targets = self._encode(item['output_text'])

        # squeeze(0) drops only the batch dimension; a bare squeeze() would also
        # collapse the sequence dimension when max_length == 1.
        label_ids = targets["input_ids"].squeeze(0)
        label_mask = targets["attention_mask"].squeeze(0)
        labels = label_ids.masked_fill(label_mask == 0, self.IGNORE_INDEX)

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels
        }

In [None]:
# Wrap both splits; each item is tokenized on access with the shared tokenizer
# and the dataset's default max_length of 512.
train_dataset = CodeT5Dataset(train_data_processed, tokenizer)
test_dataset = CodeT5Dataset(test_data_processed, tokenizer)

In [None]:
# Batches of 4 for both splits; only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)

### Training Loop

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and weight_decay
# are pinned to the values the transformers implementation used by default
# (1e-6 and 0.0), so the optimisation hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `model` was loaded with a device_map earlier in this notebook; confirm
# that an additional .to(device) is compatible with that placement. As the cell's last
# expression, this call also displays the module returned by .to().
model.to(device)

In [None]:
# Number of full passes over train_loader performed by the training loop.
epochs = 3

In [None]:
# Gradient scaler used by the training loop to scale the loss before backward
# and to drive the optimizer step.
scaler = GradScaler()

In [None]:
# Mixed-precision fine-tuning: one optimizer step per batch, mean batch loss printed per epoch.
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for batch in train_loader:
        # Place the three tensors the model consumes on the training device
        input_ids, attention_mask, labels = (
            batch[name].to(device)
            for name in ("input_ids", "attention_mask", "labels")
        )

        # Start every step from cleared gradients
        optimizer.zero_grad()

        # Only the forward pass runs under autocast
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
        loss = outputs.loss

        # Backward on the scaled loss, then let the scaler drive the weight update
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Running total of per-batch losses for the epoch report
        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(train_loader)}")

In [None]:
# Report the CUDA setup. Both per-device queries are guarded so the cell also
# runs on CPU-only machines, where torch.cuda.current_device() would raise.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("Number of GPUs:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.current_device() if cuda_available else "No GPU")
print("GPU Name:", torch.cuda.get_device_name(0) if cuda_available else "No GPU")