In [1]:
import pandas as pd
import numpy as np
import json
import ast
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset 
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, RobertaTokenizer

from transformers import AdamW
from tqdm import tqdm

In [2]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoModelForSeq2SeqLM
from torch.cuda.amp import autocast, GradScaler

In [3]:
torch.cuda.empty_cache()

In [4]:
torch.cuda.memory_summary(device=None, abbreviated=False)



## Preprocessing data

In [5]:
error_summary_df = pd.read_csv("./Error Generation/results/error_summary.csv")
error_summary_df.head(5)

Unnamed: 0,problem_id,summary,prompt,solution,generated_output
0,1053,The code compiles but does not pass any tests....,Chef has a binary array in an unsorted manner....,"[""# cook your dish here\nfor _ in range(int(in...",def solve():\n n = int(input())\n arr = list...
1,2645,The code compiles and passes all tests. It has...,=====Function Descriptions=====\nitertools.com...,"[""from itertools import *\ns,n = input().split...",def solve():\n n = int(input())\n arr = list...
2,3325,The code compiles and passes some tests but no...,"Given a string, s, return a new string that or...","[""from collections import Counter\n\ndef most_...",def solve():\n n = int(input())\n arr = list...
3,2336,The code fails to compile. Fix compilation iss...,Petr likes to come up with problems about rand...,"[""n = int(input())\nl = [int(x) - 1 for x in i...",def solve():\n n = int(input())\n arr = list...
4,4377,The code fails to compile. Fix compilation iss...,Alice and Bob have participated to a Rock Off ...,"[""def solve(a, b):\n alice = sum(i > j for ...",def solve():\n n = int(input())\n arr = list...


In [6]:
error_summary_df = error_summary_df.set_index("problem_id")

In [7]:
error_summary_df.head(5)

Unnamed: 0_level_0,summary,prompt,solution,generated_output
problem_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1053,The code compiles but does not pass any tests....,Chef has a binary array in an unsorted manner....,"[""# cook your dish here\nfor _ in range(int(in...",def solve():\n n = int(input())\n arr = list...
2645,The code compiles and passes all tests. It has...,=====Function Descriptions=====\nitertools.com...,"[""from itertools import *\ns,n = input().split...",def solve():\n n = int(input())\n arr = list...
3325,The code compiles and passes some tests but no...,"Given a string, s, return a new string that or...","[""from collections import Counter\n\ndef most_...",def solve():\n n = int(input())\n arr = list...
2336,The code fails to compile. Fix compilation iss...,Petr likes to come up with problems about rand...,"[""n = int(input())\nl = [int(x) - 1 for x in i...",def solve():\n n = int(input())\n arr = list...
4377,The code fails to compile. Fix compilation iss...,Alice and Bob have participated to a Rock Off ...,"[""def solve(a, b):\n alice = sum(i > j for ...",def solve():\n n = int(input())\n arr = list...


In [8]:
error_summary_df["solution"] = error_summary_df["solution"].apply(lambda x: ast.literal_eval(x) if isinstance(x,str) else x)

In [9]:
import random
import ast

def generate_input_output_pairs(df):
    input_output_pairs = []

    for idx, row in df.iterrows():
        # Ensure 'solution' is a list
        if isinstance(row['solution'], str):
            solutions = ast.literal_eval(row['solution'])  # Convert string to list
        else:
            solutions = row['solution']
        
        # Shuffle the solutions and pick one
        if solutions and isinstance(solutions, list):
            index = random.randrange(0,len(solutions))
            expected_output = solutions[index]
        else:
            expected_output = "No valid solution"

        # Generate the input text
        input_text = (
            f"""Generated_output: {row['generated_output']}, expected_output: {expected_output} """
            f"""What is the error or difference?"""
        )
        
        # The output text is the summary
        output_text = row['summary']

        # Append the pair to the list
        input_output_pairs.append((input_text, output_text))

    return input_output_pairs


In [10]:
training_data = generate_input_output_pairs(error_summary_df)
print(training_data[0])

('Generated_output: def solve():\n  n = int(input())\n  arr = list(map(int, input().split()))\n  arr.sort()\n  \n  for i in range(n - 1):\n    if arr[i] == 0 and arr[i+1] == 1:\n      print(i)\n      return\n\n  print(n-1)\n\n\nt = int(input())\nfor _ in range(t):\n  solve(), expected_output: # cook your dish here\nt = int(input())\nfor i in range(t):\n    N = int(input())\n    lst = list(map(int,input().split()))\n    count = 0\n    lst.sort()\n    for j in lst:\n        if j == 0 :\n            count = count+1 \n    print(count)        \n What is the error or difference?', 'The code compiles but does not pass any tests. It has moderate similarity to the reference.')


In [11]:
len(training_data)

4882

In [12]:
train_data, test_data = train_test_split(training_data, test_size = 0.09, random_state = 42)

In [13]:
def preprocess_data(data):
    processed_data = []
    for input_text, output_text in data:
        processed_data.append({
            'input_text': input_text,
            'output_text': output_text
        })
    return processed_data

In [14]:
train_data_processed = preprocess_data(train_data)

In [15]:
test_data_processed = preprocess_data(test_data)

In [16]:
with open('train_data.json', 'w') as train_file:
    json.dump(train_data_processed, train_file, indent=4)

with open('test_data.json', 'w') as test_file:
    json.dump(test_data_processed, test_file, indent=4)

## FineTuning CodeT5

In [28]:
model_name = "Salesforce/codet5-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
with init_empty_weights():
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [29]:
device_map = infer_auto_device_map(model, max_memory={0: "4GiB", "cpu": "8GiB"})
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-base", device_map=device_map)

In [30]:
class CodeT5Dataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['input_text']
        output_text = item['output_text']

        # Tokenize inputs and labels
        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        labels = self.tokenizer(
            output_text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Return tokenized inputs and labels
        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels["input_ids"].squeeze()
        }

In [31]:
train_dataset = CodeT5Dataset(train_data_processed, tokenizer)
test_dataset = CodeT5Dataset(test_data_processed, tokenizer)

In [32]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1)

### Training Loop

In [33]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [35]:
epochs = 3
gradient_accumulation_steps = 4

In [36]:
scaler = GradScaler()

  scaler = GradScaler()


In [37]:
from tqdm import tqdm
import time

for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    # Wrap the DataLoader with tqdm for progress tracking
    loop = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")

    for step, batch in enumerate(loop):
        start_time = time.time()  # Start timer for the batch

        # Move inputs to GPU
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass with mixed precision
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps  # Scale loss for gradient accumulation

        # Backward pass with scaled loss
        scaler.scale(loss).backward()

        # Update the weights every `gradient_accumulation_steps`
        if (step + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()  # Clear accumulated gradients

        # Accumulate loss for monitoring
        epoch_loss += loss.item() * gradient_accumulation_steps  # Unscale the loss for reporting

        # Update the tqdm progress bar
        loop.set_postfix(
            loss=loss.item(),  # Current batch loss
            batch_time=f"{time.time() - start_time:.2f}s"  # Time taken for the batch
        )

    print(f"Epoch {epoch + 1}, Average Loss: {epoch_loss / len(train_loader):.4f}")

  with autocast():
Epoch 1/3: 100%|██████████████████████████████| 4442/4442 [4:26:38<00:00,  3.60s/batch, batch_time=3.45s, loss=0.00187]


Epoch 1, Average Loss: 0.0277


Epoch 2/3: 100%|█████████████████████████████| 4442/4442 [4:17:21<00:00,  3.48s/batch, batch_time=3.41s, loss=0.000556]


Epoch 2, Average Loss: 0.0038


Epoch 3/3: 100%|█████████████████████████████| 4442/4442 [4:21:22<00:00,  3.53s/batch, batch_time=3.43s, loss=0.000427]

Epoch 3, Average Loss: 0.0035





In [38]:
model.save_pretrained("./fine_tuned_codet5")
tokenizer.save_pretrained("./fine_tuned_codet5")

('./fine_tuned_codet5\\tokenizer_config.json',
 './fine_tuned_codet5\\special_tokens_map.json',
 './fine_tuned_codet5\\vocab.json',
 './fine_tuned_codet5\\merges.txt',
 './fine_tuned_codet5\\added_tokens.json')

### Testing the finetuned model

In [39]:
def generate_output(input_text):
    model.eval()
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length").to(device)
    with torch.no_grad():
        outputs = model.generate(inputs["input_ids"], max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
for test_case in test_data_processed[:5]:
    input_text = test_case["input_text"]
    expected_output = test_case["output_text"]

    generated_output = generate_output(input_text)
    print(f"Input: {input_text}")
    print(f"Generated Output: {generated_output}")
    print(f"Expected Output: {expected_output}\n")

Input: Generated_output: def solve():
  n = int(input())
  arr = list(map(int, input().split()))
  arr.sort()
  
  for i in range(n - 1):
    if arr[i] == 0 and arr[i+1] == 1:
      print(i)
      return

  print(n-1)


t = int(input())
for _ in range(t):
  solve(), expected_output: class Solution:
    def thousandSeparator(self, n: int) -> str:
        arr = []
        i, count = 0, 0
        num = str(n)
        while i < len(num):
            if count != 3:
                arr.append(num[~i])
                i += 1
                count += 1
            else:
                arr.append('.')
                count = 0
                
        return ''.join(arr[::-1])
 What is the error or difference?
Generated Output: The code fails to compile. Fix compilation issues first. It is significantly different from the reference.
Expected Output: The code fails to compile. Fix compilation issues first. It is significantly different from the reference.

Input: Generated_output: def solve():
