In [21]:
import json

# Load and preprocess the data
with open('data/gptCodeSnippets.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

codes = [item['code'] for item in data]
types = [item['type'] for item in data]


print(codes[0])
print(types[0])

def quicksort(arr):
    """
    QuickSort algorithm - Sorts a list of integers
    params: list of integers
    returns: sorted list of integers
    """
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quicksort(left) + middle + quicksort(right)
Sorting


In [22]:
import os
import json
import torch
from transformers import RobertaTokenizer, RobertaModel
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Load and preprocess the data
with open('data/gptCodeSnippets.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

codes = [item['code'] for item in data]
types = [item['type'] for item in data]

# Save the pre-trained CodeBERT model and tokenizer
pretrained_codebert_model_dir = './pretrained_codebert_model'
if not os.path.exists(pretrained_codebert_model_dir):
    codebert_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    codebert_model = RobertaModel.from_pretrained("microsoft/codebert-base")
    codebert_model.save_pretrained(pretrained_codebert_model_dir)
    codebert_tokenizer.save_pretrained(pretrained_codebert_model_dir)
else:
    codebert_tokenizer = RobertaTokenizer.from_pretrained(pretrained_codebert_model_dir)
    codebert_model = RobertaModel.from_pretrained(pretrained_codebert_model_dir)

# Function to generate embeddings with padding and batching
def generate_embeddings(codes, batch_size=16, max_length=512):
    all_embeddings = []
    for i in range(0, len(codes), batch_size):
        batch = codes[i:i + batch_size]
        inputs = codebert_tokenizer(batch, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = codebert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu())
    return torch.cat(all_embeddings)

# Generate embeddings for our codes
embeddings = generate_embeddings(codes)

# Create label mapping
label_to_idx = {label: idx for idx, label in enumerate(set(types))}
labels = torch.tensor([label_to_idx[label] for label in types])

class CodeClassifier(torch.nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CodeClassifier, self).__init__()
        self.linear = torch.nn.Linear(input_dim, num_classes)

    def forward(self, x):
        return self.linear(x)

# Create dataset and dataloader
dataset = TensorDataset(embeddings, labels)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Initialize classifier and optimizer
classifier = CodeClassifier(input_dim=embeddings.size(1), num_classes=len(label_to_idx))
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
fine_tuned_classifier_dir = './fine_tuned_code_classifier'
os.makedirs(fine_tuned_classifier_dir, exist_ok=True)

# Training loop with evaluation and checkpointing
for epoch in range(5):
    classifier.train()
    for batch in dataloader:
        inputs, targets = batch
        
        outputs = classifier(inputs)
        loss = F.cross_entropy(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")
    
    # Save checkpoint
    torch.save(classifier.state_dict(), os.path.join(fine_tuned_classifier_dir, f'classifier_epoch_{epoch+1}.pt'))

# Save the final classifier
final_classifier_dir = './final_fine_tuned_code_classifier'
os.makedirs(final_classifier_dir, exist_ok=True)
torch.save(classifier.state_dict(), os.path.join(final_classifier_dir, 'classifier.pt'))

# Function to classify code
def classify_code(code):
    embedding = generate_embeddings([code])
    output = classifier(embedding)
    predicted_class = torch.argmax(output, dim=1).item()
    idx_to_label = {v: k for k, v in label_to_idx.items()}
    return idx_to_label[predicted_class]

# Example usage



Epoch 1, Loss: 2.7180190086364746
Epoch 2, Loss: 2.1995134353637695
Epoch 3, Loss: 2.3755178451538086
Epoch 4, Loss: 1.6703535318374634
Epoch 5, Loss: 2.5723118782043457


In [23]:
example_code = """
def add(a, b):
    return a + b"""

predicted_class = classify_code(example_code)
print("Predicted Class:", predicted_class)

Predicted Class: Basic Syntax


In [24]:
example_code= """def last_word_in_alphabetical_order_by_gpt(text: str):
    '''
     Returns the last (alphabetically) word that can appear in the text.

     :param text: A string containing multiple words separated by spaces.
     :return: The last (alphabetically) word in the text.
     :time complexity: O(n*log(n)), where n is the number of words in the text.
     :space complexity: O(n), where n is the number of words in the text.
     '''

    words = text.split()  # Split the text into individual words
    words.sort()  # Sort the words alphabetically
    return words[-1]  # Return the last (alphabetically) word"""
predicted_class = classify_code(example_code)
print("Predicted Class:", predicted_class)

Predicted Class: Array


In [25]:
data_to_test_file='my_problems_solved/goodFormat.json'
with open(data_to_test_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

codes = [item['code'] for item in data]
types = [item['type'] for item in data]
for code in codes:
    print('-'*50)
    predicted_class = classify_code(code)
    print("Code:", code[:50] + '...' if len(code) > 50 else code)
    print("Predicted Class:", predicted_class)
    print("Real Class:", types[codes.index(code)])

--------------------------------------------------
Code: def last_word_in_alphabetical_order_by_me(text: st...
Predicted Class: Data Structures
Real Class: String
--------------------------------------------------
Code: def compare_real_numbers(a, b):
    '''
    Functi...
Predicted Class: Data Structures
Real Class: Math
--------------------------------------------------
Code: def problema_3(vector1: list, vector2: list) -> fl...
Predicted Class: Data Structures
Real Class: Math
--------------------------------------------------
Code: def cuvinte_unice(text):
    """
    Găsește cuvin...
Predicted Class: Data Structures
Real Class: String
--------------------------------------------------
Code: def problema_5(sequence:list)->int:
    """
    De...
Predicted Class: Data Structures
Real Class: Array
--------------------------------------------------
Code: def test_problema_6():
    assert (problema_6([2,8...
Predicted Class: Data Structures
Real Class: Array
----------------------------