In [1]:
import numpy as np

class ANN:
    def __init__(self, input_size=49152, hidden_size=12, output_size=10):
        self.a1, self.a2, self.z1, self.z2 = None, None, None, None
        self.w1 = np.random.randn(hidden_size, input_size) * 0.01
        self.w2 = np.random.randn(output_size, hidden_size) * 0.01
        self.b1 = np.zeros((hidden_size, 1))
        self.b2 = np.zeros((output_size, 1))
        self.learning_rate = 0.01

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def relu(self, z):
        return np.maximum(0, z)

    def relu_derivative(self, z):
        return np.where(z > 0, 1, 0)

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z))
        return exp_z / exp_z.sum(axis=0, keepdims=True)

    def forward(self, x):
        self.z1 = np.dot(self.w1, x) + self.b1
        self.a1 = self.relu(self.z1)
        self.z2 = np.dot(self.w2, self.a1) + self.b2
        self.a2 = self.softmax(self.z2)
        return self.a2

    def compute_loss(self, Y):
        m = Y.shape[1]
        logprobs = np.multiply(np.log(self.a2), Y)
        loss = -np.sum(logprobs) / m
        return loss

    def backward(self, x, y):
        m = y.shape[1]
        dZ2 = self.a2 - y
        dW2 = (1 / m) * np.dot(dZ2, self.a1.T)
        db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
        dZ1 = np.dot(self.w2.T, dZ2) * self.relu_derivative(self.z1)
        dW1 = (1 / m) * np.dot(dZ1, x.T)
        db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

        self.w1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1
        self.w2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2

    def fit(self, X, Y, epochs=1000, learning_rate=0.01):
        self.learning_rate = learning_rate
        for i in range(epochs):
            self.forward(X)
            loss = self.compute_loss(Y)
            self.backward(X, Y)
            if i % 100 == 0:
                print(f'Loss at epoch {i} is {loss}')

    def predict(self, X):
        predictions = self.forward(X)
        return np.argmax(predictions, axis=0)


In [2]:
import torch
import json

# Load the data
with open('data/gptCodeSnippets.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

codes = [item['code'] for item in data]
types = [item['type'] for item in data]

# Tokenize the code snippets using CodeBERT tokenizer
from transformers import RobertaTokenizer, RobertaModel

pretrained_codebert_model_dir = './pretrained_codebert_model_3'
codebert_tokenizer = RobertaTokenizer.from_pretrained(pretrained_codebert_model_dir)
codebert_model = RobertaModel.from_pretrained(pretrained_codebert_model_dir)

# Function to generate embeddings
def generate_embeddings(codes, batch_size=16, max_length=512):
    all_embeddings = []
    for i in range(0, len(codes), batch_size):
        batch = codes[i:i + batch_size]
        inputs = codebert_tokenizer(batch, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = codebert_model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)

# Generate embeddings for our codes
embeddings = generate_embeddings(codes)

# Create label mapping
label_to_idx = {label: idx for idx, label in enumerate(set(types))}
labels = np.array([label_to_idx[label] for label in types])

# One-hot encode the labels
num_classes = len(label_to_idx)
labels_one_hot = np.eye(num_classes)[labels].T

# Reshape the data
def reshape_data(train_inputs, train_outputs, test_inputs, test_outputs):
    train_inputs = np.array(train_inputs).reshape(train_inputs.shape[0], -1).T
    train_outputs = np.array(train_outputs).reshape(num_classes, -1)
    test_inputs = np.array(test_inputs).reshape(test_inputs.shape[0], -1).T
    test_outputs = np.array(test_outputs)
    return train_inputs, train_outputs, test_inputs, test_outputs

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(embeddings, labels_one_hot.T, test_size=0.2, random_state=42)
train_inputs, train_outputs, test_inputs, test_outputs = reshape_data(train_inputs, train_outputs, test_inputs, test_outputs)


  return dynamo.is_compiling()
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [3]:
# Initialize and train the ANN
input_size = train_inputs.shape[0]
hidden_size = 23
output_size = num_classes

ann = ANN(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
ann.fit(train_inputs, train_outputs, epochs=1000, learning_rate=0.01)


Loss at epoch 0 is 3.434043914658668
Loss at epoch 100 is 3.4305569336004647
Loss at epoch 200 is 3.4257681356411687
Loss at epoch 300 is 3.418174784807587
Loss at epoch 400 is 3.4091042504756524
Loss at epoch 500 is 3.4009570620208476
Loss at epoch 600 is 3.391737804672854
Loss at epoch 700 is 3.378254077650657
Loss at epoch 800 is 3.35872225984241
Loss at epoch 900 is 3.332372999705726


In [6]:
# Test the ANN on the provided test data
data_to_test_file = 'my_problems_solved/goodFormat.json'
with open(data_to_test_file, 'r', encoding='utf-8') as f:
    test_data = json.load(f)

test_codes = [item['code'] for item in test_data]
test_types = [item['type'] for item in test_data]
print(test_types)
test_labels = np.array([label_to_idx[label] for label in test_types])

# Generate embeddings for the test codes
test_embeddings = generate_embeddings(test_codes)
test_inputs2 = np.array(test_embeddings).reshape(test_embeddings.shape[0], -1).T

# Predict the classes for the test data
predicted_classes = ann.predict(test_inputs2)
print(predicted_classes)

# Print predictions and real classes
for i, code in enumerate(test_codes):
    print('-' * 50)
    print("Code:", code[:50] + '...' if len(code) > 50 else code)
    print("Predicted Class:", list(label_to_idx.keys())[predicted_classes[i]])
    print("Real Class:", test_types[i])


['String', 'Math', 'Math', 'String', 'Array', 'Array', 'Array', 'Binary', 'Dynamic Programming', 'Matrix']
[19 19 19 19 19 19 19 19 19 19]
--------------------------------------------------
Code: def last_word_in_alphabetical_order_by_me(text: st...
Predicted Class: Binary
Real Class: String
--------------------------------------------------
Code: def compare_real_numbers(a, b):
    '''
    Functi...
Predicted Class: Binary
Real Class: Math
--------------------------------------------------
Code: def problema_3(vector1: list, vector2: list) -> fl...
Predicted Class: Binary
Real Class: Math
--------------------------------------------------
Code: def cuvinte_unice(text):
    """
    Găsește cuvin...
Predicted Class: Binary
Real Class: String
--------------------------------------------------
Code: def problema_5(sequence:list)->int:
    """
    De...
Predicted Class: Binary
Real Class: Array
--------------------------------------------------
Code: def test_problema_6():
    assert (prob