In [17]:
import torch
from transformers import AutoTokenizer, AutoModel, GPTNeoForCausalLM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import random

In [18]:
def load_code() -> str:
    """
        Picks one of the numbered sample files under ``data/code`` at random
        and returns its contents.

        The file index is drawn from 1..20 (inclusive), so the path read is
        ``data/code/<index>.txt``, resolved against the current working
        directory.

        :rtype: str
        :return: The full text of the randomly chosen code file.
    """
    sample_index = random.randint(1, 20)
    sample_path = f"data/code/{sample_index}.txt"

    with open(sample_path, 'r') as source_file:
        return source_file.read()
    
def load_request() -> str:
    """
        Picks one of the numbered request files under ``data/requests`` at
        random and returns its contents.

        The file index is drawn from 1..11 (inclusive), so the path read is
        ``data/requests/<index>.txt``, resolved against the current working
        directory.

        :rtype: str
        :return: The full text of the randomly chosen request file.
    """
    request_index = random.randint(1, 11)

    with open(f"data/requests/{request_index}.txt", 'r') as request_file:
        request_text = request_file.read()

    return request_text
    
def load_dataset(dataset_path: str) -> pd.DataFrame:
    """
        Reads the classification dataset from a delimited text file.

        Fields in the file are separated by a single backslash character.

        :param dataset_path: Path of the file holding the dataset.

        :rtype: pd.DataFrame
        :return: The parsed dataset.
    """
    # '\\' is one literal backslash; ``sep`` is the primary spelling of the
    # ``delimiter`` option of ``read_csv``.
    return pd.read_csv(dataset_path, sep='\\')

def split_data(data: pd.DataFrame, test_size: float = 0.2) -> tuple:
    """
        Separates the dataset into train/test features and labels.

        The ``code`` column is used as the features and the ``label`` column
        as the targets; any other columns are ignored.

        :param data: The dataset to be split; must have ``code`` and ``label`` columns.
        :param test_size: Forwarded to ``train_test_split`` as ``test_size``.

        :rtype: tuple
        :return: ``(X_train, X_test, y_train, y_test)`` in that order.
    """
    features, targets = data['code'], data['label']

    # random_state is pinned so repeated runs use the same seed for the split.
    train_features, test_features, train_targets, test_targets = train_test_split(
        features, targets, test_size=test_size, random_state=42
    )

    return train_features, test_features, train_targets, test_targets

In [19]:
def generate_embeddings(
    code: str, tokenizer: AutoTokenizer, model: AutoModel, device: torch.device
):
    """
    Computes a mean-pooled embedding for a code snippet.

    The snippet is tokenized into PyTorch tensors, the encoded inputs are moved
    to ``device`` and fed through ``model`` with gradient tracking disabled.
    The model's ``last_hidden_state`` is then averaged over dimension 1.

    :param code: The source code to embed.
    :param tokenizer: The tokenizer used to encode ``code``.
    :param model: The model; its output must expose ``last_hidden_state``.
    :param device: The device the encoded inputs are moved to.
    :return: ``last_hidden_state`` averaged over dimension 1 (presumably the
        token axis, i.e. one vector per input -- confirm against the model's
        output layout).
    """
    encoded = tokenizer(code, return_tensors="pt").to(device)

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return hidden_states.mean(dim=1)


def _embed_codes(codes, tokenizer, model, device) -> np.ndarray:
    """Returns an array holding one mean-pooled embedding per snippet in ``codes``."""
    rows = []
    for code in codes:
        inputs = tokenizer(code, return_tensors="pt").to(device)
        # Inference only, so no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # Average over dimension 1, drop size-1 axes, then hand the result to
        # NumPy (via the CPU) so it can be stacked with the other rows.
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
        rows.append(pooled.cpu().numpy())
    return np.array(rows)


def get_embeddings(x_train, x_test, tokenizer, model, device):
    """
    Embeds every code snippet of the training and testing sets.

    Each snippet is tokenized and run through ``model`` on its own; the
    model's ``last_hidden_state`` is averaged over dimension 1 and converted
    to a NumPy array. Both sets go through the same helper so they are
    guaranteed to be embedded identically.

    :param x_train: Iterable of training code snippets.
    :param x_test: Iterable of testing code snippets.
    :param tokenizer: The tokenizer used to encode each snippet.
    :param model: The model; its output must expose ``last_hidden_state``.
    :param device: The device the encoded inputs are moved to.
    :return: A ``(train_embeddings, test_embeddings)`` pair of NumPy arrays,
        each with one entry per input snippet, in input order.
    """
    return (
        _embed_codes(x_train, tokenizer, model, device),
        _embed_codes(x_test, tokenizer, model, device),
    )


def train_classifier(classifier: RandomForestClassifier, x_train, y_train):
    """
    Fits the classifier on the training data.

    :param classifier: The classifier to train; its ``fit`` method is called.
    :param x_train: The training features passed to ``fit``.
    :param y_train: The training labels passed to ``fit``.
    :return: None; whatever ``fit`` returns is discarded.
    """
    classifier.fit(x_train, y_train)
    return None


def classify_code(classifier: RandomForestClassifier, embeddings: list):
    """
    Classifies the code provided.

    The embeddings may be given as a list, a NumPy array or a PyTorch tensor.
    Tensors are detached, moved to the CPU and converted to NumPy before being
    handed to the classifier, so tensors living on another device or tracking
    gradients are accepted too. A single 1-D embedding vector is treated as
    one sample and reshaped to a one-row matrix.

    :param classifier: The classifier used; its ``predict`` method is called.
    :param embeddings: The embeddings of the code (one row per sample).
    :return: The label predicted for the first row of ``embeddings``.
    """
    if isinstance(embeddings, torch.Tensor):
        # predict() works on array-likes; a tensor that is off-CPU or requires
        # grad cannot be converted implicitly, so do it explicitly here.
        embeddings = embeddings.detach().cpu().numpy()

    if np.ndim(embeddings) == 1:
        # One flat vector means one sample: make it a (1, n_features) matrix.
        embeddings = np.reshape(embeddings, (1, -1))

    predicted_class_label = classifier.predict(embeddings)[0]

    return predicted_class_label


def generate_comments(
    code: str,
    tokenizer: AutoTokenizer,
    model: AutoModel,
    max_length: int = 500,
):
    """
    Asks a generative model to write a specification comment for a snippet.

    The snippet is embedded in a fixed instruction prompt, the prompt is
    encoded and passed to ``model.generate`` with sampling enabled, and the
    first returned sequence is decoded with special tokens skipped.

    :param code: The code for which the comments will be generated.
    :param tokenizer: Tokenizer used to encode the prompt and decode the output.
    :param model: The model that generates the text; it must provide ``generate``.
    :param max_length: Forwarded to ``model.generate`` as ``max_length``.
    :return: The decoded text of the first generated sequence.
    """
    prompt = f"""
    You are a software specification generator. Given the following code snippet, generate a detailed specification comment that includes:
    1. Purpose of the code.
    2. Short description of the logic of each function and method.
    3. Description of the input parameters and the output.
    4. Include complexity analysis if possible.
    
    Code Snippet:
    {code}
    
    Specification:
    """

    prompt_ids = tokenizer.encode(prompt, return_tensors="pt")

    sampling_options = {
        "max_length": max_length,
        "temperature": 0.7,  # adjust for diversity
        "do_sample": True,
        "repetition_penalty": 2.0,
        "num_return_sequences": 1,
        "early_stopping": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    generated_ids = model.generate(prompt_ids, **sampling_options)

    # Only one sequence is requested, so decode the first (and only) result.
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

def generate_from_request(request: str, tokenizer: AutoTokenizer, model: AutoModel, max_length=500):
    """
    Generates Python code for a natural-language request.

    The request is embedded in a fixed instruction prompt, the prompt is
    encoded and passed to ``model.generate`` with sampling enabled, and the
    first returned sequence is decoded with special tokens skipped.

    :param request: The description of what the generated code should do.
    :param tokenizer: Tokenizer used to encode the prompt and decode the output.
    :param model: The model that generates the text; it must provide ``generate``.
    :param max_length: Forwarded to ``model.generate`` as ``max_length``.
    :return: The decoded text of the first generated sequence.
    """
    prompt = f"""
    You are a code generator. Given the following request, generate a code written in Python that will solve the request.
    
    Request:
    {request}
    
    Code:
    """

    # Encode the full instruction prompt. Encoding only the bare request would
    # leave the prompt built above unused and give the model no instructions.
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        temperature=0.7,  # adjust for diversity
        do_sample=True,
        repetition_penalty=2.0,
        num_return_sequences=1,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Skip special tokens so end-of-text/padding markers do not end up in the
    # returned text (same decoding as generate_comments).
    code = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return code

In [20]:
# Random sample snippet that is embedded, classified and commented below.
code = load_code()

# Tokenizer and base model used to embed code snippets.
tokenizer = AutoTokenizer.from_pretrained("Daoguang/PyCodeGPT")
model = AutoModel.from_pretrained("Daoguang/PyCodeGPT")
device = torch.device('cpu')

data = load_dataset("data/dataset.csv")
labels = data['label']

# NOTE: x_train / x_test start out as raw code text and are rebound to
# embedding arrays on the next line; re-running only part of this cell would
# feed the wrong kind of data to get_embeddings.
x_train, x_test, y_train, y_test = split_data(data, test_size=0.1)
x_train, x_test = get_embeddings(x_train, x_test, tokenizer, model, device)
classifier = RandomForestClassifier()

train_classifier(classifier, x_train, y_train)

# Embed the sample once and reuse the result for both the printout and the
# classification instead of running the model twice on the same input.
code_embeddings = generate_embeddings(code, tokenizer, model, device)

print(f"Base code:\n {code}\n\n")
print(f"Embeddings: {code_embeddings}")
print(f"Class: {classify_code(classifier, code_embeddings)}\n\n")

# Text generation needs the causal-LM variant of the checkpoint, so it is
# loaded separately from the base model used for embeddings.
model_lm = GPTNeoForCausalLM.from_pretrained("Daoguang/PyCodeGPT")
print(f"Code along with comments:\n {generate_comments(code, tokenizer, model_lm)}")

request = load_request()
print(f"Request:\n {request}\n\n")
print(f"Code of request:\n {generate_from_request(request, tokenizer, model_lm, max_length=1024)}")

Base code:
 def f(text: str):
    words = text.split(" ")

    word = words[0]

    for i in range(1, len(words)):
        if words[i] > word:
            word = words[i]

    return word


Embeddings: tensor([[-6.2189e-01, -1.8493e+00,  1.7276e+00,  9.5247e-01,  7.5159e-01,
         -2.3681e-01, -1.7968e+00, -1.1511e-01,  1.4903e+00,  5.5977e-01,
         -9.1783e-01, -5.5690e-01, -6.6448e-01,  4.1554e+00,  5.5137e-01,
         -9.3710e-01,  3.3748e-02, -6.8097e-01, -2.0198e+00, -3.6352e-01,
          1.1896e+00,  2.8250e+00,  5.6498e-01,  1.2844e+00, -1.1044e+00,
          3.1458e-01,  4.7703e-01, -1.2139e+00,  6.9446e-01, -6.2384e-01,
         -1.0959e+00, -1.1198e+00, -1.4293e+00, -1.3017e+00,  5.6370e-01,
         -1.9252e+00,  8.8036e-02, -2.3315e+00,  1.1220e+00,  9.2053e-01,
          1.0554e+00, -3.9841e+00,  3.4206e+00,  2.0370e+00, -6.9422e-01,
         -1.2255e+00, -8.7527e-01,  3.7614e-01, -8.7595e-01,  1.9444e+00,
          4.1814e-01,  3.3840e-01, -1.3543e+00,  1.2393e+0



Code along with comments:
 
    You are a software specification generator. Given the following code snippet, generate a detailed specification comment that includes:
    1. Purpose of the code.
    2. Short description of the logic of each function and method.
    3. Description of the input parameters and the output.
    4. Include complexity analysis if possible.
    
    Code Snippet:
    def f(text: str):
    words = text.split(" ")

    word = words[0]

    for i in range(1, len(words)):
        if words[i] > word:
            word = words[i]

    return word
    
    Specification:
     - The first line will contain all information about it's definition (excluding basic keywords) or its constructor signature with any additional details as needed; more specifically name/class attributes like names must be included before keyword arguments on their own lines to provide support documentation
       Example usage is here :  class Foo {
           foo().a() # returns value
          