In [1]:
import javalang
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import json
import re

In [2]:
path = "./train_data.json"

with open(path, 'r') as f:
        json_data = json.load(f)


In [3]:
data = pd.DataFrame(json_data)

In [4]:
data = data.rename(columns={'code': 'code', 'nl': 'comment'})

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10168 entries, 0 to 10167
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   code     10168 non-null  object
 1   comment  10168 non-null  object
dtypes: object(2)
memory usage: 159.0+ KB


In [6]:
data['code'] = data['code'].apply(lambda x: "public class Main\n{\n" + x + "}")

In [7]:
def tokenize_code(code):
    tokens = javalang.tokenizer.tokenize(code)
    t = []
    # Tüm tokenleri döngü ile dönebiliriz
    for token in tokens:
        t.append(token.value)
    return t

In [8]:
data_token = []

In [9]:
for d in data["code"]:
    token = tokenize_code(d)
    data_token.append(token)

In [10]:
model = Word2Vec(sentences=list(data_token), vector_size=100, window=10, min_count=1, workers=4)


In [11]:
# Save the trained model
model.save("code_word2vec.model")

In [12]:
# Load the trained model
model = Word2Vec.load("code_word2vec.model")


In [13]:
def generate_embeddings(tokens):
    embeddings = [model.wv[token] for token in tokens]
    return np.mean(embeddings, axis=0)

In [14]:
data["tokens"] = data_token

In [15]:
data["code_embedding"] = data["tokens"].apply(generate_embeddings)

In [16]:
def extract_features(code):
    tree = javalang.parse.parse(code)
    
    function_calls = set()
    parameter_count = 0
    method_name = None
    is_constructor = False
    return_type = None
    
    for _, node in tree.filter(javalang.tree.MethodInvocation):
        function_calls.add(node.member)

    for _, node in tree.filter(javalang.tree.ConstructorDeclaration):
        is_constructor = True
        method_name = node.name
        parameter_count = len(node.parameters)
        
    for _, node in tree.filter(javalang.tree.MethodDeclaration):
        method_name = node.name
        parameter_count = max(len(node.parameters), parameter_count)
        return_type = node.return_type.name if node.return_type else None
        
    return function_calls, is_constructor, parameter_count, method_name, return_type

In [17]:
def extract(code):
    function_calls, is_constructor, parameter_count, method_name, return_type = extract_features(code)
    return function_calls, is_constructor, parameter_count, method_name, return_type

In [18]:
data[["function_calls", "is_constructor", "parameter_count", "method_name", "return_type"]] = data["code"].apply(
    lambda x: pd.Series(extract(x)))


In [19]:
method_name_lengths = list(map(lambda x: len(x) if x is not None else float('nan'), data["method_name"]))
data["method_name_length"] = pd.Series(method_name_lengths)

In [20]:
return_type_mapping = {'void': 0, 'int': 1, 'float': 2, 'double': 3, 'boolean': 4, 'String': 5, 'custom': 6}

In [21]:
def map_return_type(return_type):
    if return_type is None:
        return None
    elif return_type in return_type_mapping:
        return return_type_mapping[return_type]
    else:
        return 7

In [22]:
data["return_type_numeric"] = data['return_type'].apply(map_return_type)

In [None]:
def split_camel_case(s):
    # CamelCase'deki büyük harflerden önce boşluk ekleyerek ayrıştırır
    s = re.sub('([a-z0-9])([A-Z])', r'\1 \2', s)
    # '_' işareti yerine boşluk ekleyerek ayrıştırır
    s = s.replace('_', ' ')
    # Ayrıştırılmış kelimeyi döndürür
    return s

In [None]:
data["method_name_string"] = data["method_name"].apply(split_camel_case)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10168 entries, 0 to 10167
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   code                 10168 non-null  object 
 1   comment              10168 non-null  object 
 2   tokens               10168 non-null  object 
 3   code_embedding       10168 non-null  object 
 4   function_calls       10168 non-null  object 
 5   is_constructor       10168 non-null  bool   
 6   parameter_count      10168 non-null  int64  
 7   method_name          10168 non-null  object 
 8   return_type          4615 non-null   object 
 9   method_name_length   10168 non-null  int64  
 10  return_type_numeric  4615 non-null   float64
dtypes: bool(1), float64(1), int64(2), object(7)
memory usage: 804.4+ KB


In [24]:
data.to_csv("./preprocessed_dataset_with_custom_embeddings.csv", index=False)