In [13]:
from java.java8.JavaLexer import JavaLexer
from antlr4 import *

def count_java_tokens_antlr4(code, quiet=False):
    """Return the number of tokens the ANTLR4 Java lexer emits for ``code``.

    Args:
        code: Java source text to tokenize.
        quiet: When True, detach the lexer's error listeners so that
            "token recognition error" messages are not printed for input the
            grammar cannot tokenize. Defaults to False, which keeps the
            lexer's default error reporting.

    Returns:
        int: Length of the list returned by ``lexer.getAllTokens()``.
    """
    lexer = JavaLexer(InputStream(code))
    if quiet:
        # Each unrecognized character run is reported on its own line, which
        # floods notebook output when this runs over a whole dataset.
        lexer.removeErrorListeners()
    # getAllTokens() already materializes every token, so len() replaces the
    # hand-written counter loop.
    return len(lexer.getAllTokens())

In [20]:
# Default half-open [low, high) token-count ranges plotted by cr_and_num_token.
_DEFAULT_TOKEN_BUCKETS = [(3, 10), (10, 33), (33, 50), (50, 102), (102, 200), (200, 400), (400, 800), (800, 1300)]


def _bucket_compile_stats(lengths, compiled_flags, buckets):
    """Count total and compilable functions per [low, high) bucket.

    Returns ``(stats, unbucketed)`` where ``stats[bucket]`` is
    ``[total, compilable]`` for every bucket (zeros when nothing matched) and
    ``unbucketed`` is the number of lengths that matched no bucket.
    """
    # Pre-seed every bucket so an empty bucket yields [0, 0] instead of a KeyError.
    stats = {bucket: [0, 0] for bucket in buckets}
    unbucketed = 0
    for length, compiled in zip(lengths, compiled_flags):
        for bucket in buckets:
            low, high = bucket
            if low <= length < high:
                stats[bucket][0] += 1
                if compiled:
                    stats[bucket][1] += 1
                # A length is attributed to the first bucket that contains it.
                break
        else:
            unbucketed += 1
    return stats, unbucketed


def cr_and_num_token(dataset_url: str, compile_info_col: str, buckets=None) -> None:
    """Plot compilable vs. total function counts per token-length bucket.

    Reads the parquet dataset, tokenizes every ``func_body`` with
    ``count_java_tokens_antlr4``, prints ``describe()`` of the token counts,
    groups the functions into token-count buckets and shows a bar chart of
    the per-bucket counts with the compilable percentage overlaid as a line.

    Args:
        dataset_url: Path/URL of the parquet file (read with fastparquet).
        compile_info_col: Column whose value equals
            ``"<COMPILED_SUCCESSFULLY>"`` for functions that compiled.
        buckets: Optional iterable of ``(low, high)`` pairs, each treated as
            the half-open range ``low <= n_tokens < high``. Defaults to the
            ranges this function has always used.

    Raises:
        ValueError: If ``buckets`` is empty.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd

    buckets = [tuple(bucket) for bucket in (_DEFAULT_TOKEN_BUCKETS if buckets is None else buckets)]
    if not buckets:
        raise ValueError("buckets must contain at least one (low, high) range")

    df = pd.read_parquet(dataset_url, "fastparquet")
    df["len_func_body"] = df["func_body"].apply(count_java_tokens_antlr4)
    print(df["len_func_body"].describe())

    compiled_flags = df[compile_info_col] == "<COMPILED_SUCCESSFULLY>"
    stats, unbucketed = _bucket_compile_stats(df["len_func_body"], compiled_flags, buckets)
    if unbucketed:
        # Make dropped rows visible instead of silently leaving them out of the chart.
        print(f"{unbucketed} function(s) fall outside every bucket and are not plotted")

    # Data
    intervals = [str(bucket) for bucket in buckets]
    compilables = [stats[bucket][1] for bucket in buckets]
    totals = [stats[bucket][0] for bucket in buckets]
    # An empty bucket has no meaningful rate; report 0% rather than dividing by zero.
    percentages = [
        compiled / total * 100 if total else 0.0
        for compiled, total in zip(compilables, totals)
    ]

    # Plot
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax2 = ax1.twinx()

    bar_width = 0.35
    x = np.arange(len(intervals))

    bars1 = ax1.bar(x - bar_width/2, compilables, bar_width, color='skyblue', label='#Compilable')
    bars2 = ax1.bar(x + bar_width/2, totals, bar_width, color='orange', label='#Func')
    # Write the compilable count above each '#Compilable' bar
    for bar, compilable in zip(bars1, compilables):
        ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, str(compilable), ha='center')

    # Write the total count above each '#Func' bar
    for bar, value in zip(bars2, totals):
        ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, str(value), ha='center', color='black')

    ax2.plot(x, percentages, marker='o', color='blue', label='Percentage', linestyle='-')
    for i, percentage in enumerate(percentages):
        ax2.text(i, percentage, f'{percentage:.2f}%', ha='right', va='bottom')

    ax1.set_xlabel('Function Body Length')
    ax1.set_ylabel('Compilable / Num_Func')
    ax2.set_ylabel('Percentage')
    plt.title('Distribution')
    ax1.set_xticks(x, intervals, rotation=45, ha='right')
    # Merge the handles of both axes so the percentage line is listed in the legend too.
    bar_handles, bar_labels = ax1.get_legend_handles_labels()
    line_handles, line_labels = ax2.get_legend_handles_labels()
    ax1.legend(bar_handles + line_handles, bar_labels + line_labels, loc="center right")
    # compilable <= total in every bucket, so the tallest bar is always a total.
    ax1.set_ylim(0, max(totals) + 100)
    ax2.set_ylim(0, 100)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

In [21]:
# Token-length / compile-rate chart for the CodeLlama v1 baseline outputs.
cr_and_num_token(
    dataset_url="/home/hieuvd/lvdthieu/CodeGen/java_data/data/compile_info_codellama_v1.parquet",
    compile_info_col="compile_info_filled_file_baseline_output",
)

line 2:67 token recognition error at: '";\n'
line 14:35 token recognition error at: ''''
line 14:37 token recognition error at: '')"'
line 24:31 token recognition error at: ''''
line 24:33 token recognition error at: '')"'
line 7:46 token recognition error at: '");\n'
line 2:36 token recognition error at: ''ab'
line 2:40 token recognition error at: ''")'
line 2:43 token recognition error at: '";\n'
line 19:23 token recognition error at: ''ab'
line 19:27 token recognition error at: ''""'
line 2:72 token recognition error at: ''''
line 2:74 token recognition error at: '' +'
line 2:100 token recognition error at: ''''
line 2:102 token recognition error at: ''\n'
line 3:44 token recognition error at: ''''
line 3:46 token recognition error at: '' +'
line 3:68 token recognition error at: ''''
line 3:70 token recognition error at: '' +'
line 2:95 token recognition error at: ''''
line 2:97 token recognition error at: '' +'
line 3:38 token recognition error at: ''''
line 3:40 token recognition 

count    3629.000000
mean       79.934417
std       118.195870
min         3.000000
25%        10.000000
50%        33.000000
75%       102.000000
max      1298.000000
Name: len_func_body, dtype: float64


line 4:46 token recognition error at: ''''
line 4:48 token recognition error at: '' +'
line 5:42 token recognition error at: ''''
line 5:44 token recognition error at: '' +'
line 3:40 token recognition error at: ''''
line 3:42 token recognition error at: '' +'
line 4:46 token recognition error at: ''''
line 4:48 token recognition error at: '' +'
line 3:38 token recognition error at: ''''
line 3:40 token recognition error at: '' +'
line 12:56 token recognition error at: ''''
line 12:58 token recognition error at: '' +'
line 13:58 token recognition error at: ''''
line 13:60 token recognition error at: '' +'
line 3:34 token recognition error at: ''''
line 3:36 token recognition error at: '' +'
line 4:36 token recognition error at: ''''
line 4:38 token recognition error at: '' +'
line 4:52 token recognition error at: ''''
line 4:54 token recognition error at: '' +'
line 4:36 token recognition error at: ''''
line 4:38 token recognition error at: '' +'
line 5:52 token recognition error at: '

In [1]:
def check_compilable_rate(dataset_url: str, compile_info_col: str="compile_info_filled_file_baseline_output") -> float:
    """Return the fraction of rows whose compile-info column marks success.

    Args:
        dataset_url: Path/URL of the parquet file (read with fastparquet).
        compile_info_col: Column compared against ``"<COMPILED_SUCCESSFULLY>"``.

    Returns:
        float: ``compiled_rows / total_rows`` in ``[0.0, 1.0]``; ``0.0`` when
        the dataset has no rows (instead of raising ``ZeroDivisionError``).
    """
    import pandas as pd
    df = pd.read_parquet(dataset_url, "fastparquet")
    total = len(df)
    if total == 0:
        return 0.0
    compiled = df[compile_info_col] == "<COMPILED_SUCCESSFULLY>"
    # int() keeps the result a plain Python float rather than a numpy scalar.
    return int(compiled.sum()) / total

In [4]:
# Compile rate of the fine-tuned DeepSeek v2 outputs.
print(
    check_compilable_rate(
        dataset_url="/home/hieuvd/lvdthieu/CodeGen/java_data/data/compile_info_deepseek_v2.parquet",
        compile_info_col="compile_info_filled_file_finetune_output",
    )
)

0.9754753375585561


In [5]:
with open("/home/hieuvd/lvdthieu/CodeGen/java_data/data/java_file.txt", "r") as f:
    # Read line by line and skip blank entries: split('\n') produces a trailing
    # empty string whenever the file ends with a newline, which inflates the
    # count and could later be picked as a bogus file path.
    java_files = [line.rstrip("\n") for line in f if line.strip()]

print(len(java_files))

32491


In [6]:
import random

# Sample without replacement so the same file cannot be selected twice
# (random.choices draws with replacement), cap k at the population size so a
# short list does not raise, and seed the generator so the selection is
# reproducible across kernel restarts.
random_java_files = random.Random(42).sample(java_files, k=min(1000, len(java_files)))

In [8]:
with open("/home/hieuvd/lvdthieu/CodeGen/java_data/data/random_java_file.txt", "w") as f:
    # Persist the selection, one newline-terminated path per line.
    f.writelines(path + '\n' for path in random_java_files)


In [18]:
import pandas as pd

# Load the special dataset and attach the ANTLR token count of every function body.
df = pd.read_parquet(
    "/home/hieuvd/lvdthieu/CodeGen/java_data/data/special_dataset.parquet"
).assign(
    len_func_body=lambda frame: frame["func_body"].map(count_java_tokens_antlr4)
)

In [16]:
# Echo the frame as the cell's last expression so the notebook renders it as a table.
df

Unnamed: 0,proj_name,relative_path,class_name,func_name,masked_class,func_body,len_func_body
0,flowable_flowable-engine,flowable-engine/modules/flowable-cmmn-rest/src...,PlanItemInstanceCollectionResource,getPlanItemInstances,class PlanItemInstanceCollectionResource exten...,\n // Populate query based on request\n...,579
1,TheAlgorithms_Java,Java/src/main/java/com/thealgorithms/dynamicpr...,KnapsackMemoization,knapSack,class KnapsackMemoization {\n\n int knapSac...,\n\n // Declare the table dynamically\n...,130
2,stanfordnlp_CoreNLP,CoreNLP/src/edu/stanford/nlp/process/DocumentP...,XMLIterator,usage,class XMLIterator implements Iterator<List<Has...,\n StringBuilder sb = new StringBuilder();\...,201
3,TheAlgorithms_Java,Java/src/main/java/com/thealgorithms/misc/Sort...,Sort012D,sort012,class Sort012D {\n\n public static void mai...,\n int l = 0;\n int h = a.length...,237
4,stanfordnlp_CoreNLP,CoreNLP/test/src/edu/stanford/nlp/trees/intern...,ChineseUtilsTest,testNormalize,class ChineseUtilsTest extends TestCase {\n\n ...,"\n String input = ""Hello Ｅｎｇｌｉｓｈ - 你好\u300...",124
...,...,...,...,...,...,...,...
767,TheAlgorithms_Java,Java/src/main/java/com/thealgorithms/bitmanipu...,IsPowerTwo,isPowerTwo,class IsPowerTwo {\n public static boolean ...,\n if (number <= 0) {\n retu...,48
768,NLPchina_ansj_seg,ansj_seg/src/test/java/org/ansj/app/keyword/Ke...,KeyWordTest,setScore,class KeyWordTest {\n\n @Test\n public v...,"\n Keyword keyword = new Keyword(""jack""...",56
769,questdb_questdb,questdb/core/src/main/java/io/questdb/griffin/...,NullIfIntFunction,getInt,class NullIfIntFunction extends IntFunction im...,\n return intFunc1.getInt(rec) == i...,35
770,TheAlgorithms_Java,Java/src/main/java/com/thealgorithms/maths/Aut...,AutomorphicNumber,isAutomorphic,class AutomorphicNumber {\n\n /**\n * A...,\n if (n < 0) return false;\n lo...,110


In [19]:
# Summary statistics of the per-function token counts stored in len_func_body.
df["len_func_body"].describe()

count     772.000000
mean      161.498705
std       297.727321
min         3.000000
25%        23.000000
50%        74.500000
75%       182.250000
max      3941.000000
Name: len_func_body, dtype: float64