In [18]:
import pandas as pd
import json
import os

DATA_DIR = "/data/ncc_data"
# download data and extract it in raw dir
CODE_SEARCH_NET_FENG_DIR = os.path.join(DATA_DIR, "code_search_net_feng", "raw") 

# LANGUAGE = ["go", "java", "javascript", "php", "python", "ruby"]
# We use Java language
LANGUAGE = ["java"]

DATA_SPLIT = ["valid", "test", "train"]

def temp_format(x):
    return "{:,}".format(x)

def add_table_star(table_path: str):
    with open(table_path, "r") as f:
        file_data = f.read()

    file_data = file_data.replace("\\begin{table}", "\\begin{table*}")
    file_data = file_data.replace("\\end{table}", "\\end{table*}")

    with open(table_path, "w") as f:
        f.write(file_data)

def change_table_font(table_path: str):
    with open(table_path, "r") as f:
        file_data = f.read()

    file_data = file_data.replace("\\centering", "\\centering\n\\small")

    with open(table_path, "w") as f:
        f.write(file_data)

def count_data(language_list, split_list):
    table = []
    
    for lang in language_list:
        for split in split_list:
            data_len = pd.DataFrame()
            data_file = os.path.join(CODE_SEARCH_NET_FENG_DIR, lang, f"{split}.jsonl")
            with open(data_file) as f:
                data = pd.read_json(f, lines=True)

                data_len["Code Length"] = data["code_tokens"].apply(len)
                data_len["Doc Length"] = data["docstring_tokens"].apply(len)
                description = data_len.describe()
                description = description.T
                description["split"] = split
                description.index.name = 'type'
                description = description.reset_index()
                table.append(description)
    final_table = pd.concat(table)

    
    return final_table


if __name__ == "__main__":
    """
    python -m data.draw_table
    """
    # data = count_data()
    table = count_data(LANGUAGE, DATA_SPLIT)
    display(table)

    table[["count", "min", "25%", "50%", "75%", "max"]] = table[["count", "min", "25%", "50%", "75%", "max"]].astype("int64")

    table["count"] = table["count"].map(lambda x: format(x, ","))
    multi_index = pd.MultiIndex.from_frame(table[["split", "count", "type"]], names=["Data Split", "Count", "Type"])
    table.drop(["split", "count", "type"], axis=1, inplace=True)
    table.index = multi_index

    columns = ["Mean", "Std.", "Minimum", "1st Qua.", "Median", "3rd Qua.", "Maximum"]
    table.columns = columns
    display(table)





Unnamed: 0,type,count,mean,std,min,25%,50%,75%,max,split
0,Code Length,5183.0,88.684353,74.809153,18.0,40.0,61.0,108.0,500.0,valid
1,Doc Length,5183.0,13.390122,10.158273,3.0,7.0,11.0,16.0,147.0,valid
0,Code Length,10955.0,97.811684,80.573228,20.0,45.0,69.0,120.0,511.0,test
1,Doc Length,10955.0,12.706162,9.588451,3.0,7.0,10.0,15.0,111.0,test
0,Code Length,164923.0,98.681524,82.89747,17.0,44.0,69.0,121.0,512.0,train
1,Doc Length,164923.0,13.251778,10.028045,3.0,7.0,10.0,16.0,173.0,train


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean,Std.,Minimum,1st Qua.,Median,3rd Qua.,Maximum
Data Split,Count,Type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
valid,5183,Code Length,88.684353,74.809153,18,40,61,108,500
valid,5183,Doc Length,13.390122,10.158273,3,7,11,16,147
test,10955,Code Length,97.811684,80.573228,20,45,69,120,511
test,10955,Doc Length,12.706162,9.588451,3,7,10,15,111
train,164923,Code Length,98.681524,82.89747,17,44,69,121,512
train,164923,Doc Length,13.251778,10.028045,3,7,10,16,173


In [19]:


latex = table.to_latex(
    float_format="%.2f",
    label="tab:count_data",
    caption="Data count and statistics (the minimum, the first quarter (25\%), and median (50\%), the third quarter (75\%), and the maximum number) of code tokens and documentation words for each data split.",
)

table_dir = "/mnt/c/Users/38013/OneDrive/Paper/LatexParameterDescriptionGeneration/tables"
table_path = os.path.join(table_dir, "table_count_data.tex")

with open(table_path, "w") as f:
    f.write(latex)

add_table_star(table_path)
print(latex)
print(table_path)

\begin{table}
\centering
\caption{Data count and statistics of code tokens and documentation tokens for each data split.}
\label{tab:count_data}
\begin{tabular}{lllrrrrrrr}
\toprule
      &         &            &  Mean &  Std. &  Minimum &  1st Qua. &  Median &  3rd Qua. &  Maximum \\
Data Split & Count & Type &       &       &          &           &         &           &          \\
\midrule
valid & 5,183 & Code Length & 88.68 & 74.81 &       18 &        40 &      61 &       108 &      500 \\
      &         & Doc Length & 13.39 & 10.16 &        3 &         7 &      11 &        16 &      147 \\
test & 10,955 & Code Length & 97.81 & 80.57 &       20 &        45 &      69 &       120 &      511 \\
      &         & Doc Length & 12.71 &  9.59 &        3 &         7 &      10 &        15 &      111 \\
train & 164,923 & Code Length & 98.68 & 82.90 &       17 &        44 &      69 &       121 &      512 \\
      &         & Doc Length & 13.25 & 10.03 &        3 &         7 &      10 &      

  latex = table.to_latex(


In [143]:
from IPython.display import display
# Valid Data
data_file = os.path.join(CODE_SEARCH_NET_FENG_DIR, f"java/test.jsonl")
with open(data_file) as f:
    valid_data = pd.read_json(f, lines=True)
display(valid_data)


Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition
0,ReactiveX/RxJava,src/main/java/io/reactivex/internal/observers/...,QueueDrainObserver.fastPathOrderedEmit,protected final void fastPathOrderedEmit(U val...,java,protected final void fastPathOrderedEmit(U val...,"[protected, final, void, fastPathOrderedEmit, ...",Makes sure the fast-path emits in order.\n@par...,"[Makes, sure, the, fast, -, path, emits, in, o...",ac84182aa2bd866b53e01c8e3fe99683b882c60e,https://github.com/ReactiveX/RxJava/blob/ac841...,test
1,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,Observable.wrap,@CheckReturnValue\n @SchedulerSupport(Sched...,java,@CheckReturnValue\n @SchedulerSupport(Sched...,"[@, SchedulerSupport, (, SchedulerSupport, ., ...",Wraps an ObservableSource into an Observable i...,"[Wraps, an, ObservableSource, into, an, Observ...",ac84182aa2bd866b53e01c8e3fe99683b882c60e,https://github.com/ReactiveX/RxJava/blob/ac841...,test
2,ReactiveX/RxJava,src/main/java/io/reactivex/Observable.java,Observable.sorted,@CheckReturnValue\n @SchedulerSupport(Sched...,java,@CheckReturnValue\n @SchedulerSupport(Sched...,"[@, SchedulerSupport, (, SchedulerSupport, ., ...",Returns an Observable that emits the events em...,"[Returns, an, Observable, that, emits, the, ev...",ac84182aa2bd866b53e01c8e3fe99683b882c60e,https://github.com/ReactiveX/RxJava/blob/ac841...,test
3,ReactiveX/RxJava,src/main/java/io/reactivex/internal/operators/...,ObservableReplay.observeOn,public static <T> ConnectableObservable<T> obs...,java,public static <T> ConnectableObservable<T> obs...,"[public, static, <, T, >, ConnectableObservabl...",Child Observers will observe the events of the...,"[Child, Observers, will, observe, the, events,...",ac84182aa2bd866b53e01c8e3fe99683b882c60e,https://github.com/ReactiveX/RxJava/blob/ac841...,test
4,ReactiveX/RxJava,src/main/java/io/reactivex/processors/UnicastP...,UnicastProcessor.create,@CheckReturnValue\n @NonNull\n public st...,java,@CheckReturnValue\n @NonNull\n public st...,"[public, static, <, T, >, UnicastProcessor, <,...",Creates an UnicastProcessor with the given int...,"[Creates, an, UnicastProcessor, with, the, giv...",ac84182aa2bd866b53e01c8e3fe99683b882c60e,https://github.com/ReactiveX/RxJava/blob/ac841...,test
...,...,...,...,...,...,...,...,...,...,...,...,...
10950,ieb/sparsemapcontent,core/src/main/java/org/sakaiproject/nakamura/l...,Types.loadFromStream,"public static void loadFromStream(String key, ...",java,"public static void loadFromStream(String key, ...","[public, static, void, loadFromStream, (, Stri...",Load a Map from binary stream\n\n@param output...,"[Load, a, Map, from, binary, stream]",0570fdf868adbbf7734790cbc09fe66480e5f2dc,https://github.com/ieb/sparsemapcontent/blob/0...,test
10951,intellimate/IzouSDK,src/main/java/org/intellimate/izou/sdk/addon/A...,AddOn.register,@Override\n public void register() {\n ...,java,@Override\n public void register() {\n ...,"[public, void, register, (, ), {, prepare, (, ...",This method is used to register the modules,"[This, method, is, used, to, register, the, mo...",bc8705ad48a6ca12a722f2b787be435949fa5d08,https://github.com/intellimate/IzouSDK/blob/bc...,test
10952,intellimate/IzouSDK,src/main/java/org/intellimate/izou/sdk/addon/A...,AddOn.initAddOn,@Override\n public void initAddOn(org.intel...,java,@Override\n public void initAddOn(org.intel...,"[public, void, initAddOn, (, org, ., intellima...",Internal initiation of addOn - fake constructo...,"[Internal, initiation, of, addOn, -, fake, con...",bc8705ad48a6ca12a722f2b787be435949fa5d08,https://github.com/intellimate/IzouSDK/blob/bc...,test
10953,abmargb/jamppa,src/main/java/org/jivesoftware/smack/Reconnect...,ReconnectionManager.notifyReconnectionFailed,protected void notifyReconnectionFailed(Except...,java,protected void notifyReconnectionFailed(Except...,"[protected, void, notifyReconnectionFailed, (,...",Fires listeners when a reconnection attempt ha...,"[Fires, listeners, when, a, reconnection, atte...",76f253239923df40904c462e3b88e7278b36b282,https://github.com/abmargb/jamppa/blob/76f2532...,test


In [111]:
data = valid_data
data["code_len"] = data["code_tokens"].apply(len)
data["doc_len"] = data["docstring_tokens"].apply(len)
count_code_len = data["code_len"].describe()
count_doc_len = data["doc_len"].describe()
df = data.describe()
display(df)

Unnamed: 0,code_len,doc_len
count,5183.0,5183.0
mean,88.684353,13.390122
std,74.809153,10.158273
min,18.0,3.0
25%,40.0,7.0
50%,61.0,11.0
75%,108.0,16.0
max,500.0,147.0


In [128]:
table = df.copy()
table.columns = pd.MultiIndex.from_product([["Valid"], df.columns])

# display(table.T)
pd.concat([table, table], axis=1)

Unnamed: 0_level_0,Valid,Valid,Valid,Valid
Unnamed: 0_level_1,code_len,doc_len,code_len.1,doc_len.1
count,5183.0,5183.0,5183.0,5183.0
mean,88.684353,13.390122,88.684353,13.390122
std,74.809153,10.158273,74.809153,10.158273
min,18.0,3.0,18.0,3.0
25%,40.0,7.0,40.0,7.0
50%,61.0,11.0,61.0,11.0
75%,108.0,16.0,108.0,16.0
max,500.0,147.0,500.0,147.0
