### Java --> Python Clone Detection and Code Search Analysis

In [1]:
import pandas as pd
import numpy as np

##### Data Configuration

In [2]:
# CSV whose rows are loaded into true_df and labelled 1 by evaluate_ranking_metrics.
true_filename = "javapython_gptclonebench.csv"
# CSV whose rows are loaded into false_df and labelled 0 by evaluate_ranking_metrics.
false_filename = "javapython_gptclonebenchNonClone.csv"

##### Precision Analysis

In [3]:
def compute_precision(retrieved):
    """Return the fraction of retrieved rows whose ``label`` is positive.

    ``retrieved`` is the subset of candidates that passed the similarity
    threshold. An empty subset yields 0.0 rather than dividing by zero.
    """
    n_retrieved = len(retrieved)
    if not n_retrieved:
        return 0.0
    n_relevant = retrieved["label"].sum()
    return n_relevant / n_retrieved

##### Recall Analysis

In [4]:
def compute_recall(group, threshold):
    """Return the share of positive rows in ``group`` that are retrieved.

    A row is retrieved when its ``similarity`` is at least ``threshold``.
    Groups without any positive row yield 0.0.
    """
    n_positive = group["label"].sum()
    if n_positive == 0:
        return 0.0
    passes = group["similarity"] >= threshold
    n_hit = group.loc[passes, "label"].sum()
    return n_hit / n_positive

##### F-1 Score Analysis

In [5]:
def compute_f1(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    When both inputs sum to zero the score is defined as 0.0 to avoid a
    division by zero.
    """
    pr_total = precision + recall
    if pr_total == 0:
        return 0.0
    return 2 * precision * recall / pr_total

#### MAP (Mean Average Precision)

In [6]:
def compute_map(group):
    """Return the average precision of a ranked candidate list.

    ``group`` is expected to be ordered best-first already; the function
    does not sort it. Rank is the 1-based row position, so the result does
    not depend on the frame's index labels.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain a ``label`` column where 1 marks a relevant row.

    Returns
    -------
    float
        Sum of precision-at-rank over the relevant rows, divided by the
        number of relevant rows; 0.0 when there are none.
    """
    total_positives = group["label"].sum()
    if total_positives == 0:
        return 0.0

    ap_sum = 0.0
    correct = 0
    # enumerate gives the positional rank; the index label may be arbitrary
    # (e.g. after filtering without reset_index).
    for rank, label in enumerate(group["label"], start=1):
        if label == 1:
            correct += 1
            ap_sum += correct / rank
    return ap_sum / total_positives

##### MRR (Mean Reciprocal Rank)

In [7]:
def compute_mrr(group):
    """Return the reciprocal rank of the first relevant row in ``group``.

    ``group`` is expected to be ordered best-first already. Rank is the
    1-based row position, independent of the frame's index labels.

    Returns
    -------
    float
        ``1 / rank`` of the first row whose ``label`` equals 1, or 0.0 when
        no row is relevant.
    """
    # Positional rank via enumerate: the index label is not a reliable rank.
    for rank, label in enumerate(group["label"], start=1):
        if label == 1:
            return 1 / rank
    return 0.0

##### Top@K Precision

In [8]:
def compute_top_k_precision(group, k):
    """Return the fraction of relevant rows among the first ``k`` rows.

    ``group`` is expected to be ordered best-first already.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain a ``label`` column where 1 marks a relevant row.
    k : int
        Number of leading rows to inspect; must be at least 1.

    Returns
    -------
    float
        ``sum(label of first k rows) / k``, or NaN when the group has fewer
        than ``k`` rows, so that NaN-skipping averages leave such groups out.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 (k=0 would divide by zero and a negative
        k would make ``head`` drop rows from the end instead).
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    top_k = group.head(k)
    if len(top_k) < k:
        return np.nan
    return top_k["label"].sum() / k

##### Evaluating Ranking Metrics

In [None]:
def evaluate_ranking_metrics(true_df, false_df, model_name="final_similarity", threshold=0.5):
    """Compute per-source-file retrieval metrics for one similarity column.

    Rows of ``true_df`` are labelled 1 and rows of ``false_df`` are labelled
    0. Column names are lower-cased, the frames are concatenated, and rows
    are grouped by the ``filename`` column (renamed to ``src_file``). Only
    src_files that occur in ``true_df`` are evaluated; rows of ``false_df``
    whose filename never occurs in ``true_df`` are dropped. Within each
    group the rows are ranked by descending similarity.

    Parameters
    ----------
    true_df, false_df : pandas.DataFrame
        Must provide a ``filename`` column and the column named by
        ``model_name`` (column names are compared case-insensitively).
        Neither frame is modified.
    model_name : str
        Name of the similarity column to evaluate.
    threshold : float
        A row counts as retrieved when its similarity is >= ``threshold``.

    Returns
    -------
    pandas.DataFrame
        One row per src_file with the columns ``src_file``, ``total``,
        ``positives``, ``retrieved``, ``true_positives``, ``precision``,
        ``recall``, ``f1_score``, ``map``, ``mrr``, ``top@1``, ``top@2`` and
        ``top@3``. The columns are present even when no group is evaluated.

    Raises
    ------
    KeyError
        If the filename column or the requested similarity column is missing.
    """
    result_columns = [
        "src_file", "total", "positives", "retrieved", "true_positives",
        "precision", "recall", "f1_score", "map", "mrr",
        "top@1", "top@2", "top@3",
    ]

    def _labelled(frame, label, origin):
        # Copy so the caller's frame stays untouched. Lower-casing happens
        # before the assignments so "label"/"origin" replace any same-named
        # input column regardless of its original case.
        out = frame.copy()
        out.columns = [col.lower() for col in out.columns]
        out["label"] = label
        out["origin"] = origin
        return out

    # Columns are lower-cased, so the lookup key has to be lower-cased too.
    score_col = model_name.lower()

    df = pd.concat(
        [_labelled(true_df, 1, "true"), _labelled(false_df, 0, "false")],
        ignore_index=True,
    )

    if "filename" not in df.columns and "src_file" not in df.columns:
        raise KeyError("input frames must contain a 'filename' column")
    if score_col not in df.columns:
        raise KeyError(f"similarity column {model_name!r} not found in the input frames")

    # An unrelated pre-existing "similarity" column would collide with the
    # renamed score column and make df["similarity"] ambiguous.
    if score_col != "similarity" and "similarity" in df.columns:
        df = df.drop(columns="similarity")
    df = df.rename(columns={"filename": "src_file", score_col: "similarity"})

    # Keep only src_files that have at least one true (label == 1) sample.
    true_src_files = df.loc[df["origin"] == "true", "src_file"].unique()
    df = df[df["src_file"].isin(true_src_files)]

    results = []
    for src_file, group in df.groupby("src_file"):
        # A stable sort makes the ranking deterministic when scores tie.
        # reset_index gives the ranked rows positions 0..n-1.
        ranked = group.sort_values(
            by="similarity", ascending=False, kind="mergesort"
        ).reset_index(drop=True)
        retrieved = ranked[ranked["similarity"] >= threshold]

        precision = compute_precision(retrieved)
        recall = compute_recall(ranked, threshold)

        record = {
            "src_file": src_file,
            "total": len(ranked),
            "positives": ranked["label"].sum(),
            "retrieved": len(retrieved),
            "true_positives": retrieved["label"].sum(),
            "precision": precision,
            "recall": recall,
            "f1_score": compute_f1(precision, recall),
            "map": compute_map(ranked),
            "mrr": compute_mrr(ranked),
        }
        # Top@K precision is NaN for groups with fewer than K rows.
        for k in (1, 2, 3):
            record[f"top@{k}"] = compute_top_k_precision(ranked, k)
        results.append(record)

    # Explicit columns keep the schema stable even when `results` is empty.
    return pd.DataFrame(results, columns=result_columns)


##### Summarizing the Group analysis result

In [10]:
def summarize_metrics(per_group_df):
    """Collapse the per-group metrics frame into a single summary row.

    Macro scores are plain means of the per-group ``precision``, ``recall``
    and ``f1_score`` columns. Micro scores are derived from the summed
    ``true_positives``, ``retrieved`` and ``positives`` counts, with 0 used
    whenever a denominator is zero. Ranking columns (``map``, ``mrr``,
    ``top@1..3``) are averaged with ``Series.mean``.

    Returns a one-row ``pandas.DataFrame``.
    """
    def col_mean(name):
        return per_group_df[name].mean()

    macro_precision = col_mean("precision")
    macro_recall = col_mean("recall")
    macro_f1 = col_mean("f1_score")

    # Micro: pool the raw counts over all groups before forming the ratios.
    tp_sum = per_group_df["true_positives"].sum()
    retrieved_sum = per_group_df["retrieved"].sum()
    positives_sum = per_group_df["positives"].sum()

    micro_precision = 0
    if retrieved_sum:
        micro_precision = tp_sum / retrieved_sum

    micro_recall = 0
    if positives_sum:
        micro_recall = tp_sum / positives_sum

    micro_f1 = 0
    if (micro_precision + micro_recall) > 0:
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)

    summary = {
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1,
        "mean_map": col_mean("map"),
        "mean_mrr": col_mean("mrr"),
        "mean_top@1": col_mean("top@1"),
        "mean_top@2": col_mean("top@2"),
        "mean_top@3": col_mean("top@3"),
    }
    return pd.DataFrame([summary])


### GPT Intent Similarity based Clone detection and Code search ranking

In [11]:
# Load the two CSVs named in the configuration cell.
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
# Column to evaluate; evaluate_ranking_metrics renames it to "similarity".
model_name = "gpt_intent_similarity"

##### Threshold 50%

In [12]:
# Per-src_file metrics at a 0.5 similarity cut-off, then reduced to a one-row summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Precision

In [13]:
# Macro = mean of per-src_file precision; micro = summed true positives / summed retrieved rows.
# result_df has one row, so each column prints as a one-element Series.
print("Macro Precision", result_df["macro_precision"])
print("Micro Precision", result_df["micro_precision"])

Macro Precision 0    0.977655
Name: macro_precision, dtype: float64
Micro Precision 0    1.0
Name: micro_precision, dtype: float64


###### Recall

In [14]:
# Macro = mean of per-src_file recall; micro = summed true positives / summed positives.
print("Macro Recall", result_df["macro_recall"])
print("Micro Recall", result_df["micro_recall"])

Macro Recall 0    0.977655
Name: macro_recall, dtype: float64
Micro Recall 0    0.977655
Name: micro_recall, dtype: float64


###### F1-Score

In [15]:
# Macro F1 averages the per-src_file F1 scores; micro F1 is derived from micro precision and micro recall.
print("Macro F1: ", result_df["macro_f1"])
print("Micro F1: ", result_df["micro_f1"])

Macro F1:  0    0.977655
Name: macro_f1, dtype: float64
Micro F1:  0    0.988701
Name: micro_f1, dtype: float64


###### Mean Average Precision

In [16]:
# Mean over src_files of the value returned by compute_map.
print("Mean Average Precision: ", result_df["mean_map"])

Mean Average Precision:  0    1.0
Name: mean_map, dtype: float64


###### Mean Reciprocal Rank

In [17]:
# Mean over src_files of the value returned by compute_mrr.
print("Mean MRR: ", result_df["mean_mrr"])

Mean MRR:  0    1.0
Name: mean_mrr, dtype: float64


###### top@K ranking

In [18]:
# compute_top_k_precision returns NaN for a group with fewer than k rows, and
# Series.mean skips NaN by default, so a NaN mean here indicates that no
# src_file group had at least k rows.
print("top 1: ", result_df["mean_top@1"])
print("top 2: ", result_df["mean_top@2"])
print("top 3: ", result_df["mean_top@3"])

top 1:  0    1.0
Name: mean_top@1, dtype: float64
top 2:  0   NaN
Name: mean_top@2, dtype: float64
top 3:  0   NaN
Name: mean_top@3, dtype: float64


###### Result Evaluation

In [19]:
# One-row summary for gpt_intent_similarity at threshold 0.5.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.977655      0.977655  0.977655              1.0      0.977655   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.988701       1.0       1.0         1.0         NaN         NaN  


###### Reason for low mean_top@3

In [20]:
# Disabled helper, kept for reference: it would return the rows of the
# per-group metrics frame whose "top@3" is below `threshold`, sorted ascending.
# NOTE: a comparison against NaN is False, so groups whose top@3 is NaN
# (fewer than 3 rows in the group) would not be selected by this filter.
#def log_low_top3_cases(metrics_df, threshold=1.0):
#    low_top3 = metrics_df[metrics_df["top@3"] < threshold].copy()
#    return low_top3.sort_values(by="top@3")

#low_top3_df = log_low_top3_cases(group_df, threshold=0.99)
#low_top3_df.to_csv("sample.csv")


##### Threshold 65%

In [21]:
# Same evaluation with the similarity cut-off raised to 0.65.
group_df= evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Result evaluation

In [22]:
# One-row summary for gpt_intent_similarity at threshold 0.65.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.952073      0.952073  0.952073              1.0      0.952073   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.975448       1.0       1.0         1.0         NaN         NaN  


##### Threshold 80%

In [23]:
# Same evaluation with the similarity cut-off raised to 0.80.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.80)
result_df = summarize_metrics(group_df)

###### Result Evaluation

In [24]:
# One-row summary for gpt_intent_similarity at threshold 0.80.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.893782      0.893782  0.893782              1.0      0.893782   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.943912       1.0       1.0         1.0         NaN         NaN  


### GPT Semantic Similarity based Clone detection and Code search ranking

In [25]:
# Re-read the same two CSVs; only the evaluated score column changes in this section.
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
# Column to evaluate; evaluate_ranking_metrics renames it to "similarity".
model_name = "gpt_semantic_similarity"

##### Threshold 50%

In [26]:
# Per-src_file metrics at a 0.5 similarity cut-off, then reduced to a one-row summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [27]:
# One-row summary for gpt_semantic_similarity at threshold 0.5.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.978627      0.978627  0.978627              1.0      0.978627   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.989198       1.0       1.0         1.0         NaN         NaN  


##### Threshold 65%

In [28]:
# Same evaluation with the similarity cut-off raised to 0.65.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [29]:
# One-row summary for gpt_semantic_similarity at threshold 0.65.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.943005      0.943005  0.943005              1.0      0.943005   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.970667       1.0       1.0         1.0         NaN         NaN  


##### Threshold 80%

In [30]:
# Same evaluation with the similarity cut-off raised to 0.8.
group_df  = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.8)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [31]:
# One-row summary for gpt_semantic_similarity at threshold 0.8.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.816386      0.816386  0.816386              1.0      0.816386   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.898912       1.0       1.0         1.0         NaN         NaN  


### Attention-VAE model Similarity based Clone detection and Code search ranking

In [32]:
# Re-read the same two CSVs; only the evaluated score column changes in this section.
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
# Column to evaluate; evaluate_ranking_metrics renames it to "similarity".
model_name = "attn_vae_similarity"

##### Threshold 50%

In [33]:
# Per-src_file metrics at a 0.5 similarity cut-off, then reduced to a one-row summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [34]:
# One-row summary for attn_vae_similarity at threshold 0.5.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.489961      0.489961  0.489961              1.0      0.489961   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.657683       1.0       1.0         1.0         NaN         NaN  


##### Threshold 65%

In [35]:
# Same evaluation with the similarity cut-off raised to 0.65.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [36]:
# One-row summary for attn_vae_similarity at threshold 0.65.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.452073      0.452073  0.452073              1.0      0.452073   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.622658       1.0       1.0         1.0         NaN         NaN  


##### Threshold 80%

In [37]:
# Same evaluation with the similarity cut-off raised to 0.8.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.8)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [38]:
# One-row summary for attn_vae_similarity at threshold 0.8.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.279469      0.279469  0.279469              1.0      0.279469   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.436851       1.0       1.0         1.0         NaN         NaN  


### XLCoCo : Clone Detection and Code Search Performance

In [39]:
# Re-read the same two CSVs; only the evaluated score column changes in this section.
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
# Column to evaluate; evaluate_ranking_metrics renames it to "similarity".
model_name = "final_similarity"

##### Threshold 50%

In [40]:
# Per-src_file metrics at a 0.5 similarity cut-off, then reduced to a one-row summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [41]:
# One-row summary for final_similarity at threshold 0.5.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0          0.98737       0.98737   0.98737              1.0       0.98737   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.993645       1.0       1.0         1.0         NaN         NaN  


##### Threshold 65%

In [42]:
# Same evaluation with the similarity cut-off raised to 0.65.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [43]:
# One-row summary for final_similarity at threshold 0.65.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0          0.98737       0.98737   0.98737              1.0       0.98737   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.993645       1.0       1.0         1.0         NaN         NaN  


##### Threshold 80%

In [44]:
# Same evaluation with the similarity cut-off raised to 0.8.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.8)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [45]:
# One-row summary for final_similarity at threshold 0.8.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.986723      0.986723  0.986723              1.0      0.986723   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.993317       1.0       1.0         1.0         NaN         NaN  
