### Java --> CSharp Clone Detection and Code Search Analysis

In [None]:
import pandas as pd
import numpy as np

##### Data Configure

In [None]:
# Input CSVs: `true_filename` is read into `true_df` (labelled 1 during
# evaluation) and `false_filename` into `false_df` (labelled 0).
true_filename = "JavaCSharpFeatures_baselines.csv"
false_filename = "JavaCSharpNonCloneFeatures_baselines.csv"

##### Precision Analysis

In [None]:
def compute_precision(retrieved):
    """Return the share of retrieved rows that carry a positive label.

    Args:
        retrieved: DataFrame of the rows selected as matches. Its ``label``
            column is summed, so it is expected to hold 1 for a correct
            match and 0 otherwise.

    Returns:
        The ``label`` sum divided by the number of rows, or ``0.0`` when
        no rows were retrieved.
    """
    n_retrieved = len(retrieved)
    if n_retrieved == 0:
        # Nothing retrieved: report 0.0 instead of dividing by zero.
        return 0.0
    hits = retrieved['label'].sum()
    return hits / n_retrieved

##### Recall Analysis

In [None]:
def compute_recall(group, threshold):
    """Return the fraction of positive rows whose score reaches the threshold.

    Args:
        group: DataFrame with a ``label`` column (summed to count positives)
            and a ``similarity`` column (compared against ``threshold``).
        threshold: Minimum ``similarity`` (inclusive) for a row to count as
            retrieved.

    Returns:
        Retrieved positives divided by all positives, or ``0.0`` when the
        group contains no positives.
    """
    labels = group["label"]
    relevant = labels.sum()
    if relevant == 0:
        # No positives in this group: recall is defined as 0.0 here.
        return 0.0
    is_retrieved = group["similarity"] >= threshold
    return labels[is_retrieved].sum() / relevant

##### F-1 Score Analysis

In [None]:
def compute_f1(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        the two values sum to zero.
    """
    denominator = precision + recall
    if denominator == 0:
        # Both inputs are zero (or cancel out): avoid dividing by zero.
        return 0.0
    return 2 * precision * recall / denominator

#### MAP (Mean Average Precision)

In [None]:
def compute_map(group):
    """Return the average precision of one ranked group.

    Rows are taken in their existing order as the ranking (rank 1 is the
    first row). Ranks are positional, so the result does not depend on the
    DataFrame's index labels; a filtered or non-reset index is ranked the
    same as a fresh ``0..n-1`` index.

    Args:
        group: DataFrame already sorted best-first, with a ``label`` column
            where 1 marks a relevant row.

    Returns:
        The mean of precision-at-rank over every relevant row, or ``0.0``
        when the group has no positives.
    """
    total_positives = group["label"].sum()
    if total_positives == 0:
        return 0.0

    ap_sum = 0.0
    correct = 0
    # enumerate() gives the 1-based position in the ranking; the index label
    # yielded by iterrows() is only a valid rank when the index is 0..n-1.
    for rank, label in enumerate(group["label"], start=1):
        if label == 1:
            correct += 1
            ap_sum += correct / rank
    return ap_sum / total_positives

##### MRR (Mean Reciprocal Rank)

In [None]:
def compute_mrr(group):
    """Return the reciprocal rank of the first relevant row in a ranked group.

    Rows are taken in their existing order as the ranking (rank 1 is the
    first row). Ranks are positional, so the result does not depend on the
    DataFrame's index labels.

    Args:
        group: DataFrame already sorted best-first, with a ``label`` column
            where 1 marks a relevant row.

    Returns:
        ``1 / rank`` of the first row whose label is 1, or ``0.0`` when the
        group is empty or has no relevant row.
    """
    if len(group) == 0:
        # No candidates at all, so there is no relevant result to rank.
        return 0.0
    # enumerate() gives the 1-based position; the index label from iterrows()
    # is only a valid rank when the index happens to be 0..n-1.
    for rank, label in enumerate(group["label"], start=1):
        if label == 1:
            return 1 / rank
    return 0.0

##### Top@K Precision

In [8]:
def compute_top_k_precision(group, k):
    """Return precision over the first ``k`` rows of a ranked group.

    Args:
        group: DataFrame already sorted best-first, with a ``label`` column.
        k: Number of leading rows to inspect.

    Returns:
        The ``label`` sum of the first ``k`` rows divided by ``k``, or NaN
        when the group has fewer than ``k`` rows.
    """
    leading = group.head(k)
    if len(leading) < k:
        # Too few candidates to fill the top-k window: mark as undefined.
        return np.nan
    return leading['label'].sum() / k

##### Evaluating Ranking Metrics

In [9]:
def evaluate_ranking_metrics(true_df, false_df, model_name="final_similarity", threshold=0.5):
    """Compute per-source-file retrieval metrics for one similarity column.

    Rows of ``true_df`` are labelled 1 and rows of ``false_df`` 0. The two
    frames are concatenated, grouped by source file and ranked by
    descending similarity. Only source files that appear in ``true_df`` are
    evaluated. The inputs are copied and never mutated.

    Args:
        true_df: DataFrame of positive pairs. Needs a ``src_file`` (or
            ``src_filename``) column and the score column named by
            ``model_name``; column names are matched case-insensitively.
        false_df: DataFrame of negative pairs with the same columns.
        model_name: Name of the score column to evaluate. It is lower-cased
            before lookup because all column names are lower-cased first.
        threshold: Minimum similarity (inclusive) for a row to count as
            retrieved in precision/recall/F1. The ranking metrics (MAP,
            MRR, top@k) do not use it.

    Returns:
        DataFrame with one row per source file and the columns ``src_file``,
        ``total``, ``positives``, ``retrieved``, ``true_positives``,
        ``precision``, ``recall``, ``f1_score``, ``map``, ``mrr``,
        ``top@1``, ``top@2`` and ``top@3``.
    """
    true_df = true_df.copy()
    true_df["label"] = 1
    true_df["origin"] = "true"

    false_df = false_df.copy()
    false_df["label"] = 0
    false_df["origin"] = "false"

    # Combine both DataFrames
    df = pd.concat([true_df, false_df], ignore_index=True)

    # Normalize column names and rename target column. The lookup key must be
    # lower-cased as well; otherwise a mixed-case model_name would never match
    # the already lower-cased columns and "similarity" would be missing.
    df.columns = [col.lower() for col in df.columns]
    df = df.rename(columns={"src_filename": "src_file", model_name.lower(): "similarity"})

    # Keep only src_files that have at least one true (label == 1) sample
    true_src_files = df[df["origin"] == "true"]["src_file"].unique()
    df = df[df["src_file"].isin(true_src_files)].copy()

    results = []

    for src_file, group in df.groupby("src_file"):
        # Rank best-first and reset the index so positions start at 0.
        group = group.sort_values(by="similarity", ascending=False).reset_index(drop=True)
        retrieved = group[group["similarity"] >= threshold]

        precision = compute_precision(retrieved)
        recall = compute_recall(group, threshold)
        f1 = compute_f1(precision, recall)
        map_score = compute_map(group)
        mrr = compute_mrr(group)

        # Top@K Precision
        top1 = compute_top_k_precision(group, 1)
        top2 = compute_top_k_precision(group, 2)
        top3 = compute_top_k_precision(group, 3)

        results.append({
            "src_file": src_file,
            "total": len(group),
            "positives": group['label'].sum(),
            "retrieved": len(retrieved),
            "true_positives": retrieved['label'].sum(),
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "map": map_score,
            "mrr": mrr,
            "top@1": top1,
            "top@2": top2,
            "top@3": top3
        })

    return pd.DataFrame(results)


##### Summarizing the Group analysis result

In [10]:
def summarize_metrics(per_group_df):
    """Collapse per-group metrics into a single-row summary table.

    Macro scores are plain means of the per-group ``precision``, ``recall``
    and ``f1_score`` columns. Micro scores are recomputed from the summed
    ``true_positives``, ``retrieved`` and ``positives`` counts. The ranking
    columns (``map``, ``mrr``, ``top@1``..``top@3``) are averaged.

    Args:
        per_group_df: DataFrame with one row per group and the columns
            listed above.

    Returns:
        A one-row DataFrame with ``macro_*``, ``micro_*`` and ``mean_*``
        columns.
    """
    def _ratio_or_zero(numerator, denominator):
        # A zero denominator yields integer 0 rather than a division error.
        return numerator / denominator if denominator else 0

    summary = {
        "macro_precision": per_group_df["precision"].mean(),
        "macro_recall": per_group_df["recall"].mean(),
        "macro_f1": per_group_df["f1_score"].mean(),
    }

    # Micro: pooled counts across all groups.
    tp_sum = per_group_df["true_positives"].sum()
    retrieved_sum = per_group_df["retrieved"].sum()
    positives_sum = per_group_df["positives"].sum()

    micro_precision = _ratio_or_zero(tp_sum, retrieved_sum)
    micro_recall = _ratio_or_zero(tp_sum, positives_sum)
    if (micro_precision + micro_recall) > 0:
        micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    else:
        micro_f1 = 0

    summary["micro_precision"] = micro_precision
    summary["micro_recall"] = micro_recall
    summary["micro_f1"] = micro_f1

    # Ranking metrics are simply averaged over groups.
    for column in ("map", "mrr", "top@1", "top@2", "top@3"):
        summary["mean_" + column] = per_group_df[column].mean()

    return pd.DataFrame([summary])


### GPT Direct Prompt based Cross-Language Clone detection and Code search ranking

In [None]:
# Load the positive and negative pair tables and pick the score column that
# evaluate_ranking_metrics will rename to "similarity".
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
model_name = "gpt_simple_similarity"

FileNotFoundError: [Errno 2] No such file or directory: 'JavaCSharpNonCloneFeatures_baselines.csv'

##### Threshold 50%

In [None]:
# Per-src_file metrics at a 0.5 similarity cut-off, then the one-row
# macro/micro summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Precision

In [None]:
# result_df has a single row, so each column prints as a one-element Series.
print("Macro Precision", result_df["macro_precision"])
print("Micro Precision", result_df["micro_precision"])

Macro Precision 0    0.993449
Name: macro_precision, dtype: float64
Micro Precision 0    0.999003
Name: micro_precision, dtype: float64


###### Recall

In [None]:
# Macro = mean of per-src_file recall; micro = recall over pooled counts.
print("Macro Recall", result_df["macro_recall"])
print("Micro Recall", result_df["micro_recall"])

Macro Recall 0    0.973686
Name: macro_recall, dtype: float64
Micro Recall 0    0.971568
Name: micro_recall, dtype: float64


###### F1-Score

In [None]:
# Macro F1 averages per-src_file F1; micro F1 is derived from micro precision/recall.
print("Macro F1: ", result_df["macro_f1"])
print("Micro F1: ", result_df["micro_f1"])

Macro F1:  0    0.980428
Name: macro_f1, dtype: float64
Micro F1:  0    0.985094
Name: micro_f1, dtype: float64


###### Mean Average Precision

In [None]:
# Mean over src_files of the per-group average precision.
print("Mean Average Precision: ", result_df["mean_map"])

Mean Average Precision:  0    0.998958
Name: mean_map, dtype: float64


###### Mean Reciprocal Rank

In [None]:
# Mean over src_files of the reciprocal rank of the first positive row.
print("Mean MRR: ", result_df["mean_mrr"])

Mean MRR:  0    0.999546
Name: mean_mrr, dtype: float64


###### top@K ranking

In [None]:
# Mean precision within the first 1, 2 and 3 ranked rows; groups with fewer
# than k rows contribute NaN, which DataFrame.mean() skips.
print("top 1: ", result_df["mean_top@1"])
print("top 2: ", result_df["mean_top@2"])
print("top 3: ", result_df["mean_top@3"])

top 1:  0    0.999319
Name: mean_top@1, dtype: float64
top 2:  0    0.914931
Name: mean_top@2, dtype: float64
top 3:  0    0.856177
Name: mean_top@3, dtype: float64


###### Result Evaluation

In [None]:
# Show the complete one-row summary table.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.993449      0.973686  0.980428         0.999003      0.971568   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.985094  0.998958  0.999546    0.999319    0.914931    0.856177  


###### Reason for low mean_top@3

In [None]:
#def log_low_top3_cases(metrics_df, threshold=1.0):
#    low_top3 = metrics_df[metrics_df["top@3"] < threshold].copy()
#    return low_top3.sort_values(by="top@3")

#low_top3_df = log_low_top3_cases(group_df, threshold=0.99)
#low_top3_df.to_csv("sample.csv")


#### Threshold 65%

In [None]:
# Re-evaluate with a 0.65 cut-off. Only precision/recall/F1 depend on the
# threshold; MAP, MRR and top@k come from the sort order alone.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Result evaluation

In [None]:
# Show the one-row summary for the 0.65 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.988061      0.947097  0.961037         0.999612      0.942467   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.970199  0.998958  0.999546    0.999319    0.914931    0.856177  


### Threshold 80%

In [None]:
# Re-evaluate with a 0.80 cut-off (same inputs and score column as above).
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.80)
result_df = summarize_metrics(group_df)

###### Result Evaluation

In [None]:
# Show the one-row summary for the 0.80 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.972746      0.900723  0.924871         0.999771      0.900757   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.947685  0.998958  0.999546    0.999319    0.914931    0.856177  


### GPT Reasoning based Cross-Language Clone detection and Code search ranking

In [None]:
# Reload the same CSVs and switch the evaluated score column to the
# reasoning-based similarity.
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
model_name = "gpt_reasoning_similarity"

##### Threshold 50%

In [None]:
# Per-src_file metrics at a 0.5 cut-off for the reasoning-based score, then
# the one-row summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [None]:
# Show the one-row summary for the 0.5 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.989486      0.960386  0.971153         0.999387      0.959466   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0   0.97902  0.999108  0.999524    0.999047    0.914931    0.856177  


##### Threshold 65%

In [None]:
# Re-evaluate the reasoning-based score with a 0.65 cut-off.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [None]:
# Show the one-row summary for the 0.65 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.987112      0.938248  0.956229         0.999521      0.929173   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.963064  0.999108  0.999524    0.999047    0.914931    0.856177  


##### Threshold 80%

In [None]:
# Re-evaluate the reasoning-based score with a 0.8 cut-off.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.8)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [None]:
# Show the one-row summary for the 0.8 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.887308      0.706515   0.76616         0.999745      0.747432   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0   0.85537  0.999108  0.999524    0.999047    0.914931    0.856177  


### GPT Separate Explanation Similarity based Cross-Language Clone Detection

In [None]:
# Reload the same CSVs and switch the evaluated score column. The spelling
# "seperate" is kept because the value is used to look up a column by name.
true_df = pd.read_csv(true_filename, delimiter = ',')
false_df = pd.read_csv(false_filename, delimiter = ',')
model_name = "gpt_seperate_explanation_similarity"

##### Threshold 50%

In [None]:
# Per-src_file metrics at a 0.5 cut-off for the separate-explanation score,
# then the one-row summary.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.5)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [None]:
# Show the one-row summary for the 0.5 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0          0.76187      0.840523  0.783011         0.816099      0.928808   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.868813  0.908475  0.936082    0.881857    0.824146    0.779411  


##### Threshold 65%

In [None]:
# Re-evaluate the separate-explanation score with a 0.65 cut-off.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.65)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [None]:
# Show the one-row summary for the 0.65 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.708298      0.620385  0.639536         0.866147      0.772048   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.816395  0.908475  0.936082    0.881857    0.824146    0.779411  


##### Threshold 80%

In [None]:
# Re-evaluate the separate-explanation score with a 0.8 cut-off.
group_df = evaluate_ranking_metrics(true_df=true_df, false_df=false_df, model_name=model_name, threshold=0.8)
result_df = summarize_metrics(group_df)

###### Evaluation Result

In [None]:
# Show the one-row summary for the 0.8 threshold.
print(result_df.head())

   macro_precision  macro_recall  macro_f1  micro_precision  micro_recall  \
0         0.597645      0.373795  0.431672           0.9046      0.549455   

   micro_f1  mean_map  mean_mrr  mean_top@1  mean_top@2  mean_top@3  
0  0.683656  0.908475  0.936082    0.881857    0.824146    0.779411  
