In [None]:
import json
import os
import shutil
import subprocess
import zipfile

import pandas as pd
from tqdm import tqdm

from dataloader import build_eval_dataset


In [None]:
# Folder handed to the project-local loader (presumably the root of the
# IR-Plag dataset, judging by the path -- confirm against `dataloader`).
DATA_FOLDER = "data/IR-Plag-Dataset"
# Evaluation frame; the JPlag cell below reads its "sample_1" and "sample_2"
# columns row by row.
eval_df = build_eval_dataset(DATA_FOLDER)
# Bare last expression so the notebook renders the frame as the cell output.
eval_df

In [None]:
def get_similarities(base_path: str = "data/jplag/results") -> tuple:
    """Read the two similarity scores from an unpacked JPlag comparison file.

    The comparison JSON is named after the two submissions, and either
    submission may come first in the name, so both orderings are tried and
    the first file that exists is used.

    NOTE(review): which submission "first"/"second" refers to presumably
    follows the order in the file name -- confirm before using the two
    values separately.

    Args:
        base_path: Directory containing the comparison JSON. Defaults to the
            directory this notebook unpacks JPlag results into.

    Returns:
        A ``(first_similarity, second_similarity)`` tuple with the values
        stored under those keys in the JSON file.

    Raises:
        FileNotFoundError: If neither candidate file exists in ``base_path``.
    """
    file_names = [
        "sample_2_sample_2.java-sample_1_sample_1.java.json",
        "sample_1_sample_1.java-sample_2_sample_2.java.json",
    ]

    for file_name in file_names:
        file_path = os.path.join(base_path, file_name)
        if os.path.exists(file_path):
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(file_path, encoding="utf-8") as f:
                data = json.load(f)
            return data["first_similarity"], data["second_similarity"]

    raise FileNotFoundError(
        "No matching similarity file found in the results directory."
    )


# Path of the JPlag CLI jar. Set the JPLAG_JAR environment variable to
# override the machine-specific default below.
JPLAG_JAR = os.environ.get(
    "JPLAG_JAR",
    "/Users/williambrach/Developer/jplag/jplag-5.1.0-jar-with-dependencies.jar",
)
# Scratch locations used for a single comparison. JPLAG_RESULTS_DIR must stay
# equal to the directory get_similarities() reads by default.
JPLAG_SAMPLE_DIRS = ("data/jplag/sample_1", "data/jplag/sample_2")
JPLAG_RESULTS_DIR = "data/jplag/results"
JPLAG_REPORT = "data/jplag/jplag_results"
# `unzip` accepted the report name with or without the ".zip" suffix, so both
# spellings are honoured here, in the same order.
JPLAG_REPORT_CANDIDATES = (JPLAG_REPORT, JPLAG_REPORT + ".zip")


def _clean_jplag_workspace() -> None:
    """Delete the per-pair scratch folders and report archive, if present."""
    for folder in (*JPLAG_SAMPLE_DIRS, JPLAG_RESULTS_DIR):
        shutil.rmtree(folder, ignore_errors=True)
    for report in JPLAG_REPORT_CANDIDATES:
        if os.path.isfile(report):
            os.remove(report)


def _jplag_pair_similarities(sample_1: str, sample_2: str) -> tuple:
    """Run JPlag on one pair of Java sources and return its two similarities.

    Each source is written to its own submission folder, the JPlag CLI is
    run on both folders, the report archive is unpacked, and the result of
    get_similarities() is returned.

    Raises:
        subprocess.CalledProcessError: If the JPlag process exits non-zero.
        FileNotFoundError: If `java` is missing, no report archive was
            produced, or the comparison JSON is not found.
    """
    sources = (sample_1, sample_2)
    for folder, source in zip(JPLAG_SAMPLE_DIRS, sources):
        os.makedirs(folder, exist_ok=True)
        # Folder and file share a base name: sample_1/sample_1.java, ...
        java_file = os.path.join(folder, os.path.basename(folder) + ".java")
        with open(java_file, "w", encoding="utf-8") as f:
            f.write(source)

    # Argument list (no shell) so paths with spaces are safe; check=True
    # turns a failed JPlag run into an exception instead of ignoring it.
    subprocess.run(
        ["java", "-jar", JPLAG_JAR, "-l", "java", "-r", JPLAG_REPORT, *JPLAG_SAMPLE_DIRS],
        check=True,
    )

    report = next(
        (path for path in JPLAG_REPORT_CANDIDATES if os.path.isfile(path)), None
    )
    if report is None:
        raise FileNotFoundError(
            f"JPlag did not produce a report archive at {JPLAG_REPORT}(.zip)"
        )
    # zipfile never prompts before overwriting, unlike the `unzip` CLI.
    with zipfile.ZipFile(report) as archive:
        archive.extractall(JPLAG_RESULTS_DIR)

    return get_similarities()


first, second = [], []

jplag_df = eval_df.copy()
# Start from a clean workspace so leftovers from an interrupted run cannot be
# mistaken for the results of the first pair.
_clean_jplag_workspace()
for _, row in tqdm(eval_df.iterrows(), total=eval_df.shape[0]):
    try:
        similarity_1, similarity_2 = _jplag_pair_similarities(
            row["sample_1"], row["sample_2"]
        )
        first.append(similarity_1)
        second.append(similarity_2)
    except Exception as e:
        # Show what (if anything) was unpacked to help diagnose the failure.
        # The directory may not exist, and listing it must not hide `e`.
        if os.path.isdir(JPLAG_RESULTS_DIR):
            print(os.listdir(JPLAG_RESULTS_DIR))
        print(e)
        break
    finally:
        # clean up the folders
        _clean_jplag_workspace()

# If the loop stopped early, pad the score lists so they still line up with
# the frame; otherwise the column assignment raises ValueError and every score
# computed so far is lost.
unprocessed = len(jplag_df) - len(first)
if unprocessed:
    print(
        f"JPlag stopped early: {unprocessed} of {len(jplag_df)} pairs "
        "have no similarity score."
    )
    first.extend([None] * unprocessed)
    second.extend([None] * unprocessed)

jplag_df["jplag_similarity_1"] = first
jplag_df["jplag_similarity_2"] = second
os.makedirs("data/results", exist_ok=True)
jplag_df.to_csv("data/results/jplag_results.csv", index=False)

In [14]:
# Reload the persisted JPlag scores and derive one averaged score per pair,
# rounded to three decimals.
df = pd.read_csv("data/results/jplag_results.csv")
pair_sum = df["jplag_similarity_1"].add(df["jplag_similarity_2"])
df["avg_similarity"] = pair_sum.div(2).round(3)

# For every decision threshold, write a copy of the frame in which a pair is
# flagged when its averaged similarity is strictly above the threshold.
for threshold in (0.3, 0.5, 0.7, 0.75, 0.9):
    labelled = df.assign(
        pred_predicted=df["avg_similarity"].gt(threshold),
        pred_explanation="",
    )
    labelled.to_csv(f"data/results/jplag_results_{threshold}.csv", index=False)