In [None]:
import os
import re

import dspy
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from tqdm import tqdm

from dataloader import build_eval_dataset, check_if_data_folder_exits
from models import CoT, stop_model

In [None]:
# Load environment variables (python-dotenv) before any of them are read below.
load_dotenv()

# Credentials and endpoint for the default Azure OpenAI deployment, plus the
# Ollama server URL. Each value is None when the variable is not set.
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
AZURE_OPENAI_VERSION = os.environ.get("AZURE_OPENAI_VERSION")
OLLAMA_URL = os.environ.get("OLLAMA_URL")

# Separate set of settings for the gpt-35-turbo Azure deployment.
AZURE_OPENAI_KEY_35_TURBO = os.environ.get("AZURE_OPENAI_KEY_35_TURBO")
AZURE_OPENAI_VERSION_35_TURBO = os.environ.get("AZURE_OPENAI_VERSION_35_TURBO")
AZURE_OPENAI_DEPLOYMENT_35_TURBO = os.environ.get("AZURE_OPENAI_DEPLOYMENT_35_TURBO")
AZURE_OPENAI_ENDPOINT_35_TURBO = os.environ.get("AZURE_OPENAI_ENDPOINT_35_TURBO")

# Location of the evaluation dataset, validated up front by the dataloader helper.
DATA_FOLDER = "data/IR-Plag-Dataset"
check_if_data_folder_exits(DATA_FOLDER)

In [None]:
# Build the evaluation DataFrame from the dataset folder and display it.
# Rows of this frame are later turned into dspy examples, which read the
# "sample_1", "sample_2", "plagiarized" and "reason" columns.
eval_df = build_eval_dataset(DATA_FOLDER)
eval_df

In [None]:
# All model identifiers this notebook knows how to configure.
LLM_NAMES = [
    "azure-gpt-35-turbo",
    "azure-gpt-4o",
    "ollama-phi3",
    "ollama-codegemma",
    "ollama-codellama:13b",
    "ollama-codestral",
    "ollama-deepseek-coder",
    "ollama-gemma2",
    # NOTE(review): "llamma3" looks like a typo for "llama3" — confirm against
    # the Ollama model tag and the saved program file names before changing it.
    "ollama-llamma3",
    "ollama-mistral",
]

# The single model evaluated by this run. It must be a string: the branches
# below compare it against string literals, and later cells use it for
# substring matching on file names. Edit this line to evaluate another model.
llm_name = LLM_NAMES[0]

# Template for saved program paths (placeholders are filled in by the caller).
program_save_path = "programs/{llm_name}_{optimizer}_{score}"


if llm_name == "azure-gpt-4o":
    lm = dspy.AzureOpenAI(
        api_base=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_VERSION,
        deployment_id=AZURE_OPENAI_DEPLOYMENT,
        api_key=AZURE_OPENAI_KEY,
    )
elif llm_name == "azure-gpt-35-turbo":
    lm = dspy.AzureOpenAI(
        api_base=AZURE_OPENAI_ENDPOINT_35_TURBO,
        api_version=AZURE_OPENAI_VERSION_35_TURBO,
        deployment_id=AZURE_OPENAI_DEPLOYMENT_35_TURBO,
        api_key=AZURE_OPENAI_KEY_35_TURBO,
    )
elif llm_name.startswith("ollama-"):
    # Everything after the "ollama-" prefix is the model tag passed to Ollama,
    # e.g. "ollama-deepseek-coder" -> "deepseek-coder".
    _, _, model_name = llm_name.partition("-")
    print(model_name)
    print(OLLAMA_URL)
    lm = dspy.OllamaLocal(base_url=OLLAMA_URL, model=model_name)
else:
    raise ValueError(f"Unknown LLM name: {llm_name}")

# Register the chosen language model as the dspy default.
dspy.settings.configure(lm=lm)

In [None]:
def create_example(row: pd.Series) -> dspy.Example:
    """Convert one dataset row into a dspy example.

    Args:
        row: A row providing "sample_1", "sample_2", "plagiarized" and
            "reason".

    Returns:
        A dspy.Example whose inputs are the two code samples; the "Yes"/"No"
        label and the explanation are carried as the remaining fields.
    """
    fields = {
        "code_sample_1": row["sample_1"],
        "code_sample_2": row["sample_2"],
        # A truthy "plagiarized" value becomes the textual label "Yes".
        "plagiarized": "Yes" if row["plagiarized"] else "No",
        "explanation": row["reason"],
    }
    return dspy.Example(**fields).with_inputs("code_sample_1", "code_sample_2")


def clean(text: str) -> bool:
    """Turn a model's free-text plagiarism answer into a boolean.

    Only the first line of the stripped, lower-cased text is inspected. The
    first standalone word "yes" or "no" on that line decides the result.

    Args:
        text: Raw model output. ``None`` or an empty string is tolerated.

    Returns:
        True when the first yes/no word on the first line is "yes"; False in
        every other case (answer is "no", no yes/no word found, empty input).
    """
    # A missing answer counts as "not plagiarized" instead of crashing on .strip().
    if not text:
        return False
    first_line = text.strip().lower().split("\n")[0]
    # Word boundaries keep words such as "yesterday" or "note" from matching.
    match = re.search(r"\b(yes|no)\b", first_line)
    return match is not None and match.group(1) == "yes"


def eval_program(program: dspy.Module, df: pd.DataFrame) -> pd.DataFrame:
    """Run a program on every row of ``df`` and collect its predictions.

    Args:
        program: Callable dspy module invoked with ``code_sample_1`` and
            ``code_sample_2``; its result must expose ``plagiarized`` and
            ``explanation``.
        df: Evaluation rows in the format expected by ``create_example``.

    Returns:
        A new DataFrame with the columns of ``df`` plus "pred_predicted"
        (boolean from ``clean``) and "pred_explanation". The input frame is
        left untouched so it can be reused for several programs.
    """
    predicted = []
    explanations = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        example = create_example(row)
        pred = program(
            code_sample_1=example.code_sample_1, code_sample_2=example.code_sample_2
        )
        predicted.append(clean(pred.plagiarized))
        explanations.append(pred.explanation)
    # assign() returns a copy, so the caller's (shared) frame is not mutated.
    return df.assign(pred_predicted=predicted, pred_explanation=explanations)


# Find all saved programs in programs/ whose file name contains llm_name.
# Sorted so the processing order does not depend on the file system.
programs = sorted(file for file in os.listdir("programs") if llm_name in file.lower())
print(programs)

# Make sure the output folder exists before the first CSV is written.
os.makedirs("data/results", exist_ok=True)

for program_path in programs:
    print(f"Running {program_path}")
    program = CoT()
    program.load(path=f"programs/{program_path}")
    df = eval_program(program, eval_df)
    # One result file per program, named after the program file.
    df.to_csv(f"data/results/{program_path}.csv", index=False)
if "ollama" in llm_name:
    # NOTE(review): the Ollama LM is created with the "ollama-" prefix stripped,
    # while stop_model receives the full llm_name — confirm which form it expects.
    stop_model(llm_name, OLLAMA_URL)

In [None]:
def calculate_metrics(y_true: list, y_pred: list) -> dict:
    """Compute binary classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels (boolean or 0/1).
        y_pred: Predicted labels (boolean or 0/1), same length as ``y_true``.

    Returns:
        Dict with "accuracy", "precision", "recall", "f1", "fpr", "fnr" and
        "roc_auc". Ratios with an empty denominator are reported as 0 and
        "roc_auc" is NaN when ``y_true`` does not contain both classes.
    """
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }

    # Pin both labels so a 2x2 matrix is returned even when only one class
    # occurs; without this the 4-way unpack fails on single-class data.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # False positive rate and false negative rate, guarding empty denominators.
    metrics["fpr"] = fp / (fp + tn) if (fp + tn) > 0 else 0
    metrics["fnr"] = fn / (fn + tp) if (fn + tp) > 0 else 0

    # ROC AUC is undefined unless both classes are present in y_true.
    if len(np.unique(y_true)) == 2:
        metrics["roc_auc"] = roc_auc_score(y_true, y_pred)
    else:
        metrics["roc_auc"] = np.nan

    return metrics


# Per-program result files; anything that is not a CSV (e.g. hidden files)
# is ignored so read_csv is never pointed at it.
results = sorted(
    name for name in os.listdir("data/results") if name.endswith(".csv")
)

dfs = []
for result in results:
    # Skip aggregate tables (file names containing "overall") and
    # jplag_results.csv; only per-program prediction files are scored here.
    if "overall" in result or result == "jplag_results.csv":
        continue
    df = pd.read_csv(f"data/results/{result}")
    overall_metrics = calculate_metrics(df["plagiarized"], df["pred_predicted"])
    overall_df = pd.DataFrame([overall_metrics], index=[result])
    # Accuracy per value 0..6 of the "L" column.
    # NOTE(review): presumably the dataset's plagiarism levels — confirm.
    for i in range(0, 7):
        level_rows = df[df["L"] == i]
        # Report NaN instead of scoring an empty subset.
        overall_df[f"L{i}_accuracy"] = (
            accuracy_score(level_rows["plagiarized"], level_rows["pred_predicted"])
            if len(level_rows) > 0
            else np.nan
        )
    dfs.append(overall_df)

# One row per result file; empty frame when there was nothing to aggregate.
df = pd.concat(dfs) if dfs else pd.DataFrame()
df

In [None]:
# Persist the aggregated metrics table (index = result file name). The word
# "overall" in the file name is what makes the aggregation loop skip this
# file when the results folder is scanned again.
df.to_csv("data/results/overall_results.csv")