### Data preparation and loading

In [None]:
# --------------------------------------------------------------
# @file: etl.py
# @authors: 
#   Ricardo Fernández  - A01704813
#   Arturo Díaz        - A01709522
#   Yuna Chung         - A01709043
# --------------------------------------------------------------

import os
import re
import pandas as pd

# --------------------------------------------------------------
# @brief: Reads the content of a Java source file given its base name
# @param filename: Name of the file to read, without the ".java" extension
# @param base_path: Directory in which the file is looked up
# @return: Content of the file as a string (decoded as UTF-8), or an
#          empty string when the file does not exist
# --------------------------------------------------------------
def read_file_content(filename, base_path):
    java_file = os.path.join(base_path, f"{filename}.java")
    try:
        handle = open(java_file, "r", encoding="utf-8")
    except FileNotFoundError:
        # A missing file is not an error for callers: they get empty content
        return ""
    with handle:
        return handle.read()
    
# --------------------------------------------------------------
# @brief: Gets the first Java file found in a directory
#   Entries are examined in sorted (lexicographic) order so that the
#   result is reproducible when the directory holds several .java files.
# @param path: Path to the directory to search
# @return: Full path to the first Java file found, or None if not found
# --------------------------------------------------------------
def get_code_file_from_dir(path):
    # os.listdir gives no ordering guarantee, hence the explicit sort
    for entry in sorted(os.listdir(path)):
        if entry.endswith(".java"):
            return os.path.join(path, entry)
    return None

# --------------------------------------------------------------
# @brief: Flattens a text into a single line with normalized spacing
#   Tab characters are deleted (not replaced by a space), carriage
#   returns and newlines become spaces, runs of two or more spaces are
#   collapsed to one, and leading/trailing whitespace is stripped.
# @param text: Text to clean
# @return: Cleaned single-line text
# --------------------------------------------------------------
def clean_text(text: str) -> str:
    # One pass: drop tabs, map line breaks to spaces
    flattened = text.translate({ord("\t"): None, ord("\r"): " ", ord("\n"): " "})
    squeezed = re.sub(r" {2,}", " ", flattened)
    return squeezed.strip()
# --------------------------------------------------------------
# @brief: Extracts data from the CONPLAG_Version_2 Dataset
#   Reads labels.csv under base_path and, for every (sub1, sub2) row,
#   loads version_1/<sub1>_<sub2>/<sub1>.java and
#   version_2/<sub1>_<sub2>/<sub2>.java, both passed through clean_text.
#   A missing file prints a warning and contributes an empty string.
# @param base_path: Base path where the dataset is located
# @return: DataFrame with the columns File1, File2 and Plagiarized (int)
# --------------------------------------------------------------
def extractConplag(base_path="../dataset/CONPLAG_VERSION2/versions"):
    # Read one source file already cleaned; warn and fall back to "" if absent
    def load_clean(java_file):
        try:
            with open(java_file, "r", encoding="utf-8") as src:
                return clean_text(src.read())
        except FileNotFoundError:
            print(f"[WARN] Archivo no encontrado: {java_file}")
            return ""

    labels = pd.read_csv(os.path.join(base_path, "labels.csv"))
    df = labels[["sub1", "sub2", "verdict"]].rename(columns={"verdict": "Plagiarized"})
    df["Plagiarized"] = df["Plagiarized"].astype(int)

    first_sources = []
    second_sources = []
    for _, pair in df.iterrows():
        sub1, sub2 = pair["sub1"], pair["sub2"]
        # Both versions of a pair live in a folder named after the two ids
        pair_dir = f"{sub1}_{sub2}"
        first_sources.append(
            load_clean(os.path.join(base_path, "version_1", pair_dir, f"{sub1}.java"))
        )
        second_sources.append(
            load_clean(os.path.join(base_path, "version_2", pair_dir, f"{sub2}.java"))
        )

    df["File1"] = first_sources
    df["File2"] = second_sources

    return df[["File1", "File2", "Plagiarized"]]


# --------------------------------------------------------------
# @brief: Extracts data from the FIRE14 Dataset
#   Pairs listed in SOCO14-java.qrel are labelled as plagiarized (1).
#   Every other unordered pair of .java files found in the java/
#   folder is labelled as not plagiarized (0).
# @param base_path: Base path containing SOCO14-java.qrel and the
#                   java/ folder with the source files
# @return: DataFrame with the columns File1, File2 and Plagiarized
# --------------------------------------------------------------
def extractFire14(base_path="../dataset/FIRE14"):
    qrel_path = os.path.join(base_path, "SOCO14-java.qrel")
    java_path = os.path.join(base_path, "java")

    # Read the plagiarized pairs (qrel): two whitespace-separated names per line
    plag_pairs = []
    with open(qrel_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            f1, f2 = fields
            plag_pairs.append((f1, f2))

    # List every .java file; sorted because os.listdir order is arbitrary and
    # the row order produced here feeds a seeded shuffle downstream
    all_files = sorted(f for f in os.listdir(java_path) if f.endswith(".java"))

    # Each file takes part in many pairs: read and clean it only once
    cleaned_by_name = {}

    def load(name):
        if name not in cleaned_by_name:
            cleaned_by_name[name] = clean_text(
                read_file_content(name.replace(".java", ""), java_path)
            )
        return cleaned_by_name[name]

    # Positive pairs (plagiarism)
    data = [
        {"File1": load(f1), "File2": load(f2), "Plagiarized": 1}
        for f1, f2 in plag_pairs
    ]

    # Negative pairs (no plagiarism): every unordered pair of files that is
    # not listed in the qrel in either orientation
    plag_set = set(plag_pairs)
    for i, f1 in enumerate(all_files):
        for f2 in all_files[i + 1:]:
            if (f1, f2) in plag_set or (f2, f1) in plag_set:
                continue
            data.append({"File1": load(f1), "File2": load(f2), "Plagiarized": 0})

    # Explicit columns keep the schema stable even when no pair was produced
    return pd.DataFrame(data, columns=["File1", "File2", "Plagiarized"])

# Reads a source file (UTF-8) and returns its content passed through clean_text
def _read_clean_source(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return clean_text(f.read())


# Returns the sub-directories directly under path, as full paths in sorted
# order; plain files (e.g. .DS_Store) are skipped
def _sorted_subdirs(path):
    candidates = (os.path.join(path, entry) for entry in sorted(os.listdir(path)))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]


# --------------------------------------------------------------
# @brief: Extracts data from the IRPLAG Dataset
#   For every case folder, the file under original/ is paired with each
#   file under plagiarized/<level>/<instance>/ (label 1) and each file
#   under non-plagiarized/<instance>/ (label 0). Cases without a Java
#   file in original/ are skipped, as are instances without a Java file.
#   Directory listings are sorted so the row order is reproducible.
# @param base_path: Base path where the IR-Plag Dataset is located
# @return: DataFrame with the columns File1, File2 and Plagiarized
# --------------------------------------------------------------
def extractIRPlag(base_path="../dataset/IRPLAG"):
    data = []

    # Iterate over each "case-*" folder
    for case_path in _sorted_subdirs(base_path):
        # Get the original file (the only file expected inside /original)
        original_file_path = get_code_file_from_dir(os.path.join(case_path, "original"))
        if not original_file_path:
            continue
        original_content = _read_clean_source(original_file_path)

        # Add the plagiarized pairs: plagiarized/<level>/<instance>/<file>
        for level_path in _sorted_subdirs(os.path.join(case_path, "plagiarized")):
            for instance_path in _sorted_subdirs(level_path):
                file_path = get_code_file_from_dir(instance_path)
                if not file_path:
                    continue
                data.append({
                    "File1": original_content,
                    "File2": _read_clean_source(file_path),
                    "Plagiarized": 1,
                })

        # Add the non-plagiarized pairs: non-plagiarized/<instance>/<file>
        for instance_path in _sorted_subdirs(os.path.join(case_path, "non-plagiarized")):
            file_path = get_code_file_from_dir(instance_path)
            if not file_path:
                continue
            data.append({
                "File1": original_content,
                "File2": _read_clean_source(file_path),
                "Plagiarized": 0,
            })

    # Explicit columns keep the schema stable even when no pair was produced
    return pd.DataFrame(data, columns=["File1", "File2", "Plagiarized"])

In [32]:
# --------------------------------------------------------------
# @brief: Builds a single shuffled dataset out of the three sources
#   Each source DataFrame gets a "Source" column naming where it came
#   from; the frames are concatenated and the rows shuffled with a
#   fixed seed (random_state=42).
# @return: DataFrame with the merged rows and a fresh 0..n-1 index
# --------------------------------------------------------------
def merge_all_datasets():
    # Load every dataset, keyed by the label stored in its "Source" column
    frames = {
        "Conplag": extractConplag(),
        "Fire14": extractFire14(),
        "IRPlag": extractIRPlag(),
    }

    # Tag each frame with its origin
    for source_name, frame in frames.items():
        frame["Source"] = source_name

    # Stack the frames on top of each other
    combined = pd.concat(list(frames.values()), ignore_index=True)

    # Shuffle all rows, then renumber the index
    shuffled = combined.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

In [33]:
# Build the merged, shuffled dataset from the three sources
df = merge_all_datasets()
print(f"Total rows in merged dataset: {len(df)}")

# Export the DataFrame to CSV, JSON and Parquet
# (the JSON file is written as JSON Lines: one record per line)
df.to_csv("../dataset/dataset.csv", index=False)
df.to_json("../dataset/dataset.jsonl", orient="records", lines=True)
df.to_parquet("../dataset/dataset.parquet", index=False)

Total rows in merged dataset: 34782
