### Data preparation and loading

In [None]:
# -------------------------------------
# @file: etl.py
# @authors: 
#   Ricardo Fernández  - A01704813
#   Arturo Díaz        - A01709522
#   Yuna Chung         - A01709043
# -------------------------------------

import os
import pandas as pd
from itertools import combinations

# ------------------------------------
# @brief: Extracts data from the CONPLAG_Version_2 Dataset
# @return: DataFrame with the extracted data
# ------------------------------------
def extractConplag(labels_path="../dataset/CONPLAG_VERSION2/versions/labels.csv"):
    """Load the CONPLAG labels file as a table of labelled file pairs.

    Args:
        labels_path: Path to a CSV file that contains at least the columns
            ``sub1``, ``sub2`` and ``verdict``. Defaults to the location
            this project uses for the CONPLAG dataset.

    Returns:
        DataFrame with the columns ``File1``, ``File2`` and ``Plagiarized``,
        where ``Plagiarized`` is the ``verdict`` column cast to int. Any
        other column of the CSV is dropped.
    """
    labels_df = pd.read_csv(labels_path)
    # Build the result through a chain of non-mutating calls so the frame
    # read from disk is never modified in place.
    return (
        labels_df[["sub1", "sub2", "verdict"]]
        .rename(columns={"sub1": "File1", "sub2": "File2", "verdict": "Plagiarized"})
        .astype({"Plagiarized": int})
    )

# ------------------------------------
# @brief: Extracts data from the FIRE14 Dataset
# @return: DataFrame with the extracted data
# ------------------------------------
def extractFire14(
    qrel_path="../dataset/fire14-source-code-training-dataset/SOCO14-java.qrel",
    java_dir="../dataset/FIRE14/java",
):
    """Build a labelled table of file pairs from the FIRE14 dataset.

    Positive pairs come from the qrel file; negative pairs are generated
    from the ``.java`` files found in ``java_dir``.

    Args:
        qrel_path: Text file whose relevant lines have four
            whitespace-separated fields: first file, an ignored field,
            second file, label. Only lines whose label is ``"1"`` are used;
            lines with a different number of fields are skipped.
        java_dir: Directory whose ``.java`` file names are combined into
            candidate negative pairs.

    Returns:
        DataFrame with the columns ``File1``, ``File2`` and ``Plagiarized``.
        The positive pairs (label 1) come first, followed by at most twice
        as many negative pairs (label 0).
    """
    # NOTE(review): the two default paths point at differently named dataset
    # folders ("fire14-source-code-training-dataset" vs "FIRE14") - confirm
    # that both exist in the expected layout.
    columns = ["File1", "File2", "Plagiarized"]

    # Collect the pairs marked as plagiarised in the qrel file.
    plagiarized_pairs = []
    with open(qrel_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 4:
                file1, _, file2, label = parts
                if label == "1":
                    plagiarized_pairs.append((file1, file2, 1))

    df_pos = pd.DataFrame(plagiarized_pairs, columns=columns)

    # os.listdir returns names in arbitrary order; sorting makes the selected
    # negative pairs reproducible across runs and machines.
    file_list = sorted(f for f in os.listdir(java_dir) if f.endswith(".java"))
    positive_set = {(a, b) for a, b, _ in plagiarized_pairs}

    # Keep two negatives per positive. Stop as soon as the quota is reached
    # instead of materialising every possible combination first.
    max_negatives = len(plagiarized_pairs) * 2
    negative_pairs = []
    for a, b in combinations(file_list, 2):
        if len(negative_pairs) >= max_negatives:
            break
        # A pair is negative only if it is not listed as plagiarised in
        # either order.
        if (a, b) not in positive_set and (b, a) not in positive_set:
            negative_pairs.append((a, b, 0))

    df_neg = pd.DataFrame(negative_pairs, columns=columns)

    return pd.concat([df_pos, df_neg], ignore_index=True)

# ------------------------------------
# @brief: Extracts data from the IRPLAG Dataset
# @return: DataFrame with the extracted data
# ------------------------------------
def extractIRPLAG(base_dir="../dataset/IRPLAG"):
    """Build a labelled table of file pairs from the IRPLAG dataset.

    Every sub-directory of ``base_dir`` is treated as a task with this layout:

        <task>/Original/                      -> the reference ``.java`` file
        <task>/non-plagiarized/<sub>/         -> one ``.java`` file, label 0
        <task>/plagiarized/<level>/<sub>/     -> one ``.java`` file, label 1

    Args:
        base_dir: Root directory that contains the task folders.

    Returns:
        DataFrame with the columns ``File1`` (path of the task's original
        file), ``File2`` (path of the compared file) and ``Plagiarized``
        (0 or 1). Tasks without an original ``.java`` file are skipped.
        Entries are visited in sorted name order, so the row order is
        deterministic.
    """

    def get_code_file_from_dir(path):
        # Missing folders and stray files (e.g. OS metadata files) simply
        # have no code file; returning None avoids os.listdir raising.
        if not os.path.isdir(path):
            return None
        # Sorted so the same file is chosen on every run when a folder
        # holds more than one .java file.
        for file in sorted(os.listdir(path)):
            if file.endswith(".java"):
                return os.path.join(path, file)
        return None

    def sorted_entries(path):
        # Full paths of the directory entries, in sorted name order.
        return [os.path.join(path, name) for name in sorted(os.listdir(path))]

    data = []

    # Iterate over every task (task1, task2, ...).
    for task_path in sorted_entries(base_dir):
        if not os.path.isdir(task_path):
            continue

        # Locate the original file of the task.
        original_file = get_code_file_from_dir(os.path.join(task_path, "Original"))
        if original_file is None:
            continue

        # Pair the original with each non-plagiarised submission.
        non_plag_dir = os.path.join(task_path, "non-plagiarized")
        if os.path.isdir(non_plag_dir):
            for subdir_path in sorted_entries(non_plag_dir):
                file_path = get_code_file_from_dir(subdir_path)
                if file_path:
                    data.append((original_file, file_path, 0))

        # Pair the original with each plagiarised submission, level by level.
        plag_dir = os.path.join(task_path, "plagiarized")
        if os.path.isdir(plag_dir):
            for level_path in sorted_entries(plag_dir):
                if not os.path.isdir(level_path):
                    continue
                for subdir_path in sorted_entries(level_path):
                    file_path = get_code_file_from_dir(subdir_path)
                    if file_path:
                        data.append((original_file, file_path, 1))

    return pd.DataFrame(data, columns=["File1", "File2", "Plagiarized"])