Ce script lit des fichiers `.json` contenant des graphes DOT, extrait des features avancées (structurelles et instruction-level), puis sauvegarde les résultats par batch au format CSV.  
Il supprime automatiquement les JSON déjà traités pour éviter les doublons et optimise le tout via `joblib.Parallel`.


In [None]:
import os
import re
import glob
import gc
import pandas as pd
import networkx as nx
from tqdm import tqdm
from joblib import Parallel, delayed


def parse_cfg_dot(dot_str):
    """
    Parse a DOT-formatted graph string into a directed NetworkX graph.

    Two kinds of lines are recognised; every other line is ignored:

    * edge lines ``"src" -> "dst"`` are added with ``G.add_edge``;
    * node lines ``"id" [label = "addr : type : text"]`` are added with
      ``G.add_node`` and the attributes ``addr``, ``type`` and ``text``.

    A label without a second field gets type ``"UNK"``; a label without a
    third field gets an empty text.

    Args:
        dot_str: The DOT source as a single string.

    Returns:
        A ``networkx.DiGraph``. Nodes that only appear in edge lines carry
        no attributes.
    """
    # Compiled once, outside the per-line loop.
    edge_re = re.compile(r'"([^"]+)"\s*->\s*"([^"]+)"')
    node_re = re.compile(r'"([^"]+)"\s*\[label = "(.*?)"\]')

    G = nx.DiGraph()

    for raw_line in dot_str.strip().split('\n'):
        line = raw_line.strip()

        if '->' in line:
            edge = edge_re.match(line)
            if edge:
                src, dst = edge.groups()
                G.add_edge(src, dst)
                continue
            # Not an edge after all: the arrow may sit inside a node label,
            # so fall through and let the node pattern have a try.

        if '[' in line and 'label =' in line:
            node = node_re.match(line)
            if node:
                node_id, label = node.groups()
                # maxsplit=2 keeps the whole instruction text even when the
                # text itself contains the " : " separator.
                parts = label.split(' : ', 2)
                addr = parts[0]
                instr_type = parts[1] if len(parts) > 1 else "UNK"
                instr_text = parts[2] if len(parts) > 2 else ""
                G.add_node(node_id, addr=addr, type=instr_type, text=instr_text)
    return G


def load_single_graph(file_path):
    """
    Load one .json file holding a DOT graph and return ``(graph_id, graph)``.

    The file is read as bytes and decoded as UTF-8, ignoring undecodable
    bytes, then handed to ``parse_cfg_dot``. The graph id is the file's base
    name with its trailing ``.json`` extension removed.

    Args:
        file_path: Path of the file to load.

    Returns:
        A ``(graph_id, G)`` tuple, or ``None`` when the file is blank, the
        parsed graph has no nodes, or any error occurred (the error is
        printed, not raised).
    """
    try:
        with open(file_path, "rb") as f:
            dot_str = f.read().decode("utf-8", errors="ignore")
        if not dot_str.strip():
            return None
        G = parse_cfg_dot(dot_str)
        if G.number_of_nodes() == 0:
            return None
        # Strip only the trailing extension: str.replace would also remove a
        # ".json" occurring in the middle of the name, and the id could then
        # no longer be mapped back to the file it came from.
        graph_id = os.path.basename(file_path)
        if graph_id.endswith(".json"):
            graph_id = graph_id[:-len(".json")]
        return graph_id, G
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"Erreur avec {file_path}: {e}")
        return None


def extract_features_from_graph(graph_id, G):
    """
    Compute structural and instruction-level features for one graph.

    Structural features come from the graph itself (node/edge counts, weakly
    connected components, nodes without outgoing edges, longest path).
    Instruction features are counts and proportions computed from the
    ``type`` and ``text`` node attributes; proportions are relative to the
    number of nodes that carry a ``text`` attribute.

    Args:
        graph_id: Identifier stored in the ``graph_id`` feature.
        G: Directed NetworkX graph whose nodes may carry ``type`` and
            ``text`` attributes.

    Returns:
        A dict mapping feature names to values; ``max_path_len`` is ``-1``
        when the longest path could not be computed.
    """
    # dtype=object is explicit so that a graph without any labelled node
    # (empty attribute dicts) still yields a Series on which the `.str`
    # accessor works; older pandas infers float64 for an empty Series.
    instr_types_series = pd.Series(
        list(nx.get_node_attributes(G, "type").values()), dtype=object
    )
    instr_texts_series = pd.Series(
        list(nx.get_node_attributes(G, "text").values()), dtype=object
    )
    instr_counts = instr_types_series.value_counts()
    total_instrs = len(instr_texts_series)

    nb_nodes = G.number_of_nodes()
    nb_edges = G.number_of_edges()

    def ratio_of(count):
        # Proportion among instructions, 0 when there is none.
        return count / total_instrs if total_instrs > 0 else 0

    def count_ratio_text(substring):
        # Case-insensitive match of `substring` in the instruction text.
        matches = instr_texts_series.str.contains(substring, case=False, na=False)
        count = matches.sum()
        return count, ratio_of(count)

    def count_ratio_type(op):
        # Exact match on the upper-cased instruction type.
        count = instr_counts.get(op.upper(), 0)
        return count, ratio_of(count)

    exit_nodes = [n for n in G.nodes if G.out_degree(n) == 0]
    try:
        longest_path_len = nx.dag_longest_path_length(G)
    except Exception:
        # The longest path is only defined on acyclic graphs; -1 is the
        # sentinel kept for every graph on which the computation fails.
        longest_path_len = -1

    features = {
        "graph_id": graph_id,
        "nb_nodes": nb_nodes,
        "nb_edges": nb_edges,
        "nb_components": nx.number_weakly_connected_components(G),
        "nb_exit_nodes": len(exit_nodes),
        "max_path_len": longest_path_len,
        "avg_degree": nb_edges / nb_nodes if nb_nodes > 0 else 0,
        "count_inst": total_instrs,
        "prop_inst": total_instrs / nb_nodes if nb_nodes > 0 else 0,
    }

    for opcode in ["CALL", "JMP", "RET", "JCC", "INVALID", "HLT", "INST"]:
        count, ratio = count_ratio_type(opcode)
        features[f"count_type_{opcode.lower()}"] = count
        features[f"prop_type_{opcode.lower()}"] = ratio

    for keyword in ["mov", "add", "xor", "cmp", "push", "pop", "lea", "sub", "loop", "syscall", "call", "jmp"]:
        count, ratio = count_ratio_text(keyword)
        features[f"count_text_{keyword}"] = count
        features[f"prop_text_{keyword}"] = ratio

    return features


def clean_jsons_from_existing_csv(csv_dir, folder_path):
    """
    Delete the .json files whose features are already stored in a batch CSV.

    Every ``features_batch_*.csv`` in ``csv_dir`` is read for its
    ``graph_id`` column; for each id found, ``<folder_path>/<id>.json`` is
    removed when it exists. Unreadable CSVs and files that cannot be removed
    are reported with ``print`` and skipped.

    Args:
        csv_dir: Directory containing the ``features_batch_*.csv`` files.
        folder_path: Directory containing the ``.json`` graph files.

    Returns:
        None.
    """
    print("Nettoyage des JSON déjà traités à partir des CSV existants...")
    all_csv = sorted(glob.glob(os.path.join(csv_dir, "features_batch_*.csv")))
    treated_ids = set()

    for csv_file in all_csv:
        try:
            # Read ids as text: with dtype inference an id such as "00123"
            # becomes the integer 123 and no longer matches "00123.json".
            df = pd.read_csv(csv_file, usecols=["graph_id"], dtype={"graph_id": str})
            treated_ids.update(df["graph_id"].dropna().astype(str).tolist())
        except Exception as e:
            # Best effort: a broken CSV must not prevent cleaning the others.
            print(f"Erreur lecture {csv_file} : {e}")

    print(f"{len(treated_ids)} fichiers déjà extraits détectés dans les CSV.")

    deleted = 0
    for gid in treated_ids:
        json_path = os.path.join(folder_path, f"{gid}.json")
        if os.path.exists(json_path):
            try:
                os.remove(json_path)
                deleted += 1
            except OSError as e:
                print(f"Impossible de supprimer {json_path} : {e}")

    print(f"{deleted} fichiers JSON supprimés car déjà présents dans les CSV.")


def extract_features_to_csv_by_batch(folder_path, csv_dir="features_batches", n_jobs=4, batch_size=500):
    """
    Main pipeline: read JSON files, build graphs, compute features, save CSVs.

    Steps:
      1. create ``csv_dir`` and delete the JSON files already covered by an
         existing batch CSV (``clean_jsons_from_existing_csv``);
      2. split the remaining ``.json`` files of ``folder_path`` into batches
         of ``batch_size``;
      3. for each batch, load the graphs and extract their features in
         parallel, write ``features_batch_<NN>.csv`` and delete the JSON
         files that were successfully loaded.

    Files that could not be loaded are left in place. A batch without any
    valid graph writes no CSV.

    Args:
        folder_path: Directory containing the ``.json`` graph files.
        csv_dir: Directory receiving the batch CSV files.
        n_jobs: Number of parallel workers given to ``joblib.Parallel``.
        batch_size: Number of files per batch.

    Returns:
        None.
    """
    os.makedirs(csv_dir, exist_ok=True)
    clean_jsons_from_existing_csv(csv_dir, folder_path)

    files = sorted(os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".json"))
    total_files = len(files)
    print(f"{total_files} fichiers JSON trouvés dans {folder_path}")

    # Processed JSON files are deleted, so on a resumed run the remaining
    # files are all unprocessed. Numbering therefore continues after the
    # highest existing batch CSV; restarting at 1 would collide with existing
    # CSVs and leave those files unprocessed.
    last_batch_num = 0
    for existing_csv in glob.glob(os.path.join(csv_dir, "features_batch_*.csv")):
        found = re.fullmatch(r"features_batch_(\d+)\.csv", os.path.basename(existing_csv))
        if found:
            last_batch_num = max(last_batch_num, int(found.group(1)))

    for offset, i in enumerate(range(0, total_files, batch_size), start=1):
        batch_files = files[i:i + batch_size]
        batch_num = last_batch_num + offset
        csv_path = os.path.join(csv_dir, f"features_batch_{batch_num:02}.csv")

        print(f"\nBatch {batch_num:02} → {len(batch_files)} fichiers à parser...")

        load_results = Parallel(n_jobs=n_jobs, backend="loky")(
            delayed(load_single_graph)(file) for file in tqdm(batch_files, desc=f"Loading batch {batch_num:02}")
        )
        # Results come back in input order, so each one is paired with the
        # real path of its file; that path is what gets deleted later.
        loaded = [(path, res) for path, res in zip(batch_files, load_results) if res is not None]

        if not loaded:
            # An empty CSV would break every later pd.read_csv on this directory.
            print(f"Aucun graphe valide dans le batch {batch_num:02}. Aucun CSV écrit.")
            continue

        features = Parallel(n_jobs=n_jobs)(
            delayed(extract_features_from_graph)(gid, G) for _, (gid, G) in loaded
        )

        df = pd.DataFrame(features)
        df.to_csv(csv_path, index=False)
        print(f"Features batch {batch_num:02} enregistrées dans {csv_path}")

        # Delete sources only after the CSV has been written.
        for path, _ in loaded:
            try:
                os.remove(path)
            except OSError as e:
                print(f"Impossible de supprimer {path} : {e}")

        # Drop the batch's graphs before loading the next batch.
        del load_results, loaded, features, df
        gc.collect()

    print("\nTous les batchs disponibles ont été extraits vers CSV.")


In [None]:
# Run the extraction pipeline on the test-set folder, 100 files per batch.
folder_path = "folder_test_set"
extract_features_to_csv_by_batch(
    folder_path,
    csv_dir="features_batches",
    n_jobs=4,
    batch_size=100,
)

---

Ce script fusionne les CSV de tous les batchs en un seul fichier (`data/train.csv`), en supprimant les `graph_id` en double.

In [None]:
import pandas as pd
import glob

# Directory holding the per-batch feature CSVs to merge.
# NOTE(review): the extraction cell writes to "features_batches" (no "data/"
# prefix) — confirm the batches are moved here before running this cell.
csv_dir = "data/features_batches"

csv_files = sorted(glob.glob(f"{csv_dir}/features_batch_*.csv"))
print(f"{len(csv_files)} fichiers trouvés.")

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list; stop early with a message that names the directory instead.
if not csv_files:
    raise FileNotFoundError(f"Aucun fichier features_batch_*.csv trouvé dans {csv_dir}")

# graph_id is read as text so that ids keep their exact spelling (leading
# zeros included) and have the same type in every batch.
df_all = pd.concat(
    [pd.read_csv(f, dtype={"graph_id": str}) for f in csv_files],
    ignore_index=True,
)

# Keep the first row seen for each graph_id.
df_all = df_all.drop_duplicates(subset="graph_id")

print(f"Nombre total de graph_id uniques : {df_all['graph_id'].nunique()}")

output_path = "data/train.csv"
df_all.to_csv(output_path, index=False)

print(df_all.tail())
