In [12]:
import pandas as pd
from collections import Counter
from pathlib import Path

# =========================
# CONFIGURATION DES DATASETS
# =========================
DATASETS = {
    "iris": {
        "data_file": "../data/iris/iris.data",
        "fd_file": "../data/iris/iris_fds.txt",
        "columns": [
            "sepal_length", "sepal_width",
            "petal_length", "petal_width",
            "class"
        ]
    },

    "adult": {
        "data_file": "../data/adult/adult.data",
        "fd_file": "../data/adult/adult_fds.txt",
        "columns": [
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education_num",
            "marital_status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital_gain",
            "capital_loss",
            "hours_per_week",
            "native_country",
            "income"
        ]
    }
}


# =========================
# FONCTION DE LECTURE DES FDs (TANE)
# =========================
def load_fds(fd_file, column_names):
    fds = []
    with open(fd_file, "r") as f:
        in_results = False
        for line in f:
            line = line.strip()
            if line.startswith("# RESULTS"):
                in_results = True
                continue
            if in_results and "->" in line:
                lhs, rhs = line.split("->")
                lhs_idx = [int(i) - 1 for i in lhs.split(",")]
                rhs_idx = int(rhs) - 1
                lhs_names = [column_names[i] for i in lhs_idx]
                rhs_name = column_names[rhs_idx]
                fds.append((lhs_names, rhs_name))
    return fds

# =========================
# ANALYSE POUR CHAQUE DATASET
# =========================
for dataset_name, cfg in DATASETS.items():

    print("\n" + "=" * 60)
    print(f"DATASET : {dataset_name.upper()}")
    print("=" * 60)

    # --- 1. Charger le dataset ---
    df = pd.read_csv(cfg["data_file"], names=cfg["columns"])
    print("Shape :", df.shape)

    # --- 2. Charger les FDs ---
    fds = load_fds(cfg["fd_file"], cfg["columns"])
    print("Nombre de FDs :", len(fds))

    # --- 3. Taille moyenne du LHS ---
    lhs_sizes = [len(lhs) for lhs, _ in fds]
    avg_lhs_size = sum(lhs_sizes) / len(lhs_sizes)
    print("Taille moyenne du LHS :", round(avg_lhs_size, 2))

    # --- 4. Fréquence des attributs ---
    lhs_counter = Counter()
    rhs_counter = Counter()

    for lhs, rhs in fds:
        lhs_counter.update(lhs)
        rhs_counter.update([rhs])

    print("\nFréquence des attributs (LHS) :")
    for attr, count in lhs_counter.items():
        print(f"  {attr} : {count}")

    print("\nFréquence des attributs (RHS) :")
    for attr, count in rhs_counter.items():
        print(f"  {attr} : {count}")

    # --- 5. Affichage des FDs ---
    print("\nDépendances fonctionnelles :")
    for lhs, rhs in fds:
        print(f"  {', '.join(lhs)} -> {rhs}")

    # --- 6. FDs triviales / suspectes ---
    print("\nFDs triviales ou suspectes :")
    found = False
    for lhs, rhs in fds:
        if len(lhs) == 1 and lhs[0].lower() in ["id", "index"]:
            print(f"  Triviale : {lhs} -> {rhs}")
            found = True
        elif len(lhs) > 3:
            print(f"  LHS très grande : {lhs} -> {rhs}")
            found = True

    if not found:
        print("  Aucune")



DATASET : IRIS
Shape : (150, 5)
Nombre de FDs : 4
Taille moyenne du LHS : 3.0

Fréquence des attributs (LHS) :
  petal_length : 3
  sepal_width : 3
  sepal_length : 3
  petal_width : 3

Fréquence des attributs (RHS) :
  class : 4

Dépendances fonctionnelles :
  petal_length, sepal_width, sepal_length -> class
  petal_width, sepal_width, sepal_length -> class
  petal_width, petal_length, sepal_length -> class
  petal_width, petal_length, sepal_width -> class

FDs triviales ou suspectes :
  Aucune

DATASET : ADULT
Shape : (32561, 15)
Nombre de FDs : 0


ZeroDivisionError: division by zero

In [16]:
import pandas as pd
from collections import Counter
from pathlib import Path

# =========================
# CONFIGURATION DES DATASETS
# =========================
DATASETS = {
    "iris": {
        "data_file": "../data/iris/iris.data",
        "fd_file": "../data/iris/iris_fds.txt",
        "names_file": "../data/iris/iris.names"
    },

    "adult": {
        "data_file": "../data/adult/adult.data",
        "fd_file": "../data/adult/adult_fds.txt",
        # Noms de colonnes hardcodés pour Adult
        "columns": [
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education_num",
            "marital_status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital_gain",
            "capital_loss",
            "hours_per_week",
            "native_country",
            "income"
        ]
    }
}

# =========================
# EXTRACTION DES COLONNES POUR IRIS
# =========================
def extract_columns_from_names(names_file):
    columns = []
    in_attr_section = False
    with open(names_file, "r") as f:
        for line in f:
            line = line.strip()
            # Début de la section Attribute Information
            if "Attribute Information" in line or "Attribute information" in line:
                in_attr_section = True
                continue
            if in_attr_section:
                if line == "" or line.startswith("Summary Statistics") or line.startswith("Missing"):
                    break
                match = re.match(r"^\d+\.\s*([a-zA-Z0-9_\- ]+)", line)
                if match:
                    col = match.group(1)
                    col = col.lower().replace(" ", "_").replace("-", "_")
                    columns.append(col)
                if ">50K" in line or "<=50K" in line:
                    if "income" not in columns:
                        columns.append("income")
    return columns

# =========================
# FONCTION DE LECTURE DES FDs (TANE)
# =========================
def load_fds(fd_file, column_names):
    fds = []
    if not Path(fd_file).exists():
        return fds
    with open(fd_file, "r") as f:
        in_results = False
        for line in f:
            line = line.strip()
            if line.startswith("# RESULTS"):
                in_results = True
                continue
            if in_results and "->" in line:
                lhs, rhs = line.split("->")
                lhs_idx = [int(i) - 1 for i in lhs.split(",")]
                rhs_idx = int(rhs) - 1
                lhs_names = [column_names[i] for i in lhs_idx]
                rhs_name = column_names[rhs_idx]
                fds.append((lhs_names, rhs_name))
    return fds

# =========================
# ANALYSE POUR CHAQUE DATASET
# =========================
for dataset_name, cfg in DATASETS.items():

    print("\n" + "=" * 60)
    print(f"DATASET : {dataset_name.upper()}")
    print("=" * 60)

    # --- 1. Déterminer les colonnes ---
    if dataset_name == "iris":
        columns = extract_columns_from_names(cfg["names_file"])
    else:
        columns = cfg["columns"]

    print("Colonnes :", columns)

    # --- 2. Charger le dataset ---
    df = pd.read_csv(cfg["data_file"], names=columns)
    print("Shape :", df.shape)

    # --- 3. Charger les FDs ---
    fds = load_fds(cfg["fd_file"], columns)
    print("Nombre de FDs :", len(fds))

    # --- 4. Taille moyenne du LHS ---
    lhs_sizes = [len(lhs) for lhs, _ in fds]
    if len(lhs_sizes) > 0:
        avg_lhs_size = sum(lhs_sizes) / len(lhs_sizes)
        print("Taille moyenne du LHS :", round(avg_lhs_size, 2))
    else:
        print("Taille moyenne du LHS : N/A (aucune FD trouvée)")

    # --- 5. Fréquence des attributs ---
    lhs_counter = Counter()
    rhs_counter = Counter()
    for lhs, rhs in fds:
        lhs_counter.update(lhs)
        rhs_counter.update([rhs])

    print("\nFréquence des attributs (LHS) :")
    for attr, count in lhs_counter.items():
        print(f"  {attr} : {count}")

    print("\nFréquence des attributs (RHS) :")
    for attr, count in rhs_counter.items():
        print(f"  {attr} : {count}")

    # --- 6. Affichage des FDs ---
    print("\nDépendances fonctionnelles :")
    if fds:
        for lhs, rhs in fds:
            print(f"  {', '.join(lhs)} -> {rhs}")
    else:
        print("  Aucune FD trouvée")

    # --- 7. FDs triviales / suspectes ---
    print("\nFDs triviales ou suspectes :")
    found = False
    for lhs, rhs in fds:
        if len(lhs) == 1 and lhs[0].lower() in ["id", "index"]:
            print(f"  Triviale : {lhs} -> {rhs}")
            found = True
        elif len(lhs) > 3:
            print(f"  LHS très grande : {lhs} -> {rhs}")
            found = True
    if not found:
        print("  Aucune")



DATASET : IRIS
Colonnes : ['sepal_length_in_cm', 'sepal_width_in_cm', 'petal_length_in_cm', 'petal_width_in_cm', 'class']
Shape : (150, 5)
Nombre de FDs : 4
Taille moyenne du LHS : 3.0

Fréquence des attributs (LHS) :
  petal_length_in_cm : 3
  sepal_width_in_cm : 3
  sepal_length_in_cm : 3
  petal_width_in_cm : 3

Fréquence des attributs (RHS) :
  class : 4

Dépendances fonctionnelles :
  petal_length_in_cm, sepal_width_in_cm, sepal_length_in_cm -> class
  petal_width_in_cm, sepal_width_in_cm, sepal_length_in_cm -> class
  petal_width_in_cm, petal_length_in_cm, sepal_length_in_cm -> class
  petal_width_in_cm, petal_length_in_cm, sepal_width_in_cm -> class

FDs triviales ou suspectes :
  Aucune

DATASET : ADULT
Colonnes : ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
Shape : (32561, 15)
Nombre de FDs : 0
Taille moyenne du LHS : N/A