# Bim_Predict NoteBook

## Importing Libraries

In [None]:
# Import libraries
import os
import pandas as pd

# Define project folder paths
# Data directories
BASE_DIR = "../../"
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw_data")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed_data")
PREDICTED_DATA_DIR = os.path.join(DATA_DIR, "predicting_data")
TESTING_DATA_DIR = os.path.join(DATA_DIR, "testing_data")

# Model directories
MODELS_DIR = os.path.join(BASE_DIR, "models")
ML_MODELS_DIR = os.path.join(MODELS_DIR, "SK/machine_learning")
DL_MODELS_DIR = os.path.join(MODELS_DIR, "SK/deep_learning")
OTHER_MODELS_DIR = os.path.join(MODELS_DIR, "SK/other")

# Python modules and plots directories
PYTHON_MODULES_DIR = os.path.join(BASE_DIR, "python_modules")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")

# List of directories to create
directories = [
    RAW_DATA_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR,
    MODELS_DIR, ML_MODELS_DIR, DL_MODELS_DIR, OTHER_MODELS_DIR,
    PYTHON_MODULES_DIR, PLOTS_DIR
]

# Create directories if they don't exist
for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")
    else:
        print(f"Directory already exists: {directory}")

<!-- ### Paths Creating && Data Importing -->

In [None]:
import os
import pandas as pd

# List all Excel files in RAW_DATA_DIR
excel_files = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith(".xlsx") or f.endswith(".xls")]

# Dictionary to store DataFrames for each file and sheet
dataframes = {}

# Process each Excel file
for file in excel_files:
    file_path = os.path.join(RAW_DATA_DIR, file)
    print(f"Loading: {file_path}")

    try:
        # Load Excel file
        excel_data = pd.ExcelFile(file_path)

        # Load all sheets dynamically
        for sheet_name in excel_data.sheet_names:
            df = excel_data.parse(sheet_name)

            # Save DataFrame with a unique identifier
            dataframes[f"{file}_{sheet_name}"] = df

    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Display summary of loaded data
print(f"\nTotal files processed: {len(dataframes)}")
for key, df in dataframes.items():
    print(f"Loaded DataFrame: {key}, Shape: {df.shape}")

<!-- ### Data Cleaning && PreProcessing -->

## PreProcessing Data

In [None]:
# Define required columns dynamically
required_columns = {
    "Murs": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Hauteur",
             "Epaisseur", "AI", "AS", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
             "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol au-dessous", "Fenêtres", "Portes",
             "Ouvertures", "Murs imbriqués", "Mur multicouche", "Mur empilé", "Profil modifié", "Extension inférieure",
             "Extension supérieure", "Partie inférieure attachée", "Partie supérieure attachée", "Décalage supérieur",
             "Décalage inférieur", "Matériau structurel"],

    "Sols": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Murs en intersection",
             "Murs coupés (u)", "Murs coupés (Ids)", "Murs coupants (u)", "Murs coupants (Ids)", "Poutres en intersection",
             "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)",
             "Poteaux en intersection", "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)",
             "Poteaux coupants (Ids)", "Ouvertures", "Sol multicouche", "Profil modifié", "Décalage par rapport au niveau",
             "Epaisseur", "Lié au volume", "Etude de l'élévation à la base", "Etude de l'élévation en haut",
             "Epaisseur du porteur", "Elévation au niveau du noyau inférieur", "Elévation au niveau du noyau supérieur",
             "Elévation en haut", "Elévation à la base", "Matériau structurel"],

    "Poutres": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS",
                "Hauteur totale", "Hauteur", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
                "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol au-dessous", "Poteaux en intersection",
                "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)", "Poteaux coupants (Ids)",
                "Etat de la jonction", "Valeur de décalage Z", "Justification Z", "Valeur de décalage Y", "Justification Y",
                "Justification YZ", "Matériau structurel", "Elévation du niveau de référence", "Elévation en haut",
                "Rotation de la section", "Orientation", "Décalage du niveau d'arrivée", "Décalage du niveau de départ",
                "Elévation à la base", "Longueur de coupe", "Longueur", "hauteur_section", "largeur_section"],

    "Poteaux": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS",
                "Hauteur", "Longueur", "Partie inférieure attachée", "Partie supérieure attachée", "Sols en intersection",
                "Sols coupés (u)", "Sols coupés (Ids)", "Sols coupants (u)", "Sols coupants (Ids)", "Poutres en intersection",
                "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)",
                "Matériau structurel", "Décalage supérieur", "Décalage inférieur", "Diamètre poteau", "h", "b",
                "hauteur_section", "largeur_section"]
}

# Initialize a dictionary to store filtered dataframes
cleaned_dataframes = {}

for df_name, df in dataframes.items():
    print(f"\n🟢 Original shape of {df_name}: {df.shape}")

    # Automatically detect the correct category for filtering
    for category, columns in required_columns.items():
        if category.lower() in df_name.lower():  # Match dynamically
            try:
                filtered_df = df[columns]  # Keep only the required columns
            except KeyError as e:
                missing_columns = set(columns) - set(df.columns)
                print(f"⚠️ Missing columns in {df_name}: {missing_columns}. Skipping this dataframe.")
                continue
            cleaned_dataframes[df_name] = filtered_df
            print(f"✅ Shape after filtering {df_name}: {filtered_df.shape}")
            break  # Stop looping once the correct match is found
    else:
        print(f"⚠️ No matching category for {df_name}, skipping filtering.")

# # Add prefixes to column names based on the dataframe category and update index
# for name, df in cleaned_dataframes.items():
#     if "murs" in name.lower():
#         prefix = "murs_"
#     elif "sols" in name.lower():
#         prefix = "sols_"
#     elif "poutres" in name.lower():
#         prefix = "poutres_"
#     elif "poteaux" in name.lower():
#         prefix = "poteaux_"
#     else:
#         prefix = ""

#     # Rename columns with the prefix
#     df.rename(columns=lambda col: f"{prefix}{col}" if col.lower() != "id" else f"{prefix}id", inplace=True)

#     # Drop the existing index and set the prefixed ID column as the new index
#     id_column = f"{prefix}id"
#     if id_column in df.columns:
#         df.set_index(id_column, inplace=True)
#         print(f"✅ Set '{id_column}' as index for {name}.")
#     else:
#         print(f"⚠️ '{id_column}' column not found in {name}, skipping index setting.")

    # Update the cleaned_dataframes dictionary
    # cleaned_dataframes[df_name] = df

In [None]:
print("\n📊 Cleaned DataFrames:")
for df_name, df in cleaned_dataframes.items():
    print(f" - {df_name}: {df.shape}")


In [None]:
# def map_feature_names(cleaned_dataframes, required_columns):
#     """Maps cleaned dataframe column names to match required training feature names."""
#     mapped_dataframes = {}

#     for df_name, df in cleaned_dataframes.items():
#         for category, expected_columns in required_columns.items():
#             if category.lower() in df_name.lower():  # Match dynamically
#                 # Create mapping: {cleaned_col_name: expected_col_name}
#                 col_mapping = {cleaned_col: expected_col for cleaned_col in df.columns for expected_col in expected_columns if cleaned_col.lower() == expected_col.lower()}

#                 # Apply mapping to rename columns
#                 df_mapped = df.rename(columns=col_mapping)

#                 print(f"✅ Feature names mapped for {df_name}")
#                 mapped_dataframes[df_name] = df_mapped
#                 break  # Stop looping once category is matched

#     return mapped_dataframes

# # Example usage:
# mapped_dataframes = map_feature_names(cleaned_dataframes, required_columns)

In [None]:
# import re

# # Ensure all column names are lowercase and replace spaces & special characters
# def clean_column_names(df):
#     df.columns = df.columns.str.lower().str.replace(r"\s+", "_", regex=True).str.replace(r"[^\w_]", "", regex=True)
#     return df

# # Apply cleaning dynamically to filtered DataFrames instead of the original `dataframes`
# cleaned_dataframes = {name: clean_column_names(df) for name, df in cleaned_dataframes.items()}

# print("✅ Column names cleaned successfully across all cleaned dataframes!")

# # Ensure duplicates & missing values are removed while storing cleaned versions
# final_cleaned_dataframes = {}

# for df_name, df in cleaned_dataframes.items():
#     print(f"\n🟢 Processing {df_name}...")

#     # Make a copy to prevent unintended modifications
#     df = df.copy()

#     # Display initial shape
#     initial_shape = df.shape
#     print(f"📌 Initial shape: {initial_shape}")

#     # Remove duplicate rows
#     duplicates = df.duplicated().sum()
#     if duplicates > 0:
#         print(f"⚠️ Found {duplicates} duplicate rows. Removing...")
#         df.drop_duplicates(inplace=True)
#     else:
#         print("✅ No duplicate rows found.")

#     # Detect & drop columns with 100% missing values, except specific columns
#     missing_cols = df.columns[df.isnull().mean() == 1]
#     exception_keywords = ["coupés", "coupants", "011ec_lot", "012ec_ouvrage", "013ec_localisation", "014ec_mode_constructif"]
#     cols_to_keep = [col for col in missing_cols if any(keyword in col.lower() for keyword in exception_keywords)]
#     cols_to_drop = [col for col in missing_cols if col not in cols_to_keep]

#     if len(cols_to_drop) > 0:
#         print(f"⚠️ Dropping {len(cols_to_drop)} completely empty columns: {list(cols_to_drop)}")
#         df.drop(columns=cols_to_drop, inplace=True)
#     else:
#         print("✅ No fully missing columns detected (or all are exceptions).")

#     # Store final cleaned DataFrame
#     final_cleaned_dataframes[df_name] = df

#     # Display final shape after cleaning
#     final_shape = df.shape
#     print(f"📌 Final shape after cleaning: {final_shape}")

In [None]:
# TARGET_COLUMNS = ['011ec_lot', '012ec_ouvrage', '013ec_localisation', '014ec_mode_constructif']
# # Identify target columns dynamically across all DataFrames
# target_columns_found = set()

# for df_name, df in cleaned_dataframes.items():
#     print(f"\nProcessing dataframe: {df_name}")
#     initial_shape = df.shape  # Store the initial shape of the dataframe

#     # Check for missing target columns
#     for target in TARGET_COLUMNS:
#         target_column_name = f"{df_name.split('_')[-1].lower()}_{target.lower()}"  # Respect naming policy
#         if target_column_name not in df.columns:
#             print(f"⚠️ Target column '{target_column_name}' does not exist in dataframe '{df_name}'.")
#             # Add the missing target column with default values (e.g., NaN)
#             df[target_column_name] = float('nan')
#             print(f"✅ Added missing target column '{target_column_name}' to dataframe '{df_name}'.")

#     final_shape = df.shape  # Store the final shape of the dataframe
#     if initial_shape != final_shape:
#         print(f"📊 Shape before: {initial_shape}, Shape after: {final_shape}")

#     # Display the target columns in the dataframe
#     target_columns_in_df = [col for col in df.columns if any(target.lower() in col.lower() for target in TARGET_COLUMNS)]
#     print(f"🎯 Target columns in '{df_name}': {target_columns_in_df}")

#     # Update the cleaned_dataframes dictionary
#     cleaned_dataframes[df_name] = df

# # Display all detected target columns across datasets
# print(f"\nTarget columns detected across datasets: {target_columns_found}")


In [None]:
# Remove duplicates and 100% missing columns (with exceptions)
def process_dataframe(df_name, exception_keywords):
    # Access the dataframe from cleaned_dataframes
    df = cleaned_dataframes[df_name].copy()

    # Remove duplicate rows
    initial_shape = df.shape
    df.drop_duplicates(inplace=True)
    duplicates_removed = initial_shape[0] - df.shape[0]
    if duplicates_removed > 0:
        print(f"🟢 Removed {duplicates_removed} duplicate rows from {df_name}.")
    else:
        print(f"✅ No duplicate rows found in {df_name}.")

    # Identify fully missing columns
    missing_cols = df.columns[df.isnull().mean() == 1]
    # Keep columns that contain an exception keyword or are in target_columns intact
    cols_to_drop = [
        col for col in missing_cols
        if not any(keyword in col.lower() for keyword in exception_keywords)
    ]

    if cols_to_drop:
        print(f"🟠 Dropping fully missing columns from {df_name}: {cols_to_drop}")
        df.drop(columns=cols_to_drop, inplace=True)
    else:
        print(f"✅ No columns dropped from {df_name}; all fully missing columns are exceptions.")

    # Fill missing values in columns containing "coupé" or "coupants"
    columns_to_fill = [col for col in df.columns if "coupé" in col.lower() or "coupants" in col.lower()]
    if columns_to_fill:
        print(f"🔵 Filling missing values with 0 for columns in {df_name}: {columns_to_fill}")
        df[columns_to_fill] = df[columns_to_fill].fillna(0)

    return df

# Apply processing on all dataframes in cleaned_dataframes
exception_keywords = ["coupés", "coupants"]

processed_dataframes = {}
for df_name in cleaned_dataframes.keys():
    print(f"\n🔍 Processing dataframe: {df_name}")
    processed_dataframes[df_name] = process_dataframe(df_name, exception_keywords)
    final_shape = processed_dataframes[df_name].shape
    print(f"✅ Final shape of {df_name}: {final_shape}")

In [None]:
# Identify target columns dynamically across all DataFrames
TARGET_COLUMNS = ['011EC_Lot', '012EC_Ouvrage', '013EC_Localisation', '014EC_Mode Constructif']

# Check and add missing target columns
for df_name, df in processed_dataframes.items():
    print(f"\nProcessing dataframe: {df_name}")
    initial_shape = df.shape  # Store the initial shape of the dataframe

    for target in TARGET_COLUMNS:
        if target in df.columns:
            print(f"✅ Target column '{target}' found in dataframe '{df_name}'.")

            # Check for missing data in the target column
            missing_count = df[target].isnull().sum()
            total_count = len(df)
            missing_percentage = (missing_count / total_count) * 100
            if missing_count > 0:
                print(f"⚠️ Target column '{target}' has {missing_count} missing values ({missing_percentage:.2f}%).")

                # Drop rows if missing data is less than 10%
                if missing_percentage < 10:
                    df = df[df[target].notnull()]
                    print(f"✅ Dropped rows with missing values in '{target}' (less than 10%).")
            else:
                print(f"✅ Target column '{target}' has no missing values.")
        else:
            print(f"⚠️ Target column '{target}' does not exist in dataframe '{df_name}'. Adding it...")
            # Add the missing target column with default values (e.g., NaN)
            df[target] = float('nan')
            print(f"✅ Added missing target column '{target}' to dataframe '{df_name}'.")

    final_shape = df.shape  # Store the final shape of the dataframe
    if initial_shape != final_shape:
        print(f"📊 Shape before: {initial_shape}, Shape after: {final_shape}")

    # Update the cleaned_dataframes dictionary
    cleaned_dataframes[df_name] = processed_dataframes[df_name]


<!-- ### Exploratory Data Analysis (EDA) -->

In [None]:
# Ensure missing values are filled in the processed datasets unless in TARGET_COLUMNS
for df_name, df in cleaned_dataframes.items():
    print(f"\n🟢 Filling missing values for {df_name}...")

    # Display shape before filling missing values
    initial_shape = df.shape
    print(f"📌 Initial shape before filling NaN: {initial_shape}")

    # Fill missing values with 0 for non-target columns
    non_target_columns = [col for col in df.columns if col not in TARGET_COLUMNS]
    df[non_target_columns] = df[non_target_columns].fillna(0)

    # Store updated dataframe back
    cleaned_dataframes[df_name] = df

    # Display shape after processing
    final_shape = df.shape
    print(f"✅ Final shape after filling NaN: {final_shape}")

print("🚀 Missing values successfully handled across all datasets!")

## EDA - Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Function to remove low-variance & highly correlated features
def optimize_feature_selection(df, variance_threshold=0.02, correlation_threshold=0.98):
    print(f"\n🔍 Processing {df.shape[0]} rows & {df.shape[1]} columns")

    # Step 1: Remove Low-Variance Features
    selector = VarianceThreshold(variance_threshold)
    numeric_df = df.select_dtypes(include=["number"])  # Focus only on numerical columns
    selector.fit(numeric_df)

    low_variance_cols = numeric_df.columns[~selector.get_support()]
    keep_cols = [col for col in low_variance_cols if any(keyword in col.lower() for keyword in ["coupés", "coupants"])]
    drop_cols = [col for col in low_variance_cols if col not in keep_cols and col not in TARGET_COLUMNS]

    df.drop(columns=drop_cols, inplace=True)
    print(f"⚠️ Dropped {len(drop_cols)} low-variance columns (excluding 'coupés' and target columns): {drop_cols}")

    # Step 2: Remove Highly Correlated Features
    numeric_df = df.select_dtypes(include=["number"])
    correlation_matrix = numeric_df.corr().abs()
    upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
    correlated_features = [
        col for col in upper_triangle.columns
        if any(upper_triangle[col] > correlation_threshold) and col not in TARGET_COLUMNS
    ]

    df.drop(columns=correlated_features, inplace=True)
    print(f"⚠️ Dropped {len(correlated_features)} highly correlated columns (excluding target columns): {correlated_features}")

    print(f"✅ Final shape after filtering: {df.shape}")
    return df

# Apply optimized feature selection to all datasets
final_cleaned_dataframes = {name: optimize_feature_selection(df) for name, df in cleaned_dataframes.items()}

print("🚀 Optimized feature selection completed successfully!")

In [None]:
# Display basic statistics for all cleaned sheets
for df_name, df in final_cleaned_dataframes.items():
    print(f"\nSummary statistics for {df_name}:")

    print(df.describe())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms for numerical columns
for df_name, df in final_cleaned_dataframes.items():
    df.hist(figsize=(15,10), bins=20)
    plt.suptitle(f"Distribution of Features in {df_name}")
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Compute correlation matrices for numeric columns
for df_name, df in final_cleaned_dataframes.items():
    numeric_df = df.select_dtypes(include=["number"])
    correlation_matrix = numeric_df.corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
    plt.title(f"Correlation Matrix for {df_name}")
    plt.show()

In [None]:
import os

# Ensure base plots directory exists
if not os.path.exists(PLOTS_DIR):
    os.makedirs(PLOTS_DIR)

# Function to generate subfolder paths for each Excel file
def get_plot_subfolder(file_name):
    subfolder_name = f"{file_name.replace('.xlsx', '').replace('.xls', '')}_Plots"
    subfolder_path = os.path.join(PLOTS_DIR, subfolder_name)

    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder_path):
        os.makedirs(subfolder_path)

    return subfolder_path

# Save histograms
for df_name, df in cleaned_dataframes.items():
    plt.figure(figsize=(15, 10))
    df.hist(bins=20)
    plt.suptitle(f"Distribution of Features in {df_name}")

    # Extract the corresponding Excel file name
    file_name = df_name.split("_")[0]  # Extracts RawData_Cibles.xlsx from "RawData_Cibles.xlsx_Murs"
    plot_subfolder = get_plot_subfolder(file_name)

    # Define save path
    plot_path = os.path.join(plot_subfolder, f"{df_name}_histogram.png")
    plt.savefig(plot_path)
    print(f"Saved histogram in: {plot_path}")

    plt.close()

# Save correlation matrices
for df_name, df in cleaned_dataframes.items():
    numeric_df = df.select_dtypes(include=["number"])
    correlation_matrix = numeric_df.corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
    plt.title(f"Correlation Matrix for {df_name}")

    # Extract Excel file name and subfolder
    file_name = df_name.split("_")[0]
    plot_subfolder = get_plot_subfolder(file_name)

    # Define save path
    plot_path = os.path.join(plot_subfolder, f"{df_name}_correlation.png")
    plt.savefig(plot_path)
    print(f"Saved correlation matrix in: {plot_path}")

    plt.close()

<!-- ### Feature Selection -->

In [None]:
# %pip install ipywidgets

In [None]:
# Identify target columns dynamically across all DataFrames
target_columns_found = set()
for df_name, df in final_cleaned_dataframes.items():
    found_targets = [
        col for col in df.columns
        if any(target.lower() in col.lower() for target in TARGET_COLUMNS)
    ]
    target_columns_found.update(found_targets)

print(f"\nTarget columns detected across datasets: {target_columns_found}")

In [None]:
import os

# Function to generate subfolder paths for storing SHAP plots
def get_plot_subfolder(file_name):
    subfolder_name = f"{file_name}_Plots"
    subfolder_path = os.path.join(PLOTS_DIR, subfolder_name)

    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder_path):
        os.makedirs(subfolder_path)

    return subfolder_path

# ✅ Ensure SHAP is applied to the fully processed dataset
final_shap_dataframes = final_cleaned_dataframes  # Use the cleaned dataset after variance/correlation removal

print("🚀 SHAP analysis will now use the final processed data!")

for target_column in TARGET_COLUMNS:
    for df_name, df in final_cleaned_dataframes.items():
        # Check if any column in the dataframe contains the target column name as a substring
        matching_columns = [col for col in df.columns if target_column in col]
        if matching_columns:
            unique_values = df[matching_columns[0]].nunique()
            print(f"{target_column} in {df_name} has {unique_values} unique values.")
        else:
            print(f"{target_column} not found in {df_name}.")

In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import os

# Ensure SHAP initializes properly
shap.initjs()

# Function to create subfolder for SHAP plots
def get_plot_subfolder(file_name):
    subfolder_name = f"{file_name}_Plots"
    subfolder_path = os.path.join("plots", subfolder_name)  # Adjust path as needed

    # Create subfolder if it doesn’t exist
    if not os.path.exists(subfolder_path):
        os.makedirs(subfolder_path)

    return subfolder_path

# Loop through all fully processed dataframes
for df_name, df in final_cleaned_dataframes.items():  # ✅ Use fully processed dataset
    print(f"\nProcessing SHAP for {df_name}...")

    # Identify available target columns in the current dataframe
    # Identify target columns by checking if any target column is a substring of the dataframe's columns
    existing_target_columns = target_columns_found

    if existing_target_columns:
        print(f"🎯 Target columns found in {df_name}: {existing_target_columns}")

        for target_column in existing_target_columns:
            print(f"🔍 Analyzing SHAP for target: {target_column}")

            # Display shape before training
            initial_shape = df.shape
            print(f"📌 Initial shape before SHAP processing: {initial_shape}")

            # Prepare the data
            X = df.drop(columns=existing_target_columns)  # Exclude target columns from features

            # Convert categorical columns in X to numeric
            for col in X.select_dtypes(include=["object"]).columns:
                X[col] = X[col].astype("category").cat.codes

            # Convert target column to numeric
            y = df[target_column].astype("category").cat.codes

            # Train RandomForestClassifier
            model = RandomForestClassifier()
            model.fit(X, y)

            # Compute SHAP values
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X)

            # Verify SHAP output shape before plotting
            print(f"📊 SHAP values shape: {len(shap_values)}, Feature matrix shape: {X.shape}")
            if isinstance(shap_values, list):
                shap_values = shap_values[0]  # Use first class for visualization in multi-class models

            # Create subfolder for SHAP plots
            plot_subfolder = get_plot_subfolder(f"SHAP_{df_name}")
            plot_path = os.path.join(plot_subfolder, f"{target_column}_SHAP.png")

            # Display & save SHAP summary plot
            shap.summary_plot(shap_values, X, show=False)
            plt.savefig(plot_path)
            plt.show()

            print(f"✅ Saved SHAP plot for {target_column} in: {plot_path}")

            # Display shape after SHAP analysis
            final_shape = X.shape
            print(f"📌 Final shape after SHAP processing: {final_shape}")

    else:
        print(f"⚠️ No valid target columns found in {df_name}. Skipping...")

<!-- ## Training and testing  -->

<!-- Where to Go from Here?
🔹 Feature engineering: If accuracy is low, refine features further
🔹 Hyperparameter tuning: Optimize n_estimators, max_depth, etc.
🔹 Compare multiple models: Try XGBoost or SVM for better performance

Your Next Step
Run the model training and evaluation, then let me know if you'd like tuning suggestions or deeper insights! 🚀🔥
This is getting exciting—you're building something powerful! 💡 -->


## Machine Learning Models

In [None]:
# Iterate over each dataframe in the dictionary and print its dtypes
for df_name, df in final_cleaned_dataframes.items():
	print(f"Dtypes for {df_name}:")
	print(df.dtypes)
	print("\n")

In [None]:
import pandas as pd

# Function to convert ID strings into a numeric count feature
def count_ids(id_string):
    """Convert string of IDs into a numeric count."""
    return len(id_string.split(",")) if isinstance(id_string, str) else 0

# Apply processing to fully cleaned datasets
for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🔄 Processing ID count transformation for {df_name}...")

    # Identify relevant ID columns
    id_columns = [col for col in df.columns if any(keyword in col.lower() for keyword in ["coupés_(ids)", "coupants_(ids)"])]

    if id_columns:
        print(f"📌 Found ID columns: {id_columns}")

        # Transform ID columns into numeric count and drop originals
        df[[f"{col}_count" for col in id_columns]] = df[id_columns].applymap(count_ids)
        df.drop(columns=id_columns, inplace=True)  # Remove original text-based ID columns

    # Ensure only ID-related columns are converted to numeric
    df[id_columns] = df[id_columns].apply(pd.to_numeric, errors="coerce").fillna(0)

    # Store the updated dataframe
    final_cleaned_dataframes[df_name] = df

    print(f"✅ Final shape after ID count transformation: {df.shape}")

print("🚀 ID count transformation completed successfully!")

In [None]:
# Iterate over each dataframe in the dictionary and print its dtypes
for df_name, df in final_cleaned_dataframes.items():
	print(f"Dtypes for {df_name}:")
	print(df.dtypes)
	print("\n")

In [None]:
for df_name, df in final_cleaned_dataframes.items():
    categorical_cols = df.select_dtypes(include=["object"]).columns
    print(f"\n📌 {df_name} - Categorical Columns Before Encoding: {categorical_cols.tolist()}")

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Dictionaries to store encoders
feature_encoders = {}  # Stores encoders for feature columns
target_encoders = {}  # Stores encoders for target columns

# Encode features and targets separately
for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🔄 Encoding categorical features for {df_name}...")

    # Identify categorical columns again after ID transformation
    categorical_cols = df.select_dtypes(include=["object"]).columns
    target_cols = [col for col in categorical_cols if col in TARGET_COLUMNS]
    feature_cols = [col for col in categorical_cols if col not in TARGET_COLUMNS]

    # Encode target columns
    for col in target_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        target_encoders[f"{df_name}_{col}"] = encoder
        print(f"✅ Stored Target Encoder for {df_name} - {col}")

    # Encode feature columns
    one_hot_cols = []
    for col in feature_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        feature_encoders[f"{df_name}_{col}"] = encoder
        print(f"✅ Stored Feature Encoder for {df_name} - {col}")
        one_hot_cols.append(col)  # Mark column for One-Hot Encoding if needed

    # Apply One-Hot Encoding only to select categorical variables
    if one_hot_cols:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        encoded_values = encoder.fit_transform(df[one_hot_cols])

        # Create a new DataFrame with proper column names
        encoded_df = pd.DataFrame(encoded_values, index=df.index, columns=encoder.get_feature_names_out(one_hot_cols))

        # Remove original one-hot columns and add encoded features
        df.drop(columns=one_hot_cols, inplace=True)
        df = pd.concat([df, encoded_df], axis=1)

    # Save updated dataframe
    final_cleaned_dataframes[df_name] = df

    print(f"✅ Successfully encoded categorical features for {df_name}. New shape: {df.shape}")

print("🚀 Final categorical encoding applied successfully across all datasets!")

In [None]:
for df_name, df in final_cleaned_dataframes.items():
    categorical_cols = df.select_dtypes(include=["object"]).columns
    print(f"\n📌 {df_name} - Categorical Columns After Encoding: {categorical_cols.tolist()}")

In [None]:
# print(train_test_data.keys())

In [None]:
# # RandomForestClassifier by Excel

# import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# # Train and evaluate models on each dataset
# model_results = {}

# for key, (X_train, X_test, y_train, y_test) in train_test_data.items():
#     print(f"\n🚀 Training model for {key}...")

#     # Train a Random Forest model
#     model = RandomForestClassifier(n_estimators=100, random_state=42)
#     model.fit(X_train, y_train)

#     # Predictions
#     y_train_pred = model.predict(X_train)  # Predictions on training data
#     y_test_pred = model.predict(X_test)  # Predictions on test data

#     # Evaluate accuracy
#     train_accuracy = accuracy_score(y_train, y_train_pred)
#     test_accuracy = accuracy_score(y_test, y_test_pred)
#     model_results[key] = (train_accuracy, test_accuracy)

#     print(f"📊 Training Accuracy for {key}: {train_accuracy:.4f}")
#     print(f"🎯 Test Accuracy for {key}: {test_accuracy:.4f}")

# # Plot Learning Curve
# plt.figure(figsize=(10, 5))

# train_accs = [train_accuracy for train_accuracy, _ in model_results.values()]
# test_accs = [test_accuracy for _, test_accuracy in model_results.values()]
# datasets = list(model_results.keys())

# plt.plot(datasets, train_accs, marker='o', label="Training Accuracy", color="blue")
# plt.plot(datasets, test_accs, marker='s', label="Test Accuracy", color="red")

# plt.xticks(rotation=90)
# plt.ylabel("Accuracy")
# plt.title("Model Learning Curve Across Datasets")
# plt.legend()
# plt.grid(True)
# plt.show()

In [None]:
# import pandas as pd
# import numpy as np
# import os
# import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
# from sklearn.metrics import accuracy_score
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler

# # Ensure necessary directories exist
# os.makedirs('data/processed_data', exist_ok=True)

# def train_random_forest(X_combined, y_combined):
#     """Trains a single Random Forest model on processed data with evaluation plots."""
#     print("\n🔍 Checking for missing values...")
#     imputer = SimpleImputer(strategy='mean')
#     X_combined = pd.DataFrame(imputer.fit_transform(X_combined), columns=X_combined.columns)
#     X_combined.dropna(inplace=True)
#     y_combined.dropna(inplace=True)

#     # Apply feature scaling
#     scaler = StandardScaler()
#     X_combined = pd.DataFrame(scaler.fit_transform(X_combined), columns=X_combined.columns)

#     # Split dataset
#     X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

#     print(f"\n🚀 Training on merged dataset with {X_train.shape[0]} samples...")

#     # Train Random Forest model
#     model = RandomForestClassifier(n_estimators=200, random_state=42)
#     model.fit(X_train, y_train)

#     # Predictions
#     y_train_pred = model.predict(X_train)
#     y_test_pred = model.predict(X_test)

#     # Evaluate accuracy
#     train_accuracy = accuracy_score(y_train, y_train_pred)
#     test_accuracy = accuracy_score(y_test, y_test_pred)

#     print(f"📊 Training Accuracy: {train_accuracy:.4f}")
#     print(f"🎯 Test Accuracy: {test_accuracy:.4f}")

#     # Plot Accuracy Comparison
#     plt.figure(figsize=(6, 4))
#     plt.bar(["Train Accuracy", "Test Accuracy"], [train_accuracy, test_accuracy], color=['blue', 'red'])
#     plt.ylabel("Accuracy")
#     plt.title("Model Performance - Random Forest")
#     plt.grid(True)
#     plt.show()

#     # Generate Learning Curve
#     train_sizes, train_scores, test_scores = learning_curve(
#         model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1,
#         train_sizes=np.linspace(0.1, 1.0, 5)
#     )

#     # Compute mean and standard deviation
#     train_mean = np.mean(train_scores, axis=1)
#     train_std = np.std(train_scores, axis=1)
#     test_mean = np.mean(test_scores, axis=1)
#     test_std = np.std(test_scores, axis=1)

#     # Plot Learning Curve
#     plt.figure(figsize=(8, 5))
#     plt.plot(train_sizes, train_mean, 'o-', color="blue", label="Training Accuracy")
#     plt.plot(train_sizes, test_mean, 'o-', color="red", label="Validation Accuracy")
#     plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="blue")
#     plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color="red")
#     plt.xlabel("Training Set Size")
#     plt.ylabel("Accuracy")
#     plt.title("Learning Curve - Random Forest")
#     plt.legend(loc="best")
#     plt.grid()
#     plt.show()

# def process_data(cleaned_dataframes, target_columns):
#     """
#     From each cleaned df:
#       • extract y = df[target_columns]
#       • extract X = df.drop(columns=target_columns)
#     then stack them all into one big X, y pair.
#     """
#     X_parts = []
#     y_parts = []

#     for df_name, df in cleaned_dataframes.items():
#         # check that all target columns are present
#         missing = set(target_columns) - set(df.columns)
#         if missing:
#             print(f"⚠️  Skipping {df_name}: missing targets {missing}")
#             continue

#         # isolate X, y
#         y = df[target_columns]
#         X = df.drop(columns=target_columns)

#         print(f"✅  {df_name}: X shape {X.shape}, y shape {y.shape}")
#         X_parts.append(X)
#         y_parts.append(y)

#     if not X_parts:
#         raise ValueError("No dataframes contained all target columns!")

#     # concat all the pieces
#     X_combined = pd.concat(X_parts, axis=0, ignore_index=True)
#     y_combined = pd.concat(y_parts, axis=0, ignore_index=True)

#     print(f"\n📊 Combined dataset: X {X_combined.shape}, y {y_combined.shape}")
#     return X_combined, y_combined

# # Run training function
# X_combined, y_combined = process_data(final_cleaned_dataframes, TARGET_COLUMNS)

# # train & evaluate
# train_random_forest(X_combined, y_combined)

In [None]:
# import pandas as pd
# import numpy as np
# import os
# import matplotlib.pyplot as plt
# import seaborn as sns
# import joblib
# from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier

# # Create necessary directories
# os.makedirs('data/processed_data', exist_ok=True)

# # Define multiple ML models
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
#     "SVM": SVC(kernel="rbf", probability=True, random_state=42),
#     "KNN": KNeighborsClassifier(n_neighbors=3),
#     "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=5),
#     "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
#     "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=42),
#     "Gradient Boosting": GradientBoostingClassifier(n_estimators=50, random_state=42),
# }

# failed_models = []

# def process_data(final_cleaned_dataframes, TARGET_COLUMNS):
#     """Detects missing values, merges all datasets, resets index, and saves processed data."""
#     all_X = []
#     all_y = []

#     print("\n✅ Checking available dataframes and target columns...")
#     print("Dataframes found:", list(final_cleaned_dataframes.keys()))
#     print("Target columns expected:", TARGET_COLUMNS)

#     for df_name, df in final_cleaned_dataframes.items():
#         existing_target_columns = [col for col in df.columns if any(target in col for target in TARGET_COLUMNS)]

#         if not existing_target_columns:
#             print(f"⚠️ {df_name}: No matching target columns found.")
#             continue

#         print(f"\n🔍 Processing {df_name} - Found target columns: {existing_target_columns}")

#         for target_column in existing_target_columns:
#             print(f"📌 Processing data for target: {target_column}")

#             X = df.drop(columns=existing_target_columns)
#             y = df[target_column]

#             if y.nunique() == 1:
#                 print(f"⚠️ Skipping {df_name}_{target_column}: Only one class present.")
#                 continue

#             X = X.reset_index(drop=True)
#             y = y.reset_index(drop=True)

#             all_X.append(X)
#             all_y.append(y)

#     print(f"\n✅ Total datasets processed: {len(all_X)}")

#     if not all_X or not all_y:
#         msg = "🚨 No objects to concatenate. Check TARGET_COLUMNS or ensure target values vary."
#         print(msg)
#         raise ValueError(msg)

#     X_combined = pd.concat(all_X, axis=0).reset_index(drop=True)
#     y_combined = pd.concat(all_y, axis=0).reset_index(drop=True)

#     print(f"\n✅ Final merged dataset shape: {X_combined.shape}, {y_combined.shape}")

#     return X_combined, y_combined

# def train_models(X_combined, y_combined):
#     """Trains multiple ML models & evaluates performance."""
#     print("\n🔍 Checking for NaN values...")
#     imputer = SimpleImputer(strategy='mean')
#     X_combined = pd.DataFrame(imputer.fit_transform(X_combined), columns=X_combined.columns)
#     X_combined.dropna(inplace=True)
#     y_combined.dropna(inplace=True)

#     X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

#     model_results = {}

#     plt.figure(figsize=(8, 5))

#     for name, model in models.items():
#         print(f"\n🚀 Training {name}...")
#         try:
#             cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
#             model.fit(X_train, y_train)
#             y_pred = model.predict(X_test)

#             test_accuracy = accuracy_score(y_test, y_pred)
#             model_results[name] = test_accuracy

#             print(f"✅ {name}: Test Accuracy = {test_accuracy:.4f}")

#             # Learning Curve
#             train_sizes, train_scores, test_scores = learning_curve(
#                 model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1,
#                 train_sizes=np.linspace(0.1, 1.0, 5)
#             )

#             test_mean = np.mean(test_scores, axis=1)
#             test_std = np.std(test_scores, axis=1)

#             plt.plot(train_sizes, test_mean, marker='o', label=f"{name} (Acc: {test_mean[-1]:.2f})")
#             plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)

#         except Exception as e:
#             print(f"⚠️ Error training {name}: {e}")
#             failed_models.append(name)

#     plt.xlabel("Training Set Size")
#     plt.ylabel("Accuracy")
#     plt.title("Learning Curve - All Models")
#     plt.legend(loc="best")
#     plt.grid()
#     plt.show()

#     # Rank models
#     print("\n📊 Model Rankings by Test Accuracy:")
#     ranked_models = sorted(model_results.items(), key=lambda x: x[1], reverse=True)
#     ranking_df = pd.DataFrame(ranked_models, columns=["Model", "Test Accuracy"])
#     print(ranking_df.to_string(index=False))

#     # Save the top models
#     best_models = ranked_models[:2]
#     combined_X_train, combined_y_train = X_combined, y_combined

#     for name, acc in best_models:
#         model = models[name]
#         model.fit(combined_X_train, combined_y_train)
#         joblib.dump(model, f'models/machine_learning/{name.replace(" ", "_")}_combined.pkl')

#     print("\n🚀 Model evaluation, ranking, and saving completed!")
#     print(f"⚠️ Models that failed: {failed_models}")

# # Run the pipeline
# X_combined, y_combined = process_data(final_cleaned_dataframes, TARGET_COLUMNS)
# train_models(X_combined, y_combined)

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Create necessary directories
os.makedirs('data/processed_data', exist_ok=True)

# Define ML models (Updated Logistic Regression)
models = {
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=5000, random_state=42),
    "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=5),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=50, random_state=42),
}

failed_models = []

def process_data(final_cleaned_dataframes, TARGET_COLUMNS):
    """Detects missing values, merges all datasets, resets index, and saves processed data."""
    all_X = []
    all_y = []

    print("\n✅ Checking available dataframes and target columns...")
    print("Dataframes found:", list(final_cleaned_dataframes.keys()))
    print("Target columns expected:", TARGET_COLUMNS)

    for df_name, df in final_cleaned_dataframes.items():
        existing_target_columns = [col for col in df.columns if any(target in col for target in TARGET_COLUMNS)]

        if not existing_target_columns:
            print(f"⚠️ {df_name}: No matching target columns found.")
            continue

        print(f"\n🔍 Processing {df_name} - Found target columns: {existing_target_columns}")

        for target_column in existing_target_columns:
            print(f"📌 Processing data for target: {target_column}")

            X = df.drop(columns=existing_target_columns)
            y = df[target_column]

            if y.nunique() == 1:
                print(f"⚠️ Skipping {df_name}_{target_column}: Only one class present.")
                continue

            X = X.reset_index(drop=True)
            y = y.reset_index(drop=True)

            all_X.append(X)
            all_y.append(y)

    print(f"\n✅ Total datasets processed: {len(all_X)}")

    if not all_X or not all_y:
        msg = "🚨 No objects to concatenate. Check TARGET_COLUMNS or ensure target values vary."
        print(msg)
        raise ValueError(msg)

    X_combined = pd.concat(all_X, axis=0).reset_index(drop=True)
    y_combined = pd.concat(all_y, axis=0).reset_index(drop=True)

    print(f"\n✅ Final merged dataset shape: {X_combined.shape}, {y_combined.shape}")

    return X_combined, y_combined

def train_models(X_combined, y_combined):
    """Trains multiple ML models & evaluates performance."""
    print("\n🔍 Checking for NaN values...")
    imputer = SimpleImputer(strategy='mean')
    X_combined = pd.DataFrame(imputer.fit_transform(X_combined), columns=X_combined.columns)
    X_combined.dropna(inplace=True)
    y_combined.dropna(inplace=True)

    # Apply Standard Scaling
    scaler = StandardScaler()
    X_combined = pd.DataFrame(scaler.fit_transform(X_combined), columns=X_combined.columns)

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

    model_results = {}

    plt.figure(figsize=(8, 5))

    for name, model in models.items():
        print(f"\n🚀 Training {name}...")
        try:
            cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            test_accuracy = accuracy_score(y_test, y_pred)
            model_results[name] = test_accuracy

            print(f"✅ {name}: Test Accuracy = {test_accuracy:.4f}")

            # Learning Curve
            train_sizes, train_scores, test_scores = learning_curve(
                model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1,
                train_sizes=np.linspace(0.1, 1.0, 5)
            )

            test_mean = np.mean(test_scores, axis=1)
            test_std = np.std(test_scores, axis=1)

            plt.plot(train_sizes, test_mean, marker='o', label=f"{name} (Acc: {test_mean[-1]:.2f})")
            plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)

        except Exception as e:
            print(f"⚠️ Error training {name}: {e}")
            failed_models.append(name)

    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.title("Learning Curve - All Models")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

    # Rank models
    print("\n📊 Model Rankings by Test Accuracy:")
    ranked_models = sorted(model_results.items(), key=lambda x: x[1], reverse=True)
    ranking_df = pd.DataFrame(ranked_models, columns=["Model", "Test Accuracy"])
    print(ranking_df.to_string(index=False))

    # Save the top models
    best_models = ranked_models[:2]
    combined_X_train, combined_y_train = X_combined, y_combined

    for name, acc in best_models:
        model = models[name]
        model.fit(combined_X_train, combined_y_train)
        joblib.dump(model, os.path.join(ML_MODELS_DIR, f'{name.replace(" ", "_")}_combined.pkl'))

    print("\n🚀 Model evaluation, ranking, and saving completed!")
    print(f"⚠️ Models that failed: {failed_models}")

# Run the pipeline
X_combined, y_combined = process_data(final_cleaned_dataframes, TARGET_COLUMNS)
train_models(X_combined, y_combined)

In [None]:
# %pip install xgboost
# %pip install lightgbm

In [None]:
# print("Unique values in y_train:", np.unique(y_train))
# print("Unique values in y_test:", np.unique(y_test))

In [None]:
import numpy as np
import os
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Define models and search spaces for Bayesian tuning
param_spaces = {
    "Random Forest": {
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 30),
        'min_samples_split': Integer(2, 15),
        'max_features': Categorical(['sqrt', 'log2', None])
    },
    "Logistic Regression": {
        'C': Real(0.01, 10, prior='log-uniform'),
        'solver': Categorical(['liblinear', 'lbfgs', 'saga'])
    },
    "SVM": {
        'C': Real(0.1, 10, prior='log-uniform'),
        'gamma': Real(0.01, 1, prior='log-uniform'),
        'kernel': Categorical(['linear', 'rbf'])
    },
    "KNN": {
        'n_neighbors': Integer(3, 15),
        'weights': Categorical(['uniform', 'distance']),
        'metric': Categorical(['euclidean', 'manhattan', 'minkowski'])
    },
    "Decision Tree": {
        'max_depth': Integer(3, 20),
        'min_samples_split': Integer(2, 10)
    },
    "AdaBoost": {
        'n_estimators': Integer(50, 500),
        'learning_rate': Real(0.01, 1, prior='log-uniform')
    },
    "Gradient Boosting": {
        'n_estimators': Integer(50, 500),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'max_depth': Integer(3, 15)
    }
}

models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=5000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

failed_models = []

def optimize_model(model, param_space, X_train, y_train):
    """Bayesian optimization of hyperparameters."""
    opt = BayesSearchCV(
        model,
        param_space,
        n_iter=50,  # Number of optimization steps
        cv=3,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )
    opt.fit(X_train, y_train)
    return opt.best_estimator_, opt.best_score_

def plot_learning_curve(model, X_train, y_train, dataset_name, model_name):
    """Plots learning curves for all models in one graph."""
    try:
        train_sizes, train_scores, test_scores = learning_curve(
            model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1,
            train_sizes=np.linspace(0.2, 0.8, 4)
        )

        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        plt.plot(train_sizes, test_mean, marker='o', label=f"{model_name} (Acc: {test_mean[-1]:.2f})")
        plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)

    except Exception as e:
        print(f"⚠️ Error generating learning curve for {model_name}: {e}")
        failed_models.append(model_name)

def tune_all_models(X_train, y_train, dataset_name):
    """Runs hyperparameter optimization, saves models, and ranks them."""
    best_models = {}

    plt.figure(figsize=(10, 6))  # Single plot for all learning curves

    for model_name, param_space in param_spaces.items():
        print(f"\n🔍 Optimizing {model_name} for {dataset_name}...")
        best_model, best_score = optimize_model(models[model_name], param_space, X_train, y_train)
        best_models[model_name] = (best_model, best_score)

        print(f"✅ Best {model_name}: Test Accuracy = {best_score:.4f}")
        joblib.dump(best_model, os.path.join(ML_MODELS_DIR, f'{model_name.replace(" ", "_")}_optimized.pkl'))


        # Plot learning curves for all models in the same graph
        plot_learning_curve(best_model, X_train, y_train, dataset_name, model_name)

    # Rank models
    ranked_models = sorted(best_models.items(), key=lambda x: x[1][1], reverse=True)
    print(f"\n📊 Ranking for {dataset_name}:")
    ranking_df = pd.DataFrame({
        "Model": [model for model, _ in ranked_models],
        "Test Accuracy": [metrics[1] for _, metrics in ranked_models]
    })
    print(ranking_df.to_string(index=False))

    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.title(f"Learning Curve - Optimized Models ({dataset_name})")
    plt.legend(loc="best")
    plt.grid()
    plt.savefig(f'plots/{dataset_name}_optimized_learning_curves.png')
    plt.show()
    plt.close()

    return ranked_models

# Example Data (Replace with Real `train_test_data`)
if "train_test_data" not in globals():
    from sklearn.datasets import load_iris
    iris = load_iris()
    X, y = pd.DataFrame(iris.data, columns=iris.feature_names), pd.Series(iris.target)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    train_test_data = {"iris_dataset": (X_train, X_test, y_train, y_test)}

# Run model tuning
for dataset_name, (X_train, X_test, y_train, y_test) in train_test_data.items():
    ranked_models = tune_all_models(X_train, y_train, dataset_name)

print("\n🚀 Hyperparameter tuning, ranking, and saving completed!")
print(f"⚠️ Models that failed: {failed_models}")

## Deep-Learning Section

In [None]:
# %pip install scikit-optimize

<!-- ## Deep Learning -->

In [None]:
print(f"Available columns in dataset: {list(df.columns)}")
print(f"TARGET_COLUMNS: {TARGET_COLUMNS}")

existing_target_columns = [col for col in df.columns if col.strip().lower() in [t.lower() for t in TARGET_COLUMNS]]
print(f"Unique values in target column: {df[existing_target_columns[0]].nunique()}" if existing_target_columns else "No target column found.")

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Dictionaries to store encoders
feature_encoders = {}  # Stores encoders for feature columns
target_encoders = {}   # Stores encoders for target columns

USE_ONE_HOT = False

# Iterate over the clean dataframes
for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🔄 Encoding categorical features for {df_name}...")

    # Identify categorical columns in the DataFrame
    categorical_cols = df.select_dtypes(include=["object"]).columns

    # Split columns between target and features based on TARGET_COLUMNS
    target_cols = [col for col in categorical_cols if col in TARGET_COLUMNS]
    feature_cols = [col for col in categorical_cols if col not in TARGET_COLUMNS]

    # ---------------------------------------------
    # Encode target columns (for classification tasks you may later decide
    # to one-hot encode these as needed for your deep learning model)
    # ---------------------------------------------
    for col in target_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        target_encoders[f"{df_name}_{col}"] = encoder
        print(f"✅ Stored Target Encoder for {df_name} - {col}")

    # ---------------------------------------------
    # Encode feature columns
    # For deep learning, integer encoding is generally preferred so that
    # embeddings can be used (especially if the cardinality isn’t extremely low).
    # ---------------------------------------------
    for col in feature_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        feature_encoders[f"{df_name}_{col}"] = encoder
        print(f"✅ Stored Feature Encoder for {df_name} - {col}")

    # ---------------------------------------------
    # Optionally apply One-Hot Encoding to features if desired.
    # For deep learning models using embeddings, you would typically keep the integer encoding.
    # ---------------------------------------------
    if USE_ONE_HOT:
        one_hot_cols = feature_cols  # you can modify this list if you want one-hot on a subset
        if one_hot_cols:
            encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
            encoded_values = encoder.fit_transform(df[one_hot_cols])
            encoded_df = pd.DataFrame(
                encoded_values,
                index=df.index,
                columns=encoder.get_feature_names_out(one_hot_cols)
            )
            # Drop the original one-hot columns and add the encoded columns
            df.drop(columns=one_hot_cols, inplace=True)
            df = pd.concat([df, encoded_df], axis=1)
            print(f"✅ Applied One-Hot Encoding for columns: {one_hot_cols}")

    # Save the updated DataFrame back
    final_cleaned_dataframes[df_name] = df
    print(f"✅ Successfully encoded categorical features for {df_name}. New shape: {df.shape}")

print("🚀 Final categorical encoding applied successfully across all datasets!")

In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Iterate over all cleaned dataframes and target columns
for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🔄 Processing dataframe: {df_name}")

    # Ensure TARGET_COLUMNS exists in the dataframe
    missing_targets = set(TARGET_COLUMNS) - set(df.columns)
    if missing_targets:
        print(f"⚠️ Missing target columns in {df_name}: {missing_targets}. Skipping...")
        continue

    for target_col in TARGET_COLUMNS:
        print(f"🎯 Training model for target column: {target_col}")

        # Prepare features and target
        X = df.drop(columns=TARGET_COLUMNS).fillna(0)
        y = df[target_col].fillna(0)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Build and compile the model
        num_features = X_train_scaled.shape[1]
        num_classes = len(np.unique(y_train))
        model = keras.Sequential([
            keras.layers.Dense(128, activation='relu', input_shape=(num_features,)),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(num_classes, activation='softmax')
        ])
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        # Train the model
        # Ensure labels are within the valid range
        y_train = y_train.clip(0, num_classes - 1)
        y_test = y_test.clip(0, num_classes - 1)

        history = model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test))

        # Plot Training and Validation Accuracy
        plt.plot(history.history['accuracy'], label='Training Accuracy', marker='o')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy', marker='o')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.title(f'Training Curve for {df_name} - {target_col}')
        plt.legend()
        plt.grid(True)
        plt.show()


In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Ensure directories exist
os.makedirs('models/deep_learning', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# Ensure train_test_data exists
if "train_test_data" not in globals():
    raise ValueError("🚨 No train-test data found! Make sure previous models processed data correctly.")

# Load train-test splits from ML models
dataset_name = list(train_test_data.keys())[0]  # Use first dataset
X_train, X_test, y_train, y_test = train_test_data[dataset_name]

# Function to process text features
def process_text_features(X_train, X_test):
    text_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
    print(f"📌 Text columns detected: {text_columns}")

    if text_columns:
        vectorizer = TfidfVectorizer(max_features=500)
        X_train_text = pd.DataFrame(vectorizer.fit_transform(X_train[text_columns].fillna("").apply(lambda x: " ".join(x), axis=1)).toarray(), index=X_train.index)
        X_test_text = pd.DataFrame(vectorizer.transform(X_test[text_columns].fillna("").apply(lambda x: " ".join(x), axis=1)).toarray(), index=X_test.index)
    else:
        X_train_text = pd.DataFrame(index=X_train.index)
        X_test_text = pd.DataFrame(index=X_test.index)

    return X_train_text, X_test_text

# Function to scale numerical features
def preprocess_numerical_features(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train.select_dtypes(exclude=["object"])), columns=X_train.select_dtypes(exclude=["object"]).columns, index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test.select_dtypes(exclude=["object"])), columns=X_test.select_dtypes(exclude=["object"]).columns, index=X_test.index)

    return X_train_scaled, X_test_scaled

# Process features
X_train_text, X_test_text = process_text_features(X_train, X_test)
X_train_scaled, X_test_scaled = preprocess_numerical_features(X_train, X_test)

# Combine text and numerical features
X_train_combined = np.hstack([X_train_scaled, X_train_text])
X_test_combined = np.hstack([X_test_scaled, X_test_text])

print(f"✅ Final X_train_combined shape: {X_train_combined.shape}")
print(f"✅ Final X_test_combined shape: {X_test_combined.shape}")

# Define deep learning models
deep_models = {
    "DNN": keras.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(X_train_combined.shape[1],)),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(len(set(y_train)), activation='softmax')
    ]),
    "Wide & Deep": keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(X_train_combined.shape[1],)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(len(set(y_train)), activation='softmax')
    ]),
    "TabNet": keras.Sequential([
        keras.layers.Dense(256, activation='relu', input_shape=(X_train_combined.shape[1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(len(set(y_train)), activation='softmax')
    ])
}

# Train models and evaluate performance
model_results = {}
plt.figure(figsize=(8, 5))

for name, model in deep_models.items():
    print(f"\n🚀 Training {name}...")

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train_combined, y_train, epochs=20, batch_size=32, validation_data=(X_test_combined, y_test), verbose=0)

    final_val_accuracy = max(history.history['val_accuracy'])
    model_results[name] = final_val_accuracy

    print(f"✅ {name}: Best Validation Accuracy = {final_val_accuracy:.4f}")

    # Plot learning curves
    plt.plot(history.history['val_accuracy'], marker='o', label=f"{name} (Acc: {final_val_accuracy:.2f})")

    # Save trained models
    model.save(os.path.join(DL_MODELS_DIR, f'{name}_best_model.keras'))
best_model.model.save(os.path.join(DL_MODELS_DIR, 'best_model_tuned.keras'))

# Ranking models based on validation accuracy
ranked_models = sorted(model_results.items(), key=lambda x: x[1], reverse=True)
ranking_df = pd.DataFrame(ranked_models, columns=["Model", "Validation Accuracy"])
print(f"\n📊 Model Ranking:\n{ranking_df.to_string(index=False)}")

plt.xlabel("Epochs")
plt.ylabel("Validation Accuracy")
plt.title("Deep Learning Model Comparison")
plt.legend(loc="best")
plt.grid()
plt.savefig(f'plots/deep_learning_learning_curves.png')
plt.show()

print("\n🚀 Deep Learning Model Training, Ranking, and Saving Completed!")

In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import matplotlib.pyplot as plt

# Ensure directories exist
os.makedirs('models/deep_learning', exist_ok=True)
os.makedirs('plots', exist_ok=True)

# Load train-test splits from ML models
dataset_name = list(train_test_data.keys())[0]  # Use first dataset
X_train, X_test, y_train, y_test = train_test_data[dataset_name]

# Process text features
def process_text_features(X_train, X_test):
    text_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
    print(f"📌 Text columns detected: {text_columns}")

    if text_columns:
        vectorizer = TfidfVectorizer(max_features=500)
        X_train_text = pd.DataFrame(
            vectorizer.fit_transform(
                X_train[text_columns].fillna("").apply(lambda x: " ".join(x), axis=1)
            ).toarray(),
            index=X_train.index
        )
        X_test_text = pd.DataFrame(
            vectorizer.transform(
                X_test[text_columns].fillna("").apply(lambda x: " ".join(x), axis=1)
            ).toarray(),
            index=X_test.index
        )
    else:
        X_train_text = pd.DataFrame(index=X_train.index)
        X_test_text = pd.DataFrame(index=X_test.index)

    return X_train_text, X_test_text

# Scale numerical features
def preprocess_numerical_features(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train.select_dtypes(exclude=["object"])),
        columns=X_train.select_dtypes(exclude=["object"]).columns,
        index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test.select_dtypes(exclude=["object"])),
        columns=X_test.select_dtypes(exclude=["object"]).columns,
        index=X_test.index
    )

    return X_train_scaled, X_test_scaled

# Process categorical features
def process_categorical_features(X_train, X_test):
    categorical_columns = X_train.select_dtypes(include=["category", "object"]).columns.tolist()
    print(f"📌 Categorical columns detected: {categorical_columns}")

    if categorical_columns:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        X_train_encoded = pd.DataFrame(
            encoder.fit_transform(X_train[categorical_columns]),
            columns=encoder.get_feature_names_out(categorical_columns),
            index=X_train.index
        )
        X_test_encoded = pd.DataFrame(
            encoder.transform(X_test[categorical_columns]),
            columns=encoder.get_feature_names_out(categorical_columns),
            index=X_test.index
        )
    else:
        X_train_encoded = pd.DataFrame(index=X_train.index)
        X_test_encoded = pd.DataFrame(index=X_test.index)

    return X_train_encoded, X_test_encoded

# Prepare data
X_train_text, X_test_text = process_text_features(X_train, X_test)
X_train_scaled, X_test_scaled = preprocess_numerical_features(X_train, X_test)
X_train_cat, X_test_cat = process_categorical_features(X_train, X_test)

# Combine all features
X_train_combined = pd.concat([X_train_scaled, X_train_cat, X_train_text], axis=1)
X_test_combined = pd.concat([X_test_scaled, X_test_cat, X_test_text], axis=1)

# Define hyperparameter search space
param_space = {
    'units_1': Integer(32, 256),
    'units_2': Integer(16, 128),
    'dropout_1': Real(0.1, 0.5),
    'dropout_2': Real(0.1, 0.5),
    'learning_rate': Real(1e-4, 1e-2, prior='log-uniform'),
    'batch_size': Integer(16, 128)
}

# Function to build model dynamically
def build_model(units_1, units_2, dropout_1, dropout_2, learning_rate):
    model = keras.Sequential([
        keras.layers.Dense(units_1, activation='relu', input_shape=(X_train_combined.shape[1],)),
        keras.layers.Dropout(dropout_1),
        keras.layers.Dense(units_2, activation='relu'),
        keras.layers.Dropout(dropout_2),
        keras.layers.Dense(len(set(y_train)), activation='softmax')
    ])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Function to optimize hyperparameters
def tune_hyperparameters():
    opt = BayesSearchCV(
        estimator=keras.wrappers.scikit_learn.KerasClassifier(build_fn=build_model, epochs=20, verbose=0),
        search_spaces=param_space,
        n_iter=50,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42
    )

    opt.fit(X_train_combined, y_train)
    return opt.best_estimator_, opt.best_params_

# Run hyperparameter tuning
best_model, best_params = tune_hyperparameters()

print(f"\n✅ Best Hyperparameters: {best_params}")

# Train final model
history = best_model.fit(X_train_combined, y_train, epochs=50, batch_size=best_params['batch_size'], validation_data=(X_test_combined, y_test))

# Plot Learning Curve
plt.figure(figsize=(8, 5))
plt.plot(history.history['accuracy'], label='Training Accuracy', marker='o')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', marker='o')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title(f'Deep Learning Training Curve - {dataset_name}')
plt.legend()
plt.grid()
plt.savefig(f'plots/deep_learning_tuned_learning_curve.png')
plt.show()

# Save best model
DL_MODELS_DIR = 'models/deep_learning'
best_model.model.save(os.path.join(DL_MODELS_DIR, 'best_model_tuned.keras'))

print("\n🚀 Deep Learning Model Training & Hyperparameter Tuning Completed!")

2025-06-10 14:37:27.466080: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-10 14:37:28.410740: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-10 14:37:28.410817: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-06-10 14:37:28.497549: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-10 14:37:30.252880: W tensorflow/stream_executor/platform/de

NameError: name 'train_test_data' is not defined

## Testing on maquettes 


In [None]:
import pandas as pd
import os

def load_excel_files(directory):
    """Loads all Excel files from the specified directory and displays their heads."""
    new_dataframes = {}

    for file in os.listdir(directory):
        if file.endswith(".xlsx") or file.endswith(".xls"):
            file_path = os.path.join(directory, file)
            df_name = os.path.splitext(file)[0]  # Use filename as key
            new_dataframes[df_name] = pd.read_excel(file_path)
            print(f"\n✅ Loaded {file}")
            print(new_dataframes[df_name].head())  # 👀 Show first few rows

    return new_dataframes

new_dataframes = load_excel_files(os.path.join(TESTING_DATA_DIR))

In [None]:
import os
import joblib
from tensorflow import keras

# Load all ML models from ML_MODELS_DIR
ml_models = {}
for file in os.listdir(ML_MODELS_DIR):
    if file.endswith(".pkl"):
        model_name = os.path.splitext(file)[0]
        model_path = os.path.join(ML_MODELS_DIR, file)
        ml_models[model_name] = joblib.load(model_path)

print(f"✅ Loaded ML models: {list(ml_models.keys())}")

# Load all DL models from DL_MODELS_DIR
dl_models = {}
for file in os.listdir(DL_MODELS_DIR):
    if file.endswith(".keras"):
        model_name = os.path.splitext(file)[0]
        model_path = os.path.join(DL_MODELS_DIR, file)
        dl_models[model_name] = keras.models.load_model(model_path)

print(f"✅ Loaded DL models: {list(dl_models.keys())}")

In [None]:
for df_name, df in new_dataframes.items():
	print(f"Available columns in {df_name}:", df.columns.tolist())

In [None]:
def detect_target_columns(df):
    """Automatically detects target columns based on keyword pairs."""
    target_patterns = [("011EC", "Lot"), ("012EC", "Ouvrage"), ("013EC", "Localisation"), ("014EC", "Mode Constructif")]

    detected_targets = [col for col in df.columns if any(k1 in col and k2 in col for k1, k2 in target_patterns)]

    print(f"✅ Detected Target Columns: {detected_targets}")
    return detected_targets

for df_name, df in new_dataframes.items():
    print(f"\n🔍 Processing DataFrame: {df_name}")
    detected_targets = detect_target_columns(df)
    print(f"🎯 Detected Target Columns in {df_name}: {detected_targets}")

In [None]:
import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import accuracy_score, classification_report

def isolate_and_verify_target_columns(new_dataframes, target_columns):
    """Removes target columns and verifies successful isolation."""
    processed_dataframes = {}

    for df_name, df in new_dataframes.items():
        existing_targets = [col for col in target_columns if col in df.columns]

        if not existing_targets:
            print(f"⚠️ No target columns found in {df_name}. Skipping...")
            continue

        print(f"\n🔍 Isolating targets in {df_name}: {existing_targets}")

        # Save actual target values separately
        actual_values = df[existing_targets].copy()

        # Drop target columns from dataset
        df_cleaned = df.drop(columns=existing_targets)

        # Verify removal
        assert all(col not in df_cleaned.columns for col in existing_targets), "❌ Target columns were not properly removed!"

        print(f"✅ Target columns successfully removed from {df_name}")
        print(f"🛠 Cleaned DataFrame Head for {df_name}:\n", df_cleaned.head())

        processed_dataframes[df_name] = (df_cleaned, actual_values)

    return processed_dataframes

# Example usage:
processed_dataframes = isolate_and_verify_target_columns(new_dataframes, detected_targets)

In [None]:
# def align_features(X_test, trained_model):
#     """Ensures the test dataset has the same features as the trained model."""
#     expected_features = trained_model.feature_names_in_

#     # Remove unexpected columns
#     X_test_aligned = X_test[expected_features].copy()

#     print(f"✅ Aligned dataset shape: {X_test_aligned.shape}")
#     return X_test_aligned

# # Example usage:
# # Ensure the file exists before loading
# rf_model_path = os.path.join(ML_MODELS_DIR, "Random_Forest_optimized.pkl")
# if not os.path.exists(rf_model_path):
#     raise FileNotFoundError(f"File not found: {rf_model_path}. Please ensure the model file exists in the specified directory.")

# rf_model = joblib.load(rf_model_path)

# # Define or load X_test (example: using a DataFrame from the `cleaned_dataframes` dictionary)
# X_test = cleaned_dataframes['maquette_23015.xlsx_Sols']  # Replace with the appropriate key

# X_test_aligned = align_features(X_test, rf_model)

In [None]:
## showing the head of the dataframes we gonna use for predictions
for df_name, (df_cleaned, actual_values) in processed_dataframes.items():
    print(f"\n🛠 Cleaned DataFrame Head for {df_name}:\n", df_cleaned.head())
    print(f"🎯 Actual Values Head for {df_name}:\n", actual_values.head())


In [None]:
import pandas as pd
import numpy as np
## the model cant see the target columns, so we need to align the features
def align_features(X_test, trained_model):
    """Ensures the test dataset has the same features as the trained model."""
    expected_features = trained_model.feature_names_in_

    # Remove unexpected columns
    X_test_aligned = X_test[expected_features].copy()

    print(f"✅ Aligned dataset shape: {X_test_aligned.shape}")
    return X_test_aligned

In [None]:
import os
import pandas as pd
import numpy as np
import joblib

def convert_predictions_to_text(numeric_preds, target_map):
    """
    Convert numeric predictions to text using target_map.
    If numeric_preds is 1D, convert each element.
    If numeric_preds is 2D, convert each element and return an array.
    """
    if numeric_preds.ndim == 1:
        return np.array([target_map.get(pred, str(pred)) for pred in numeric_preds])
    else:
        # Use np.vectorize to apply the conversion elementwise.
        return np.vectorize(lambda x: target_map.get(x, str(x)))(numeric_preds)

def predict_and_export(mapped_dataframes, ml_models, target_patterns, target_map, output_dir):
    """
    Performs predictions on each Excel-derived DataFrame in mapped_dataframes.
    - Excludes columns that match any of the target pattern tuples.
    - Aligns features as expected by the model (adding missing ones with value 0).
    - Converts numeric predictions to text using target_map.
    - Exports the resulting DataFrame (with predictions appended) to Excel.

    The export file is saved in output_dir with the modified file name:
    "{original_dataframe_name}_{model_name}.xlsx"
    """


    for df_name, X_test in mapped_dataframes.items():
        print(f"\n🚀 Predicting targets for {df_name}...")

        # Exclude any column whose name contains both parts of any target pattern.
        # The comparison uses lower-case to be case-insensitive.
        excluded_columns = [
            col for col in X_test.columns
            if any(k1.lower() in col.lower() and k2.lower() in col.lower() for k1, k2 in target_patterns)
        ]
        X_test_cleaned = X_test.drop(columns=excluded_columns, errors='ignore')
        print(f"✅ Excluded columns from prediction: {excluded_columns}")

        # For each model, perform prediction and export the results.
        for model_name, model in ml_models.items():
            # Work on a copy so that adding missing columns doesn't persist for later models.
            X_aligned = X_test_cleaned.copy()

            # Add any missing features the model expects (with a default value of 0)
            missing_features = set(model.feature_names_in_) - set(X_aligned.columns)
            for feature in missing_features:
                X_aligned[feature] = 0

            # Ensure the feature order matches what the model expects.
            X_aligned = X_aligned.loc[:, model.feature_names_in_]

            try:
                # Make the predictions (assuming the model outputs numeric labels)
                numeric_preds = model.predict(X_aligned)

                # Convert numeric predictions into text using the provided mapping.
                text_preds = convert_predictions_to_text(numeric_preds, target_map)

                # Prepare a DataFrame of predictions.
                # If predictions are multidimensional, assume each column corresponds to one target.
                if text_preds.ndim > 1:
                    # Create column names based on your target patterns.
                    pred_columns = [f"{k1}_{k2}" for k1, k2 in target_patterns]
                    preds_df = pd.DataFrame(text_preds, columns=pred_columns)
                else:
                    preds_df = pd.DataFrame(text_preds, columns=["Prediction"])

                # Combine the original test DataFrame (for context) with predictions.
                export_df = X_test.copy()  # use the original DataFrame with all columns
                # Append predictions as new columns.
                export_df = pd.concat([export_df, preds_df], axis=1)

                # Set the output file name.
                # For example: if df_name is "maquette_23015" and model_name is "Random_Forest_optimized",
                # the file name becomes "maquette_23015_Random_Forest_optimized.xlsx"
                output_file = os.path.join(PREDICTED_DATA_DIR, f"{df_name}_{model_name}.xlsx")
                export_df.to_excel(output_file, index=False)
                print(f"✅ Exported predictions for {df_name} using {model_name} to {output_file}")
            except Exception as e:
                print(f"⚠️ Error predicting with {model_name} on {df_name}: {e}")


target_patterns = [
    ("011EC", "Lot"),
    ("012EC", "Ouvrage"),
    ("013EC", "Localisation"),
    ("014EC", "Mode Constructif")
]

# For instance, if your model returns 0 for "Type A" and 1 for "Type B":
target_map = {0: "Type A", 1: "Type B"}

# Directory for exporting prediction files:
predicting_data_dir = "predicting_data_dir"

# Now call the function.
# (Ensure that `mapped_dataframes` and `ml_models` are defined in your environment.)
predictions = predict_and_export(mapped_dataframes, ml_models, target_patterns, target_map, predicting_data_dir)

In [None]:
# import numpy as np
# import joblib
# import tensorflow as tf
# from tensorflow import keras
# from sklearn.metrics import accuracy_score, classification_report

# def predict_with_models(processed_dataframes, ml_model_paths, deep_model_paths):
#     """Uses ML & Deep Learning models to predict target values."""
#     predictions = {}

#     for df_name, (X_test, actual_values) in processed_dataframes.items():
#         print(f"\n🚀 Predicting targets for {df_name}...")

#         # Load trained ML models and make predictions
#         ml_preds = {}
#         for model_name, model_path in ml_model_paths.items():
#             model = joblib.load(model_path)
#             ml_preds[model_name] = model.predict(X_test)

#         # Load trained deep learning models and make predictions
#         deep_preds = {}
#         for model_name, model_path in deep_model_paths.items():
#             deep_model = keras.models.load_model(model_path)
#             deep_preds[model_name] = np.argmax(deep_model.predict(X_test), axis=1)

#         predictions[df_name] = {
#             "ML": ml_preds,
#             "Deep Learning": deep_preds,
#             "Actual": actual_values
#         }

#     return predictions

# # Example usage:
# ml_model_paths = {
#     "Random Forest": "models/machine_learning/Random_Forest_optimized.pkl",
#     "Gradient Boosting": "models/machine_learning/Gradient_Boosting_optimized.pkl"
# }

# deep_model_paths = {
#     "Best Deep Model": "models/deep_learning/best_model_tuned.keras"
# }

# predictions = predict_with_models(processed_dataframes, ml_model_paths, deep_model_paths)

In [None]:
print(f"Predictions Structure: {predictions.keys()}")

for df_name, results in predictions.items():
    print(f"\n🔎 Checking {df_name}: {results.keys()}")

In [None]:
def evaluate_predictions(predictions, new_dataframes, detected_targets):
    """Compares predicted vs. actual target values and calculates accuracy."""
    evaluation_results = {}

    for df_name, results in predictions.items():
        print(f"\n📊 Evaluating predictions for {df_name}...")

        # Retrieve the actual values from the original dataframes
        actual_values = new_dataframes[df_name][detected_targets].copy()

        eval_metrics = {}

        # Evaluate ML models
        for model_name, y_pred in results["ML"].items():
            accuracy = accuracy_score(actual_values.values.flatten(), y_pred)
            print(f"✅ {model_name} Accuracy: {accuracy:.4f}")
            eval_metrics[model_name] = accuracy

        # Evaluate Deep Learning models
        for model_name, y_pred in results["Deep Learning"].items():
            accuracy = accuracy_score(actual_values.values.flatten(), y_pred)
            print(f"✅ {model_name} Accuracy: {accuracy:.4f}")
            eval_metrics[model_name] = accuracy

        evaluation_results[df_name] = eval_metrics

    return evaluation_results

# Run evaluation
evaluation_results = evaluate_predictions(predictions, new_dataframes, detected_targets)

<!-- 🔹 BERT or GPT → If text has complex relationships
🔹 LSTM or GRU → If text has sequential meaning
🔹 CNN for NLP → If local word patterns matte


🚀 Machine Learning Models
If your data is structured (numerical/tabular), traditional ML methods may work well: ✅ Decision Trees & Random Forest → Good for structured data, feature importance analysis
✅ Gradient Boosting (XGBoost, LightGBM, CatBoost) → Powerful for tabular data with boosting techniques
✅ Support Vector Machines (SVM) → Great for classification problems
✅ K-Nearest Neighbors (KNN) → Simple but useful for certain cases
✅ Logistic Regression → Best for binary classification
🔥 Deep Learning Architectures
If you have images, text, or highly complex patterns, DL might be a better choice: ✅ Convolutional Neural Networks (CNNs) → Best for image processing
✅ Recurrent Neural Networks (RNNs) & LSTMs → Designed for sequential data (like time series or language models)
✅ Transformers (BERT, GPT) → Cutting-edge for NLP and deep sequence understanding
✅ Autoencoders & GANs → Used for generative tasks or anomaly detection
 -->
