# Bim_Predict_Base

## Importing Libraries

In [None]:
# Import libraries
import os
import pandas as pd

# Project layout: every location below is derived from BASE_DIR.
BASE_DIR = "../../"

# Data locations
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw_data")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed_data")
PREDICTED_DATA_DIR = os.path.join(DATA_DIR, "predicting_data")
TESTING_DATA_DIR = os.path.join(DATA_DIR, "testing_data")

# Model locations
MODELS_DIR = os.path.join(BASE_DIR, "models")
ML_MODELS_DIR = os.path.join(MODELS_DIR, "SK/machine_learning")
DL_MODELS_DIR = os.path.join(MODELS_DIR, "SK/deep_learning")
OTHER_MODELS_DIR = os.path.join(MODELS_DIR, "SK/other")

# Helper modules and generated plots
PYTHON_MODULES_DIR = os.path.join(BASE_DIR, "python_modules")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")

# Folders that are created up front.
# NOTE: TESTING_DATA_DIR is defined above but is not part of this list.
directories = [
    RAW_DATA_DIR,
    PROCESSED_DATA_DIR,
    PREDICTED_DATA_DIR,
    MODELS_DIR,
    ML_MODELS_DIR,
    DL_MODELS_DIR,
    OTHER_MODELS_DIR,
    PYTHON_MODULES_DIR,
    PLOTS_DIR,
]

# Create whatever is missing and report the outcome for each folder
for directory in directories:
    if os.path.exists(directory):
        print(f"Directory already exists: {directory}")
        continue
    os.makedirs(directory)
    print(f"Created directory: {directory}")

In [None]:
import os
import pandas as pd

# List all Excel workbooks in RAW_DATA_DIR.
# str.endswith accepts a tuple, so both extensions are tested in one call.
excel_files = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith((".xlsx", ".xls"))]

# Maps "<file name>_<sheet name>" to the DataFrame parsed from that sheet
dataframes = {}

# Process each Excel file
for file in excel_files:
    file_path = os.path.join(RAW_DATA_DIR, file)
    print(f"Loading: {file_path}")

    try:
        # The context manager closes the workbook handle even when a sheet
        # fails to parse; sheets parsed before the failure are kept.
        with pd.ExcelFile(file_path) as excel_data:
            for sheet_name in excel_data.sheet_names:
                df = excel_data.parse(sheet_name)

                # Save DataFrame with a unique identifier
                dataframes[f"{file}_{sheet_name}"] = df

    except Exception as e:
        # Best-effort loading: report the problem and carry on with the next file
        print(f"Error loading {file_path}: {e}")

# Display summary of loaded data (files and sheets are counted separately,
# because `dataframes` holds one entry per sheet, not per file)
print(f"\nTotal files processed: {len(excel_files)}")
print(f"Total sheets loaded: {len(dataframes)}")
for key, df in dataframes.items():
    print(f"Loaded DataFrame: {key}, Shape: {df.shape}")

In [None]:
# Columns to keep for each element category.
# Keys are category names (Murs, Sols, Poutres, Poteaux) that are matched
# case-insensitively as substrings of "<file>_<sheet>" dataframe names;
# values are the exact column labels selected from a matching sheet.
required_columns = {
    "Murs": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Hauteur",
             "Epaisseur", "AI", "AS", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
             "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol en-dessous", "Fenêtres", "Portes",
             "Ouvertures", "Murs imbriqués", "Mur multicouche", "Mur empilé", "Profil modifié", "Extension inférieure",
             "Extension supérieure", "Partie inférieure attachée", "Partie supérieure attachée", "Décalage supérieur",
             "Décalage inférieur", "Matériau structurel"],

    "Sols": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Murs en intersection",
             "Murs coupés (u)", "Murs coupés (Ids)", "Murs coupants (u)", "Murs coupants (Ids)", "Poutres en intersection",
             "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)",
             "Poteaux en intersection", "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)",
             "Poteaux coupants (Ids)", "Ouvertures", "Sol multicouche", "Profil modifié", "Décalage par rapport au niveau",
             "Epaisseur", "Lié au volume", "Etude de l'élévation à la base", "Etude de l'élévation en haut",
             "Epaisseur du porteur", "Elévation au niveau du noyau inférieur", "Elévation au niveau du noyau supérieur",
             "Elévation en haut", "Elévation à la base", "Matériau structurel"],

    "Poutres": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS",
                "Hauteur totale", "Hauteur", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
                "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol en-dessous", "Poteaux en intersection",
                "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)", "Poteaux coupants (Ids)",
                "Etat de la jonction", "Valeur de décalage Z", "Justification Z", "Valeur de décalage Y", "Justification Y",
                "Justification YZ", "Matériau structurel", "Elévation du niveau de référence", "Elévation en haut",
                "Rotation de la section", "Orientation", "Décalage du niveau d'arrivée", "Décalage du niveau de départ",
                "Elévation à la base", "Longueur de coupe", "Longueur", "hauteur_section", "largeur_section"],

    "Poteaux": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS",
                "Hauteur", "Longueur", "Partie inférieure attachée", "Partie supérieure attachée", "Sols en intersection",
                "Sols coupés (u)", "Sols coupés (Ids)", "Sols coupants (u)", "Sols coupants (Ids)", "Poutres en intersection",
                "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)",
                "Matériau structurel", "Décalage supérieur", "Décalage inférieur", "Diamètre poteau", "h", "b",
                "hauteur_section", "largeur_section"]
}

# Keep only the columns listed in required_columns for each loaded sheet
cleaned_dataframes = {}  # Filtered frames, keyed like `dataframes`

for df_name, df in dataframes.items():
    print(f"\n🟢 Original shape of {df_name}: {df.shape}")

    # True as soon as one category name occurs in df_name. Tracked separately so
    # that a sheet rejected for missing columns is not also reported as
    # "no matching category".
    name_matched = False

    for category, columns in required_columns.items():
        if category.lower() not in df_name.lower():
            continue
        name_matched = True

        missing_columns = set(columns) - set(df.columns)
        if missing_columns:
            print(f"⚠️ Missing columns in {df_name}: {missing_columns}. Skipping this dataframe.")
            # A later category may also occur in the name (e.g. in the file
            # part of the key), so keep looking instead of giving up.
            continue

        filtered_df = df[columns]  # Keep only the required columns
        cleaned_dataframes[df_name] = filtered_df
        print(f"✅ Shape after filtering {df_name}: {filtered_df.shape}")
        break  # Stop looping once the correct match is found

    if not name_matched:
        print(f"⚠️ No matching category for {df_name}, skipping filtering.")

# Prefix every column with its element category and index each frame by the prefixed id.
# The first category (in this order) found in the dataframe name wins; no match means no prefix.
category_names = ("murs", "sols", "poutres", "poteaux")

for name, df in cleaned_dataframes.items():
    lowered_name = name.lower()
    prefix = next((f"{category}_" for category in category_names if category in lowered_name), "")
    id_column = f"{prefix}id"

    # Any spelling of "id" becomes "<prefix>id"; every other label keeps its text behind the prefix
    renamed_labels = {
        col: (id_column if col.lower() == "id" else f"{prefix}{col}")
        for col in df.columns
    }
    df.rename(columns=renamed_labels, inplace=True)

    # Replace the default index with the prefixed id column when it is available
    if id_column not in df.columns:
        print(f"⚠️ '{id_column}' column not found in {name}, skipping index setting.")
    else:
        df.set_index(id_column, inplace=True)
        print(f"✅ Set '{id_column}' as index for {name}.")

    # Write the frame back under the same key
    cleaned_dataframes[name] = df

### Feature Engineering


In [None]:
def map_feature_names(cleaned_dataframes, required_columns):
    """Rename columns to the canonical spelling used in ``required_columns``.

    For every dataframe whose name contains a category key (case-insensitive
    substring match, first key in dict order wins), each column whose
    lower-cased label equals a lower-cased expected label is renamed to that
    expected label. Other columns are left untouched.

    Args:
        cleaned_dataframes: Mapping of dataframe name to DataFrame.
        required_columns: Mapping of category name to list of expected labels.

    Returns:
        A new dict with renamed copies of the matching dataframes. Dataframes
        whose name matches no category are omitted; inputs are not modified.
    """
    mapped_dataframes = {}

    for df_name, df in cleaned_dataframes.items():
        lowered_name = df_name.lower()
        expected_columns = next(
            (cols for category, cols in required_columns.items() if category.lower() in lowered_name),
            None,
        )
        if expected_columns is None:
            continue

        # One lookup table per dataframe instead of comparing every column with
        # every expected label. Later duplicates overwrite earlier ones, as before.
        canonical = {expected.lower(): expected for expected in expected_columns}

        # Non-string labels cannot match an expected name and are left as they are
        col_mapping = {
            col: canonical[col.lower()]
            for col in df.columns
            if isinstance(col, str) and col.lower() in canonical
        }

        mapped_dataframes[df_name] = df.rename(columns=col_mapping)
        print(f"✅ Feature names mapped for {df_name}")

    return mapped_dataframes

# Example usage: build a second dictionary whose column labels use the
# spellings from required_columns; cleaned_dataframes itself is left unchanged.
mapped_dataframes = map_feature_names(cleaned_dataframes, required_columns)

In [None]:
import re

def clean_column_names(df):
    """Normalise the column labels of *df* in place and return the same frame.

    Labels are lower-cased, every run of whitespace becomes a single
    underscore, and any character that is neither a word character nor an
    underscore is removed.
    """
    lowered = df.columns.str.lower()
    underscored = lowered.str.replace(r"\s+", "_", regex=True)
    df.columns = underscored.str.replace(r"[^\w_]", "", regex=True)
    return df

# Normalise the column labels of every filtered frame (clean_column_names edits each frame in place)
cleaned_dataframes = {name: clean_column_names(df) for name, df in cleaned_dataframes.items()}
print("✅ Column names cleaned successfully across all cleaned dataframes!")

TARGET_COLUMNS = ['011ec_lot', '012ec_ouvrage', '013ec_localisation', '014ec_mode_constructif']
final_cleaned_dataframes = {}
target_columns_found = set()
exception_keywords = ["coupés", "coupants", "011ec_lot", "012ec_ouvrage", "013ec_localisation", "014ec_mode_constructif"]

for df_name, df in cleaned_dataframes.items():
    print(f"\n🟢 Processing {df_name}...")
    df = df.copy()  # work on a copy so cleaned_dataframes keeps its rows and columns
    initial_shape = df.shape
    print(f"📌 Initial shape: {initial_shape}")

    # Exact duplicate rows are removed
    duplicates = df.duplicated().sum()
    if not duplicates:
        print("✅ No duplicate rows found.")
    else:
        print(f"⚠️ Found {duplicates} duplicate rows. Removing...")
        df.drop_duplicates(inplace=True)

    # Columns whose null fraction is exactly 1 are dropped, except those whose
    # name contains one of the exception keywords
    null_fraction = df.isnull().mean()
    missing_cols = df.columns[null_fraction == 1]
    cols_to_drop = []
    for col in missing_cols:
        lowered_col = col.lower()
        if not any(keyword in lowered_col for keyword in exception_keywords):
            cols_to_drop.append(col)

    if not cols_to_drop:
        print("✅ No fully missing columns detected (or all are exceptions).")
    else:
        print(f"⚠️ Dropping {len(cols_to_drop)} completely empty columns: {cols_to_drop}")
        df.drop(columns=cols_to_drop, inplace=True)

    mid_shape = df.shape

    # Every target must exist as "<last '_' part of the dataframe name>_<target>";
    # absent ones are added as all-NaN columns
    name_suffix = df_name.split('_')[-1].lower()
    for target in TARGET_COLUMNS:
        target_col = f"{name_suffix}_{target.lower()}"
        if target_col in df.columns:
            continue
        print(f"⚠️ Target column '{target_col}' missing in '{df_name}'. Adding it.")
        df[target_col] = float('nan')

    final_shape = df.shape
    if mid_shape != final_shape:
        print(f"📊 Shape adjustment: before {mid_shape}, after {final_shape}")

    # Record which columns of this frame contain a target name
    lowered_targets = [t.lower() for t in TARGET_COLUMNS]
    target_cols_in_df = [
        col for col in df.columns
        if any(t in col.lower() for t in lowered_targets)
    ]
    print(f"🎯 Target columns in '{df_name}': {target_cols_in_df}")
    target_columns_found.update(target_cols_in_df)

    final_cleaned_dataframes[df_name] = df
    print(f"📌 Final shape after cleaning: {final_shape}")

print(f"\nTarget columns detected across datasets: {target_columns_found}")

In [None]:
# Fill missing values with 0 in every column that is not a target column
for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🟢 Filling missing values for {df_name}...")

    # Display shape before filling missing values
    initial_shape = df.shape
    print(f"📌 Initial shape before filling NaN: {initial_shape}")

    # Column labels carry a sheet prefix (e.g. "murs_011ec_lot"), so a target is
    # recognised by substring. An exact `col not in TARGET_COLUMNS` test is true
    # for every column and would zero-fill the targets as well.
    non_target_columns = [
        col for col in df.columns
        if not any(target in str(col).lower() for target in TARGET_COLUMNS)
    ]
    if non_target_columns:
        df[non_target_columns] = df[non_target_columns].fillna(0)

    # Store updated dataframe back
    final_cleaned_dataframes[df_name] = df

    # Display shape after processing
    final_shape = df.shape
    print(f"✅ Final shape after filling NaN: {final_shape}")

print("🚀 Missing values successfully handled across all datasets!")

## EDA - Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# Function to remove low-variance & highly correlated features
def optimize_feature_selection(df, variance_threshold=0.02, correlation_threshold=0.98):
    """Drop low-variance and highly correlated numeric columns from *df*.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to filter; only numeric columns are candidates for removal.
        variance_threshold: Threshold handed to ``VarianceThreshold``; numeric
            columns it does not select are dropped unless their name contains
            "coupés"/"coupants" or a target name.
        correlation_threshold: A numeric column is dropped when its absolute
            correlation with an earlier numeric column exceeds this value,
            unless its name contains a target name.

    Returns:
        The same DataFrame object, after the drops.
    """
    def _is_target(col):
        # Column labels carry a sheet prefix, so targets are matched by substring;
        # an exact membership test never matches a prefixed label.
        return any(target in str(col).lower() for target in TARGET_COLUMNS)

    print(f"\n🔍 Processing {df.shape[0]} rows & {df.shape[1]} columns")

    # Step 1: Remove Low-Variance Features
    numeric_df = df.select_dtypes(include=["number"])  # Focus only on numerical columns
    if numeric_df.shape[0] == 0 or numeric_df.shape[1] == 0:
        # Nothing to fit on; the selector would raise on an empty input
        low_variance_cols = []
    else:
        selector = VarianceThreshold(variance_threshold)
        selector.fit(numeric_df)
        low_variance_cols = numeric_df.columns[~selector.get_support()]

    protected_keywords = ("coupés", "coupants")
    drop_cols = [
        col for col in low_variance_cols
        if not any(keyword in str(col).lower() for keyword in protected_keywords)
        and not _is_target(col)
    ]

    df.drop(columns=drop_cols, inplace=True)
    print(f"⚠️ Dropped {len(drop_cols)} low-variance columns (excluding 'coupés' and target columns): {drop_cols}")

    # Step 2: Remove Highly Correlated Features
    numeric_df = df.select_dtypes(include=["number"])
    correlation_matrix = numeric_df.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself
    upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
    correlated_features = [
        col for col in upper_triangle.columns
        if any(upper_triangle[col] > correlation_threshold) and not _is_target(col)
    ]

    df.drop(columns=correlated_features, inplace=True)
    print(f"⚠️ Dropped {len(correlated_features)} highly correlated columns (excluding target columns): {correlated_features}")

    print(f"✅ Final shape after filtering: {df.shape}")
    return df

# Run the variance/correlation filter on every cleaned frame (each frame is edited in place)
final_cleaned_dataframes = {
    name: optimize_feature_selection(frame)
    for name, frame in final_cleaned_dataframes.items()
}

print("🚀 Optimized feature selection completed successfully!")

In [None]:
# Print describe() output for each filtered sheet
for df_name, df in cleaned_dataframes.items():
    print(f"\nSummary statistics for {df_name}:")

    summary = df.describe()
    print(summary)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Show one grid of histograms per filtered sheet
for df_name, df in cleaned_dataframes.items():
    df.hist(bins=20, figsize=(15, 10))
    figure_title = f"Distribution of Features in {df_name}"
    plt.suptitle(figure_title)
    plt.show()

In [None]:
# Show an annotated correlation heatmap of the numeric columns of each filtered sheet
for df_name, df in cleaned_dataframes.items():
    numeric_df = df.select_dtypes(include=["number"])
    correlation_matrix = numeric_df.corr()

    plt.figure(figsize=(12, 8))
    heatmap_title = f"Correlation Matrix for {df_name}"
    sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
    plt.title(heatmap_title)
    plt.show()

In [None]:
import os

# The root plots folder must be present before any per-file subfolder is created
os.makedirs(PLOTS_DIR, exist_ok=True)

# Function to generate subfolder paths for each Excel file
def get_plot_subfolder(file_name):
    """Create (if needed) and return the plots subfolder for one Excel file.

    Every occurrence of '.xlsx' and then '.xls' is removed from *file_name*;
    the subfolder is named '<result>_Plots' inside PLOTS_DIR.
    """
    stem = file_name.replace('.xlsx', '')
    stem = stem.replace('.xls', '')
    subfolder_path = os.path.join(PLOTS_DIR, stem + "_Plots")
    os.makedirs(subfolder_path, exist_ok=True)
    return subfolder_path

# Helper that turns a dataframe name and a plot kind into the PNG path to write
def get_subfolder_and_path(df_name, suffix):
    """Return '<subfolder>/<df_name>_<suffix>.png'.

    The subfolder is derived from the part of *df_name* before its first
    underscore and is created by get_plot_subfolder when missing.
    """
    file_name, _, _ = df_name.partition("_")
    return os.path.join(get_plot_subfolder(file_name), f"{df_name}_{suffix}.png")

# Save histograms and correlation matrices as PNG files, one pair per filtered sheet
for df_name, df in cleaned_dataframes.items():
    # Histogram.
    # DataFrame.hist creates its own figure, so the size is passed to it
    # directly. A separate plt.figure() call beforehand would only leave an
    # empty figure behind that the plt.close() below never reaches.
    histogram_path = get_subfolder_and_path(df_name, "histogram")
    df.hist(bins=20, figsize=(15, 10))
    plt.suptitle(f"Distribution of Features in {df_name}")
    plt.savefig(histogram_path)
    print(f"Saved histogram in: {histogram_path}")
    plt.close()

    # Correlation matrix
    numeric_df = df.select_dtypes(include=["number"])
    correlation_matrix = numeric_df.corr()

    correlation_path = get_subfolder_and_path(df_name, "correlation")
    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
    plt.title(f"Correlation Matrix for {df_name}")
    plt.savefig(correlation_path)
    print(f"Saved correlation matrix in: {correlation_path}")
    plt.close()

## Data Engineering

In [None]:
# Collect, across all processed frames, every column whose name contains a target name
target_columns_found = set()
lowered_targets = [target.lower() for target in TARGET_COLUMNS]

for df_name, df in final_cleaned_dataframes.items():
    found_targets = []
    for col in df.columns:
        lowered_col = col.lower()
        if any(target in lowered_col for target in lowered_targets):
            found_targets.append(col)
    target_columns_found.update(found_targets)

print(f"\nTarget columns detected across datasets: {target_columns_found}")

In [None]:
import os
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Initialise SHAP's JavaScript support before any plot is produced in this cell
shap.initjs()

# Function to generate subfolder paths for storing SHAP plots
def get_plot_subfolder(file_name, base_dir="plots"):
    """Create (if needed) and return '<base_dir>/<file_name>_Plots'."""
    folder_name = f"{file_name}_Plots"
    subfolder_path = os.path.join(base_dir, folder_name)
    # exist_ok makes repeated calls for the same folder harmless
    os.makedirs(subfolder_path, exist_ok=True)
    return subfolder_path

# ✅ SHAP runs on the frames produced by the variance/correlation filtering step
final_shap_dataframes = final_cleaned_dataframes  # alias of the same dict, not a copy
print("🚀 SHAP analysis will now use the final processed data!")

for df_name, df in final_shap_dataframes.items():
    print(f"\n🟢 Processing SHAP for {df_name}...")

    # Identify available target columns (substring match with TARGET_COLUMNS)
    existing_target_columns = [col for col in df.columns if any(target in col.lower() for target in TARGET_COLUMNS)]

    if not existing_target_columns:
        print(f"⚠️ No valid target columns found in {df_name}. Skipping...")
        continue

    print(f"🎯 Target columns found in {df_name}: {existing_target_columns}")

    # The feature matrix and output folder are the same for every target of
    # this frame, so they are built once instead of once per target.
    # Object columns are replaced by their integer category codes.
    X = df.drop(columns=existing_target_columns)
    X = X.apply(lambda col: col.astype("category").cat.codes if col.dtypes == "object" else col)
    plot_subfolder = get_plot_subfolder(f"SHAP_{df_name}")

    for target_column in existing_target_columns:
        print(f"🔍 Analyzing SHAP for target: {target_column}")
        initial_shape = df.shape
        print(f"📌 Initial shape before SHAP processing: {initial_shape}")

        # Target as integer category codes (missing values become -1)
        y = df[target_column].astype("category").cat.codes

        # Train RandomForest model
        model = RandomForestClassifier()
        model.fit(X, y)

        # Compute SHAP values; when a list is returned only its first element is plotted.
        # NOTE(review): some shap versions may return a single array for
        # multi-class models instead of a list — confirm against the installed version.
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)
        shap_values = shap_values[0] if isinstance(shap_values, list) else shap_values

        # Generate and save SHAP plot
        plot_path = os.path.join(plot_subfolder, f"{target_column}_SHAP.png")
        shap.summary_plot(shap_values, X, show=False)
        plt.savefig(plot_path)
        plt.show()
        # Release the figure so plots of successive targets do not pile up
        plt.close()

        print(f"✅ Saved SHAP plot for {target_column} in: {plot_path}")

    print(f"📌 Final shape after SHAP processing: {X.shape}")

In [None]:
# Show the column dtypes of every processed frame, followed by blank lines
for df_name, df in final_cleaned_dataframes.items():
    print(f"Dtypes for {df_name}:", df.dtypes, "\n", sep="\n")

In [None]:
import pandas as pd

# Function to convert ID strings into a numeric count feature
def count_ids(id_string):
    """Return how many IDs a comma-separated string contains.

    Empty or whitespace-only entries are ignored, so ``""`` counts as 0 and a
    trailing comma does not add an ID. Any non-string value (for example the 0
    used to fill missing cells, or None) counts as 0.
    """
    if not isinstance(id_string, str):
        return 0
    return sum(1 for token in id_string.split(",") if token.strip())

# Replace each text column of intersecting-element IDs by a numeric "<column>_count" column
for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🔄 Processing ID count transformation for {df_name}...")

    # Identify relevant ID columns.
    # clean_column_names strips parentheses, so "Sols coupés (Ids)" arrives here
    # as "..._coupés_ids"; the parenthesised spellings are kept for frames that
    # were not renamed. Columns already ending in "_count" are skipped so that
    # re-running the cell does not process its own output.
    id_keywords = ("coupés_ids", "coupants_ids", "coupés_(ids)", "coupants_(ids)")
    id_columns = [
        col for col in df.columns
        if any(keyword in col.lower() for keyword in id_keywords)
        and not col.lower().endswith("_count")
    ]

    if id_columns:
        print(f"📌 Found ID columns: {id_columns}")

        # Series.map applies count_ids cell by cell and yields integer columns,
        # so no further numeric conversion is needed afterwards
        for col in id_columns:
            df[f"{col}_count"] = df[col].map(count_ids)
        df.drop(columns=id_columns, inplace=True)  # Remove original text-based ID columns

    # Store the updated dataframe
    final_cleaned_dataframes[df_name] = df

    print(f"✅ Final shape after ID count transformation: {df.shape}")

print("🚀 ID count transformation completed successfully!")

In [None]:
# Show the column dtypes again, after the ID count transformation
for df_name, df in final_cleaned_dataframes.items():
    print(f"Dtypes for {df_name}:", df.dtypes, "\n", sep="\n")

In [None]:
# List the object-typed columns that the encoding step will have to handle
for df_name, df in final_cleaned_dataframes.items():
    object_frame = df.select_dtypes(include=["object"])
    categorical_cols = object_frame.columns
    print(f"\n📌 {df_name} - Categorical Columns Before Encoding: {categorical_cols.tolist()}")

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

# Fitted LabelEncoders, keyed "<dataframe name>_<column>"
feature_encoders, target_encoders = {}, {}

print("🚀 Applying categorical encoding across all datasets...")

for df_name, df in final_cleaned_dataframes.items():
    print(f"\n🔄 Processing {df_name}...")

    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=["object"]).columns

    # Column labels carry a sheet prefix (e.g. "murs_011ec_lot"), so targets are
    # recognised by substring. With an exact membership test no column counts
    # as a target and the targets end up one-hot encoded with the features.
    target_cols = [
        col for col in categorical_cols
        if any(target in col.lower() for target in TARGET_COLUMNS)
    ]
    # A list comprehension keeps the frame's column order; a set difference
    # would make the order of the encoded columns vary between runs.
    feature_cols = [col for col in categorical_cols if col not in target_cols]

    # Encode target columns (label encoding only)
    for col in target_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        target_encoders[f"{df_name}_{col}"] = encoder
        print(f"✅ Target Encoder stored for {df_name} - {col}")

    # Encode feature columns using Label Encoding
    one_hot_cols = []
    for col in feature_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col].astype(str))
        feature_encoders[f"{df_name}_{col}"] = encoder
        print(f"✅ Feature Encoder stored for {df_name} - {col}")
        one_hot_cols.append(col)

    # Apply One-Hot Encoding to the label-encoded feature columns
    if one_hot_cols:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        encoded_df = pd.DataFrame(
            encoder.fit_transform(df[one_hot_cols]),
            index=df.index,
            columns=encoder.get_feature_names_out(one_hot_cols)
        )
        df.drop(columns=one_hot_cols, inplace=True)
        # concat returns a new frame, so it is stored back under the same key below
        df = pd.concat([df, encoded_df], axis=1)

    # Save the updated dataframe
    final_cleaned_dataframes[df_name] = df
    print(f"✅ Completed categorical encoding for {df_name}. Updated shape: {df.shape}")

print("🎯 Final categorical encoding applied successfully across all datasets!")

In [None]:
# List the object-typed columns that remain once encoding has run
for df_name, df in final_cleaned_dataframes.items():
    object_frame = df.select_dtypes(include=["object"])
    categorical_cols = object_frame.columns
    print(f"\n📌 {df_name} - Categorical Columns After Encoding: {categorical_cols.tolist()}")

## Training and saving the model

### Machine Learning Section

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by display name.
# train_models iterates over this dict, fits each estimator and refits/saves the two best.
models = {
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=5000, random_state=42),
    "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=5),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=50, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=50, random_state=42),
}

# Names of models whose training raised; train_models appends to this list
failed_models = []

def process_data(final_cleaned_dataframes, TARGET_COLUMNS):
    """Stack the (features, target) pairs of all dataframes into one dataset.

    For each dataframe, every column whose name contains one of
    ``TARGET_COLUMNS`` is treated as a target. The remaining columns form the
    feature block, which is appended once per usable target together with that
    target's values.

    Args:
        final_cleaned_dataframes: Mapping of dataframe name to DataFrame.
        TARGET_COLUMNS: Target name fragments matched as substrings of column names.

    Returns:
        ``(X_combined, y_combined)``: the row-wise concatenation of all feature
        blocks and of all target series, both with a fresh 0..n-1 index.

    Raises:
        ValueError: If no dataframe provides a target with at least two classes.
    """
    all_X, all_y = [], []
    print("\n✅ Checking available dataframes and target columns...")

    for df_name, df in final_cleaned_dataframes.items():
        existing_targets = [col for col in df.columns if any(target in col for target in TARGET_COLUMNS)]
        if not existing_targets:
            print(f"⚠️ {df_name}: No matching target columns found.")
            continue

        print(f"\n🔍 Processing {df_name} - Found target columns: {existing_targets}")

        # The feature block is identical for every target of this dataframe
        X = df.drop(columns=existing_targets).reset_index(drop=True)

        for target_column in existing_targets:
            y = df[target_column]
            n_classes = y.nunique()  # NaN is not counted as a class
            if n_classes == 0:
                # An all-NaN target would otherwise be merged and later
                # desynchronise features and labels
                print(f"⚠️ Skipping {df_name}_{target_column}: No non-missing target values.")
                continue
            if n_classes == 1:
                print(f"⚠️ Skipping {df_name}_{target_column}: Only one class present.")
                continue

            all_X.append(X)
            all_y.append(y.reset_index(drop=True))

    if not all_X or not all_y:
        raise ValueError("🚨 No valid datasets found. Check TARGET_COLUMNS or ensure target values vary.")

    X_combined = pd.concat(all_X, axis=0).reset_index(drop=True)
    y_combined = pd.concat(all_y, axis=0).reset_index(drop=True)
    print(f"\n✅ Final merged dataset shape: {X_combined.shape}, {y_combined.shape}")
    return X_combined, y_combined

def train_models(X_combined, y_combined):
    """Train every estimator in the module-level ``models`` dict and rank them.

    Rows with a missing target are removed, features are mean-imputed and
    standardised, 20% of the rows are held out for testing, and each model is
    cross-validated, fitted and scored. Learning curves of all models are drawn
    on one figure. The two models with the best test accuracy are refitted on
    all rows and saved with joblib.

    Args:
        X_combined: Feature DataFrame.
        y_combined: Target Series with one entry per row of ``X_combined``.
            It is not modified.

    Side effects:
        Appends the names of models that raised to the module-level
        ``failed_models`` list, shows a matplotlib figure, and writes
        ``<Model_Name>_combined.pkl`` files under ``models/machine_learning``
        (created if missing).
    """
    print("\n🔍 Handling missing values...")

    # Remove rows without a label from BOTH X and y. Dropping NaN from y alone
    # would leave X and y with different lengths, and doing it in place would
    # also alter the caller's series.
    labelled = y_combined.notna().to_numpy()
    X_combined = X_combined.loc[labelled].reset_index(drop=True)
    y_combined = y_combined.loc[labelled].reset_index(drop=True)

    # SimpleImputer discards columns that are entirely NaN, which would make the
    # column relabelling below fail; remove them explicitly first.
    X_combined = X_combined.dropna(axis=1, how="all")
    X_combined = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(X_combined), columns=X_combined.columns)

    # Feature scaling
    # NOTE(review): the scaler is fitted before the split, so held-out rows
    # influence the scaling statistics, and the fitted scaler is not saved with
    # the models.
    scaler = StandardScaler()
    X_combined = pd.DataFrame(scaler.fit_transform(X_combined), columns=X_combined.columns)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

    model_results = {}

    plt.figure(figsize=(8, 5))
    for name, model in models.items():
        print(f"\n🚀 Training {name}...")
        try:
            cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
            model.fit(X_train, y_train)
            test_accuracy = accuracy_score(y_test, model.predict(X_test))
            model_results[name] = test_accuracy

            print(f"📈 {name}: CV Accuracy = {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
            print(f"✅ {name}: Test Accuracy = {test_accuracy:.4f}")

            # Learning Curve (mean cross-validated score per training-set size)
            train_sizes, train_scores, test_scores = learning_curve(
                model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1,
                train_sizes=np.linspace(0.1, 1.0, 5)
            )
            plt.plot(train_sizes, np.mean(test_scores, axis=1), marker='o', label=f"{name} (Acc: {test_accuracy:.2f})")

        except Exception as e:
            # One failing model must not stop the comparison; record it and go on
            print(f"⚠️ Error training {name}: {e}")
            failed_models.append(name)

    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.title("Learning Curve - All Models")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

    # Rank models
    ranked_models = sorted(model_results.items(), key=lambda x: x[1], reverse=True)
    print("\n📊 Model Rankings by Test Accuracy:")
    print(pd.DataFrame(ranked_models, columns=["Model", "Test Accuracy"]).to_string(index=False))

    # Save top models, refitted on all rows.
    # The target folder is created first; joblib.dump does not create it.
    # NOTE(review): this relative path differs from ML_MODELS_DIR defined at the
    # top of the notebook — confirm which location is intended.
    save_dir = 'models/machine_learning'
    if ranked_models:
        os.makedirs(save_dir, exist_ok=True)
    for name, _ in ranked_models[:2]:
        models[name].fit(X_combined, y_combined)
        joblib.dump(models[name], f'{save_dir}/{name.replace(" ", "_")}_combined.pkl')

    print("\n🚀 Model evaluation, ranking, and saving completed!")
    print(f"⚠️ Models that failed: {failed_models}")

# Run the pipeline: merge all (features, target) pairs, then train, rank and save the models
X_combined, y_combined = process_data(final_cleaned_dataframes, TARGET_COLUMNS)
train_models(X_combined, y_combined)

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (
    RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Define models and Bayesian hyperparameter search spaces.
# Each value is an (estimator, search space) tuple consumed by train_and_rank_models.
# NOTE(review): this rebinds the module-level name `models`. train_models iterates
# `models` expecting plain estimators, so it fails if called again after this cell
# has run, until the earlier cell defining `models` is re-executed.
models = {
    "Random Forest": (RandomForestClassifier(random_state=42), {
        'n_estimators': Integer(100, 1000), 'max_depth': Integer(3, 30),
        'min_samples_split': Integer(2, 15), 'max_features': Categorical(['sqrt', 'log2', None])
    }),
    "Logistic Regression": (LogisticRegression(max_iter=5000, random_state=42), {
        'C': Real(0.01, 10, prior='log-uniform'), 'solver': Categorical(['liblinear', 'lbfgs', 'saga'])
    }),
    "SVM": (SVC(probability=True, random_state=42), {
        'C': Real(0.1, 10, prior='log-uniform'), 'gamma': Real(0.01, 1, prior='log-uniform'),
        'kernel': Categorical(['linear', 'rbf'])
    }),
    "KNN": (KNeighborsClassifier(), {
        'n_neighbors': Integer(3, 15), 'weights': Categorical(['uniform', 'distance']),
        'metric': Categorical(['euclidean', 'manhattan', 'minkowski'])
    }),
    "Decision Tree": (DecisionTreeClassifier(random_state=42), {
        'max_depth': Integer(3, 20), 'min_samples_split': Integer(2, 10)
    }),
    "AdaBoost": (AdaBoostClassifier(random_state=42), {
        'n_estimators': Integer(50, 500), 'learning_rate': Real(0.01, 1, prior='log-uniform')
    }),
    "Gradient Boosting": (GradientBoostingClassifier(random_state=42), {
        'n_estimators': Integer(50, 500), 'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'max_depth': Integer(3, 15)
    })
}

def optimize_model(model, param_space, X_train, y_train):
    """Tune *model* with BayesSearchCV and return (best estimator, best CV score).

    The search runs 50 iterations with 3-fold cross-validation, scored by accuracy.
    """
    search_settings = {
        "n_iter": 50,
        "cv": 3,
        "scoring": 'accuracy',
        "n_jobs": -1,
        "random_state": 42,
    }
    search = BayesSearchCV(model, param_space, **search_settings)
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_score_

def train_and_rank_models(X_train, y_train):
    """Tune every entry of the module-level ``models`` dict and save the best two.

    Each (estimator, search space) pair is optimised with ``optimize_model``.
    Models whose optimisation raises are reported and left out of the ranking.
    The two models with the highest cross-validated score are written to
    ``models/machine_learning/<Model_Name>_optimized.pkl``; the folder is
    created when missing.

    Args:
        X_train: Training features.
        y_train: Training targets.
    """
    results = {}

    for name, (model, param_space) in models.items():
        print(f"\n🔍 Optimizing {name}...")
        try:
            best_model, best_score = optimize_model(model, param_space, X_train, y_train)
            results[name] = (best_model, best_score)
            print(f"✅ {name}: Accuracy = {best_score:.4f}")
        except Exception as e:
            # Keep tuning the remaining models when one of them fails
            print(f"⚠️ {name} failed: {e}")

    # Sort models by accuracy and save the top two
    top_models = sorted(results.items(), key=lambda x: x[1][1], reverse=True)[:2]

    # joblib.dump does not create missing folders
    save_dir = 'models/machine_learning'
    if top_models:
        os.makedirs(save_dir, exist_ok=True)
    for name, (model, _) in top_models:
        joblib.dump(model, f'{save_dir}/{name.replace(" ", "_")}_optimized.pkl')

    print("\n🚀 Saved top 2 models:", [name for name, _ in top_models])

# Pick the data to tune on
if "train_test_data" not in globals():
    # No project data available: fall back to the iris example dataset
    from sklearn.datasets import load_iris
    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(iris.data), pd.Series(iris.target), test_size=0.2, random_state=42)
elif "X_train" not in globals() or "y_train" not in globals():
    # train_test_data exists but no split has been unpacked yet: take the first
    # stored dataset, the same way the deep-learning cell does. Without this the
    # call below would fail with a NameError.
    dataset_name = list(train_test_data.keys())[0]
    X_train, X_test, y_train, y_test = train_test_data[dataset_name]

# Run tuning
train_and_rank_models(X_train, y_train)

### Deep Learning Section

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Stop early when no train/test splits have been prepared in this session
if "train_test_data" not in globals():
    raise ValueError("🚨 No train-test data found!")

# Use the first dataset registered in train_test_data;
# each entry unpacks into (X_train, X_test, y_train, y_test)
dataset_name = list(train_test_data)[0]
X_train, X_test, y_train, y_test = train_test_data[dataset_name]

# Process text features
def process_text(X_train, X_test):
    """Convert all object-dtype columns into one TF-IDF feature block.

    The object columns of each row are joined with spaces (missing values
    become empty strings) and vectorised with a TF-IDF vocabulary of at most
    500 terms that is fitted on the training rows only.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame) with the same columns.

    Returns:
        tuple: ``(train_text, test_text)`` DataFrames. When ``X_train`` has no
        object columns both frames have zero columns but still one row per
        input row, so they can be horizontally stacked with the numeric block.
    """
    text_cols = X_train.select_dtypes(include=["object"]).columns

    if text_cols.empty:
        # A bare pd.DataFrame() has 0 rows, which makes np.hstack fail against
        # the (n_rows, n_numeric) block. Keep the row count, drop only columns.
        return (
            pd.DataFrame(index=pd.RangeIndex(len(X_train))),
            pd.DataFrame(index=pd.RangeIndex(len(X_test))),
        )

    def _as_documents(frame):
        # One whitespace-joined string per row, built from the text columns.
        return frame[text_cols].fillna("").agg(" ".join, axis=1)

    vectorizer = TfidfVectorizer(max_features=500)
    train_tfidf = vectorizer.fit_transform(_as_documents(X_train)).toarray()
    test_tfidf = vectorizer.transform(_as_documents(X_test)).toarray()
    return pd.DataFrame(train_tfidf), pd.DataFrame(test_tfidf)

# Scale numerical features
def scale_numeric(X_train, X_test):
    """Standardise every non-object column.

    The scaler is fitted on the training rows only and then applied to the
    test rows.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame).

    Returns:
        tuple: ``(train_scaled, test_scaled)`` DataFrames with default integer
        row and column labels.
    """
    train_numeric = X_train.select_dtypes(exclude=["object"])
    test_numeric = X_test.select_dtypes(exclude=["object"])

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_numeric)
    scaled_test = scaler.transform(test_numeric)
    return pd.DataFrame(scaled_train), pd.DataFrame(scaled_test)

# Build the text and numeric feature blocks, then place them side by side
# (numeric columns first, text columns second)
X_train_text, X_test_text = process_text(X_train, X_test)
X_train_scaled, X_test_scaled = scale_numeric(X_train, X_test)
X_train_combined = np.hstack([X_train_scaled, X_train_text])
X_test_combined = np.hstack([X_test_scaled, X_test_text])

# Candidate architectures: label -> hidden layer widths, in order
deep_models = {
    "DNN": [128, 64],
    "Wide & Deep": [64, 128],
    "TabNet": [256],
}

def build_model(layers):
    """Assemble an uncompiled dense classifier from a list of layer widths.

    The first width becomes the input layer, sized from the module-level
    ``X_train_combined``. Every following width adds a ReLU dense layer
    followed by 30% dropout. The softmax output has one unit per distinct
    value in the module-level ``y_train``.

    Args:
        layers: Sequence of hidden layer widths; must contain at least one.

    Returns:
        keras.Sequential: The uncompiled model.
    """
    n_features = X_train_combined.shape[1]

    stack = [keras.layers.Dense(layers[0], activation='relu', input_shape=(n_features,))]
    for width in layers[1:]:
        stack.append(keras.layers.Dense(width, activation='relu'))
        stack.append(keras.layers.Dropout(0.3))
    stack.append(keras.layers.Dense(len(set(y_train)), activation='softmax'))

    return keras.Sequential(stack)

# Train models & track results
# NOTE(review): plt is not imported in this cell — presumably an earlier cell
# imports matplotlib.pyplot as plt; confirm.
model_results = {}
trained_models = {}  # fitted networks, kept so the saved files hold learned weights
plt.figure(figsize=(8, 5))

for name, layers in deep_models.items():
    model = build_model(layers)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # The test split doubles as the validation set during training.
    history = model.fit(X_train_combined, y_train, epochs=20, batch_size=32, validation_data=(X_test_combined, y_test), verbose=0)

    # Rank by the best validation accuracy seen in any epoch.
    best_val_acc = max(history.history['val_accuracy'])
    model_results[name] = best_val_acc
    trained_models[name] = model
    plt.plot(history.history['val_accuracy'], label=f"{name} ({best_val_acc:.2f})")

# Rank and save top 2 models
top_models = sorted(model_results.items(), key=lambda x: x[1], reverse=True)[:2]
os.makedirs('models/deep_learning', exist_ok=True)  # avoid failing on a missing folder
for name, _ in top_models:
    # Save the fitted network. Calling build_model() again here would write a
    # freshly initialised, untrained model to disk. The saved weights are those
    # of the final epoch, not necessarily of the best-scoring epoch.
    trained_models[name].save(f'models/deep_learning/{name}_best_model.keras')

print("\n🚀 Top models saved:", [name for name, _ in top_models])

# Plot comparison
plt.xlabel("Epochs")
plt.ylabel("Validation Accuracy")
plt.title("Deep Learning Model Comparison")
plt.legend()
plt.grid()
os.makedirs('plots', exist_ok=True)  # avoid failing on a missing folder
plt.savefig('plots/deep_learning_learning_curves.png')
plt.show()

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Stop early when no train/test splits have been prepared in this session
if "train_test_data" not in globals():
    raise ValueError("🚨 No train-test data found!")

# Use the first dataset registered in train_test_data;
# each entry unpacks into (X_train, X_test, y_train, y_test)
dataset_name = list(train_test_data)[0]
X_train, X_test, y_train, y_test = train_test_data[dataset_name]

# Process text and numerical features
def process_features(X_train, X_test):
    """Build the combined numeric + text feature matrices for train and test.

    Object-dtype columns are joined per row (missing values become empty
    strings) and vectorised with TF-IDF (at most 500 terms). All other columns
    are standardised. Both transformers are fitted on the training rows only.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features (DataFrame) with the same columns.

    Returns:
        tuple: ``(train_matrix, test_matrix)`` NumPy arrays with the scaled
        numeric columns first and the TF-IDF columns after them.
    """
    text_cols = X_train.select_dtypes(include=["object"]).columns

    if text_cols.empty:
        # No text columns: use (n_rows, 0) placeholders. An empty pd.DataFrame()
        # has 0 rows and would make np.hstack fail on the row mismatch.
        X_train_text = np.empty((len(X_train), 0))
        X_test_text = np.empty((len(X_test), 0))
    else:
        vectorizer = TfidfVectorizer(max_features=500)
        X_train_text = vectorizer.fit_transform(
            X_train[text_cols].fillna("").agg(" ".join, axis=1)
        ).toarray()
        X_test_text = vectorizer.transform(
            X_test[text_cols].fillna("").agg(" ".join, axis=1)
        ).toarray()

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.select_dtypes(exclude=["object"]))
    X_test_scaled = scaler.transform(X_test.select_dtypes(exclude=["object"]))

    return np.hstack([X_train_scaled, X_train_text]), np.hstack([X_test_scaled, X_test_text])

# Combined (numeric + text) feature matrices for the tuning run
X_train_combined, X_test_combined = process_features(X_train, X_test)

# Hyperparameter search space: layer widths, dropout rates, optimiser step
# size (sampled log-uniformly) and mini-batch size
param_space = {
    'units_1': Integer(64, 256),
    'units_2': Integer(32, 128),
    'dropout_1': Real(0.1, 0.5),
    'dropout_2': Real(0.1, 0.5),
    'learning_rate': Real(0.0001, 0.01, prior='log-uniform'),
    'batch_size': Integer(16, 64),
}

# Build model dynamically
def build_model(units_1, units_2, dropout_1, dropout_2, learning_rate):
    """Create and compile a two-hidden-layer classifier for the tuner.

    Input width comes from the module-level ``X_train_combined``; the softmax
    output has one unit per distinct value in the module-level ``y_train``.

    Args:
        units_1: Width of the first hidden layer.
        units_2: Width of the second hidden layer.
        dropout_1: Dropout rate after the first hidden layer.
        dropout_2: Dropout rate after the second hidden layer.
        learning_rate: Learning rate for the Adam optimiser.

    Returns:
        keras.Sequential: Model compiled with sparse categorical cross-entropy
        loss and an accuracy metric.
    """
    n_features = X_train_combined.shape[1]
    n_classes = len(set(y_train))

    network = keras.Sequential([
        keras.layers.Dense(units_1, activation='relu', input_shape=(n_features,)),
        keras.layers.Dropout(dropout_1),
        keras.layers.Dense(units_2, activation='relu'),
        keras.layers.Dropout(dropout_2),
        keras.layers.Dense(n_classes, activation='softmax'),
    ])

    adam = keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return network

# Tune hyperparameters: Bayesian search over param_space with 50 iterations,
# 3-fold cross-validation, accuracy scoring, all cores and a fixed seed.
# NOTE(review): keras.wrappers.scikit_learn is, as far as I know, not shipped
# with recent TensorFlow releases — confirm the pinned version.
# NOTE(review): param_space contains 'batch_size', which build_model() does not
# accept — presumably the wrapper routes it to fit(); confirm.
opt = BayesSearchCV(
    estimator=keras.wrappers.scikit_learn.KerasClassifier(build_model),
    search_spaces=param_space, n_iter=50, cv=3, scoring='accuracy',
    n_jobs=-1, random_state=42
)
opt.fit(X_train_combined, y_train)
best_model, best_params = opt.best_estimator_, opt.best_params_

print(f"\n✅ Best Hyperparameters: {best_params}")

# Train final model: fit the best estimator for 50 epochs with the tuned batch
# size, using the test split as validation data.
# NOTE(review): the plotting below assumes fit() returns a Keras History
# object — confirm for the wrapper in use.
history = best_model.fit(X_train_combined, y_train, epochs=50, batch_size=best_params['batch_size'], validation_data=(X_test_combined, y_test))

# Plot learning curve (training vs. validation accuracy per epoch) and write it
# to the relative 'plots/' folder, which must already exist
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'], label='Training Accuracy', marker='o')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', marker='o')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title(f'Deep Learning Training Curve - {dataset_name}')
plt.legend()
plt.grid()
plt.savefig('plots/deep_learning_tuned_learning_curve.png')
plt.show()

# Save best model via the wrapper's .model attribute (presumably the underlying
# Keras model — confirm); the relative 'models/deep_learning/' folder must exist
best_model.model.save('models/deep_learning/best_model_tuned.keras')
print("\n🚀 Deep Learning Model Training & Hyperparameter Tuning Completed!")

## Testing new maquettes

### Importing Testing Data

In [None]:
import pandas as pd
import os

def load_excel_files(directory):
    """Read every ``.xlsx`` / ``.xls`` file in ``directory`` into a DataFrame.

    Each workbook is read with ``pd.read_excel`` defaults, and a confirmation
    plus the first rows are printed as it is loaded.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        dict: File name without extension -> loaded DataFrame.
    """
    loaded = {}

    for filename in os.listdir(directory):
        if not filename.endswith((".xlsx", ".xls")):
            continue

        stem, _extension = os.path.splitext(filename)  # file name without extension is the key
        frame = pd.read_excel(os.path.join(directory, filename))
        loaded[stem] = frame

        print(f"\n✅ Loaded {filename}")
        print(frame.head())  # preview of the first rows

    return loaded

# Load every Excel workbook found in TESTING_DATA_DIR, keyed by file name without extension
new_dataframes = load_excel_files(TESTING_DATA_DIR)

## Saving and Loading Encoders 

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf

# Folders the prediction helpers load saved models from.
# NOTE(review): these rebind ML_MODELS_DIR / DL_MODELS_DIR, which the setup cell
# defined under "SK/machine_learning" and "SK/deep_learning", and the training
# cells save to the relative 'models/machine_learning' and 'models/deep_learning'
# paths — confirm that saving and loading point at the same folders.
ML_MODELS_DIR = os.path.join(MODELS_DIR, "machine_learning")
DL_MODELS_DIR = os.path.join(MODELS_DIR, "deep_learning")

# Pickle files holding the fitted feature encoders, the target encoders and the
# list of model feature columns
ENCODERS_PATH = os.path.join(MODELS_DIR, "feature_encoders.pkl")
TARGET_ENCODERS_PATH = os.path.join(MODELS_DIR, "target_encoders.pkl")
MODEL_FEATURES_PATH = os.path.join(MODELS_DIR, "model_features.pkl")

def save_encoders(feature_encoders, target_encoders):
    """Persist the fitted encoders so new data can be preprocessed identically.

    Args:
        feature_encoders: Object written to ``ENCODERS_PATH``.
        target_encoders: Object written to ``TARGET_ENCODERS_PATH``.
    """
    for payload, destination in (
        (feature_encoders, ENCODERS_PATH),
        (target_encoders, TARGET_ENCODERS_PATH),
    ):
        joblib.dump(payload, destination)

def load_encoders():
    """Read the stored feature and target encoders from disk.

    Returns:
        tuple: ``(feature_encoders, target_encoders)``. Each element is an
        empty dict when its file does not exist.
    """
    feature_encoders = {}
    if os.path.exists(ENCODERS_PATH):
        feature_encoders = joblib.load(ENCODERS_PATH)

    target_encoders = {}
    if os.path.exists(TARGET_ENCODERS_PATH):
        target_encoders = joblib.load(TARGET_ENCODERS_PATH)

    return feature_encoders, target_encoders

def encode_new_data(X_new, feature_encoders):
    """Apply stored encoders to the matching columns of ``X_new``.

    Each column that has an encoder is cast to ``str`` and replaced by the
    encoder's ``transform`` output. Encoders for columns that are absent from
    ``X_new`` are ignored.

    Args:
        X_new: Data to encode; modified in place.
        feature_encoders: Mapping of column name -> fitted encoder.

    Returns:
        The same ``X_new`` object, after encoding.
    """
    applicable = (column for column in feature_encoders if column in X_new)
    for column in applicable:
        as_text = X_new[column].astype(str)
        X_new[column] = feature_encoders[column].transform(as_text)
    return X_new

def make_ml_predictions(X_new):
    """Predict with every pickled ML model and return one selected result.

    A copy of ``X_new`` is encoded with the stored feature encoders. If a
    saved feature list exists, the columns are reindexed to it, filling
    missing columns with 0. Every ``.pkl`` file in ``ML_MODELS_DIR`` is loaded
    and applied.

    Args:
        X_new: Raw feature DataFrame; not modified.

    Returns:
        tuple: ``(model_name, predictions)`` for the model whose predictions
        have the highest mean value.
        NOTE(review): the mean of predicted labels is not an accuracy
        measure — confirm this selection rule is intended.
    """
    feature_encoders, _ = load_encoders()
    X_encoded = encode_new_data(X_new.copy(), feature_encoders)

    if os.path.exists(MODEL_FEATURES_PATH):
        model_features = joblib.load(MODEL_FEATURES_PATH)
        X_encoded = X_encoded.reindex(columns=model_features, fill_value=0)

    # Load every pickled estimator, keyed by file name minus its known suffix
    loaded = {}
    for filename in os.listdir(ML_MODELS_DIR):
        if not filename.endswith(".pkl"):
            continue
        label = filename.replace("_optimized.pkl", "").replace("_combined.pkl", "")
        loaded[label] = joblib.load(os.path.join(ML_MODELS_DIR, filename))

    predictions = {}
    for label, estimator in loaded.items():
        predictions[label] = estimator.predict(X_encoded)

    def _mean_prediction(item):
        return np.mean(item[1])

    return max(predictions.items(), key=_mean_prediction)
def make_dl_predictions(X_new):
    """Predict with every saved Keras model and return one selected result.

    Every ``.keras`` file in ``DL_MODELS_DIR`` is loaded and applied to
    ``X_new`` unchanged; the class index is the argmax over each output row.
    NOTE(review): the training cells feed scaled + TF-IDF features to these
    networks — confirm callers pass ``X_new`` preprocessed the same way.

    Args:
        X_new: Model-ready feature matrix.

    Returns:
        tuple: ``(model_name, class_indices)`` for the model whose predicted
        class indices have the highest mean value.
    """
    loaded = {}
    for filename in os.listdir(DL_MODELS_DIR):
        if not filename.endswith(".keras"):
            continue
        label = filename.replace("_best_model.keras", "").replace("_tuned.keras", "")
        loaded[label] = tf.keras.models.load_model(os.path.join(DL_MODELS_DIR, filename))

    predictions = {}
    for label, network in loaded.items():
        predictions[label] = np.argmax(network.predict(X_new), axis=1)

    def _mean_prediction(item):
        return np.mean(item[1])

    return max(predictions.items(), key=_mean_prediction)

def predict_best_model(X_new):
    """Pick between the selected ML result and the selected DL result.

    Args:
        X_new: Features forwarded to ``make_ml_predictions`` and
            ``make_dl_predictions``.

    Returns:
        tuple: The ``(model_name, predictions)`` pair with the larger mean
        prediction value; the deep learning result wins ties.
    """
    ml_candidate = make_ml_predictions(X_new)
    dl_candidate = make_dl_predictions(X_new)

    ml_mean = np.mean(ml_candidate[1])
    dl_mean = np.mean(dl_candidate[1])

    if ml_mean > dl_mean:
        return ml_candidate
    return dl_candidate


In [None]:
def predict_all_models(X_new):
    """Run every saved ML and deep learning model on ``X_new`` and keep all results.

    ML models (``.pkl`` files in ``ML_MODELS_DIR``) receive an encoded copy of
    ``X_new``, reindexed to the saved feature list when one exists. Deep
    learning models (``.keras`` files in ``DL_MODELS_DIR``) receive ``X_new``
    unchanged and their output is reduced to class indices with argmax.

    Args:
        X_new: Feature DataFrame for the new samples; not modified.

    Returns:
        dict: ``{"ML": {model_name: predictions},
        "Deep Learning": {model_name: class_indices}}``.
    """
    predictions = {"ML": {}, "Deep Learning": {}}

    # Machine Learning Predictions
    # make_ml_predictions() returns a single (name, predictions) tuple, not a
    # dict of models, so the per-model results are produced here using the same
    # preprocessing steps.
    feature_encoders, _ = load_encoders()
    X_encoded = encode_new_data(X_new.copy(), feature_encoders)
    if os.path.exists(MODEL_FEATURES_PATH):
        model_features = joblib.load(MODEL_FEATURES_PATH)
        X_encoded = X_encoded.reindex(columns=model_features, fill_value=0)

    for filename in os.listdir(ML_MODELS_DIR):
        if filename.endswith(".pkl"):
            name = filename.replace("_optimized.pkl", "").replace("_combined.pkl", "")
            model = joblib.load(os.path.join(ML_MODELS_DIR, filename))
            predictions["ML"][name] = model.predict(X_encoded)

    # Deep Learning Predictions
    # make_dl_predictions() likewise returns one (name, predictions) tuple.
    for filename in os.listdir(DL_MODELS_DIR):
        if filename.endswith(".keras"):
            name = filename.replace("_best_model.keras", "").replace("_tuned.keras", "")
            model = tf.keras.models.load_model(os.path.join(DL_MODELS_DIR, filename))
            predictions["Deep Learning"][name] = np.argmax(model.predict(X_new), axis=1)

    return predictions

# Generate model predictions
# NOTE(review): X_new is not defined in this cell — presumably set by an
# earlier cell; confirm.
predictions = predict_all_models(X_new)

# Run evaluation
# NOTE(review): evaluate_predictions and detected_targets are not defined in
# this cell — presumably defined earlier in the notebook; confirm.
evaluation_results = evaluate_predictions(predictions, new_dataframes, detected_targets)

# Print final model rankings based on accuracy.
# evaluation_results is read as {dataset: {model: accuracy}} and flattened into
# one row per (dataset, model) pair, best accuracy first.
print("\n🚀 Final Model Evaluation Rankings:")
ranking_df = pd.DataFrame([
    (df_name, model_name, acc) for df_name, models in evaluation_results.items() for model_name, acc in models.items()
], columns=["Dataset", "Model", "Accuracy"]).sort_values(by="Accuracy", ascending=False)

print(ranking_df.to_string(index=False))

In [None]:
def decode_predictions(predictions, target_encoders):
    """Map encoded predictions back to their original label values.

    Args:
        predictions: Mapping of dataset name ->
            ``{"ML": {model: encoded}, "Deep Learning": {model: encoded}}``.
        target_encoders: Mapping of dataset name -> fitted encoder exposing
            ``inverse_transform``.

    Returns:
        dict: Dataset name -> ``{model_name: decoded predictions}``. ML models
        are decoded first; a deep learning model with the same name replaces
        the ML entry.
    """
    decoded_results = {}

    for df_name, model_results in predictions.items():
        per_model = decoded_results[df_name] = {}

        # Both model families are decoded the same way, ML before Deep Learning
        for family in ("ML", "Deep Learning"):
            for model_name, encoded_preds in model_results[family].items():
                per_model[model_name] = target_encoders[df_name].inverse_transform(encoded_preds)

    return decoded_results

In [None]:
# Generate predictions
# NOTE(review): this repeats the prediction and evaluation steps of the
# previous cell; X_new, evaluate_predictions and detected_targets are not
# defined here — presumably set by earlier cells; confirm.
predictions = predict_all_models(X_new)

# Run evaluation
evaluation_results = evaluate_predictions(predictions, new_dataframes, detected_targets)

# Decode predictions
# NOTE(review): decode_predictions() iterates its input as
# {dataset: {"ML": ..., "Deep Learning": ...}}, while predict_all_models()
# returns {"ML": ..., "Deep Learning": ...} at the top level — confirm the
# expected shape before relying on this call.
target_encoders = joblib.load(TARGET_ENCODERS_PATH)  # Load stored target encoders
decoded_results = decode_predictions(predictions, target_encoders)

# Display final decoded predictions
# NOTE(review): the loop variable `models` rebinds the module-level `models`
# name that train_and_rank_models() reads — confirm that cell is not re-run
# afterwards.
for df_name, models in decoded_results.items():
    print(f"\n📊 Decoded Predictions for {df_name}:")
    for model_name, decoded_preds in models.items():
        print(f"✅ {model_name}: {decoded_preds[:5]}")  # Show first 5 decoded values for readability