# Bim_Predict Notebook

## Importing Libraries

In [None]:
# Import libraries
import os
import pandas as pd

# Define project folder paths (all relative to the notebook's working directory)
# Data directories
BASE_DIR = "../../"
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw_data")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed_data")
PREDICTED_DATA_DIR = os.path.join(DATA_DIR, "predicting_data")
TESTING_DATA_DIR = os.path.join(DATA_DIR, "testing_data")

# Model directories
MODELS_DIR = os.path.join(BASE_DIR, "models")
ML_MODELS_DIR = os.path.join(MODELS_DIR, "SK/machine_learning")
DL_MODELS_DIR = os.path.join(MODELS_DIR, "SK/deep_learning")
OTHER_MODELS_DIR = os.path.join(MODELS_DIR, "SK/other")

# Python modules and plots directories
PYTHON_MODULES_DIR = os.path.join(BASE_DIR, "python_modules")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")

# List of directories to create.
# TESTING_DATA_DIR is part of the list because the test-data loader calls
# os.listdir() on it, which raises FileNotFoundError when the folder is absent.
directories = [
    RAW_DATA_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR, TESTING_DATA_DIR,
    MODELS_DIR, ML_MODELS_DIR, DL_MODELS_DIR, OTHER_MODELS_DIR,
    PYTHON_MODULES_DIR, PLOTS_DIR,
]

# Create directories if they don't exist
for directory in directories:
    if not os.path.exists(directory):
        # exist_ok=True tolerates the folder appearing between the check
        # above and this call (e.g. another notebook running the same cell).
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")
    else:
        print(f"Directory already exists: {directory}")

In [None]:
import os
import pandas as pd

# Label columns: split off from the features as 'y' when a sheet is loaded.
TARGET_COLUMNS = [
    '011EC_Lot',
    '012EC_Ouvrage',
    '013EC_Localisation',
    '014EC_Mode_Constructif',
]
# Substrings handed to drop_empty_columns: two free-text keywords followed by
# every target column name.
exception_keywords = ["coupés", "coupants", *TARGET_COLUMNS]

def load_test_data():
    """Load every non-empty sheet of every ``.xlsx`` workbook in ``TESTING_DATA_DIR``.

    Each workbook is opened once and closed as soon as its sheets are read.
    Files are visited in sorted order so the result does not depend on the
    order in which the file system lists them. Excel lock files (names
    starting with ``~$``) are skipped because they are not readable workbooks.

    Returns:
        dict: Maps sheet name to ``{'X': DataFrame, 'y': DataFrame or None}``.
        ``X`` is the sheet without the target columns and without ``Id``;
        ``y`` holds the target columns, or ``None`` when the sheet does not
        contain all of them. Results are keyed by sheet name only, so a sheet
        name that occurs in several workbooks keeps the last one loaded (a
        warning is printed when that happens).
    """
    test_files = sorted(
        f for f in os.listdir(TESTING_DATA_DIR)
        if f.endswith('.xlsx') and not f.startswith('~$')
    )

    test_data = {}

    for file in test_files:
        file_path = os.path.join(TESTING_DATA_DIR, file)
        print(f"📥 Loading test file: {file}...")

        # Open the workbook once; the context manager releases the file handle.
        with pd.ExcelFile(file_path) as xls:
            sheet_names = list(xls.sheet_names)

            for sheet_name in sheet_names:
                # Reuse the already-open workbook instead of re-opening the file per sheet.
                df = pd.read_excel(xls, sheet_name=sheet_name)

                # Ensure it's a valid DataFrame before storing
                if df.empty:
                    continue

                if sheet_name in test_data:
                    print(f"⚠️ Sheet '{sheet_name}' was already loaded from another file; overwriting it with the one from {file}")

                has_all_targets = set(TARGET_COLUMNS).issubset(df.columns)
                test_data[sheet_name] = {
                    'X': df.drop(columns=TARGET_COLUMNS + ['Id'], errors='ignore'),  # Features
                    'y': df[TARGET_COLUMNS] if has_all_targets else None  # Target
                }

        print(f"✅ Loaded {len(sheet_names)} sheets from {file}")

    return test_data

# Step 1: Import data first
# The loader returns a dict keyed by sheet name; each value is a dict with the
# feature frame under 'X' and the target frame (or None) under 'y'.
test_data = load_test_data()

def clean_column_names(df):
    """Standardize the column names of ``df`` in place and return ``df``.

    Each label is converted to ``str``, lowercased, has every run of
    whitespace replaced by a single underscore, and is then stripped of every
    character that is neither a word character nor an underscore.

    The ``str`` conversion matters for non-string headers (for example an
    integer header read from a spreadsheet): the ``.str`` accessor would
    otherwise turn such labels into NaN instead of cleaning them.

    Args:
        df: DataFrame whose ``columns`` attribute is overwritten.

    Returns:
        The same DataFrame object, with the cleaned column names.
    """
    df.columns = (
        df.columns
        .astype(str)
        .str.lower()
        .str.replace(r"\s+", "_", regex=True)
        .str.replace(r"[^\w_]", "", regex=True)
    )
    return df

def remove_duplicates(df):
    """Remove repeated rows from ``df`` in place and return the same object.

    The first occurrence of each row is kept and the surviving rows keep
    their original index labels.
    """
    df.drop_duplicates(keep="first", inplace=True)
    return df

def drop_empty_columns(df, exception_keywords):
    """Drop, in place, the columns of ``df`` that contain only missing values.

    A fully-missing column is kept when its name contains any of the
    exception keywords. The comparison is case-insensitive on both sides:
    column names and keywords are lowercased before the substring test, so a
    keyword written with capitals (e.g. ``"011EC_Lot"``) still protects the
    column ``"011ec_lot"``.

    Args:
        df: DataFrame to modify.
        exception_keywords: Iterable of substrings that protect a column from
            being dropped.

    Returns:
        The same DataFrame object, without the dropped columns.
    """
    # Lowercase the keywords once instead of once per column.
    lowered_keywords = [str(keyword).lower() for keyword in exception_keywords]

    # A column is "empty" when the share of missing values is exactly 1.
    missing_cols = df.columns[df.isnull().mean() == 1]
    cols_to_drop = [
        col for col in missing_cols
        if not any(keyword in str(col).lower() for keyword in lowered_keywords)
    ]
    df.drop(columns=cols_to_drop, inplace=True)
    return df



# Step 2: Process the data
# Every sheet's feature frame is cleaned (column names, duplicate rows, empty
# columns). The cleaning helpers modify the frame in place, so the frames held
# by test_data are changed as well.
processed_test_data = {}
for sheet_name, sheet_dict in test_data.items():
    print(f"🔍 Processing sheet: {sheet_name}")

    if isinstance(sheet_dict['X'], pd.DataFrame) and not sheet_dict['X'].empty:
        X_test = clean_column_names(sheet_dict['X'])
        X_test = remove_duplicates(X_test)
        X_test = drop_empty_columns(X_test, exception_keywords)

        if isinstance(sheet_dict['y'], pd.DataFrame) and not sheet_dict['y'].empty:
            # Duplicate removal dropped rows from X only. Keep just the target
            # rows whose index labels survived, so X and y stay aligned row
            # for row; otherwise y would have more rows than X.
            y_test = sheet_dict['y'].loc[X_test.index]
        else:
            y_test = None

        processed_test_data[sheet_name] = {'X': X_test, 'y': y_test}

        print(f"✅ Successfully processed {sheet_name} (Features: {X_test.shape}, Targets: {y_test.shape if y_test is not None else 'N/A'})")
    else:
        print(f"⚠️ 'X' is missing or empty for {sheet_name}, skipping preprocessing.")

# Update test_data with processed results
test_data = processed_test_data

In [None]:
# NOTE(review): this only rebinds the notebook-level name X_test, i.e. the
# frame left over from the last iteration of the preceding processing loop.
# fillna() is called without inplace=True, so it returns a new frame; the
# frames stored in test_data are not modified and keep their missing values.
# Confirm whether the fill was meant to be applied to every sheet in test_data.
X_test = X_test.fillna(0)

In [None]:
# Show, sheet by sheet, which feature columns remain after preprocessing.
for sheet_name, sheet_dict in test_data.items():
    available_columns = sheet_dict['X'].columns.tolist()
    print(f"{sheet_name} - Available columns: {available_columns}")

In [None]:
import os
import joblib
from tensorflow import keras

def load_models(models_dir):
    """Load trained ML and DL models from ``ML_MODELS_DIR`` and ``DL_MODELS_DIR``.

    ML models are the ``.joblib`` / ``.pkl`` files of ``ML_MODELS_DIR``. Files
    whose name (without extension) ends in ``_preprocessor`` are skipped: the
    prediction code looks for ``<model_name>_preprocessor.joblib`` in that same
    folder, so those files are companions of a model, not models themselves.

    DL models are the ``.keras`` / ``.h5`` files of ``DL_MODELS_DIR`` plus any
    sub-directory of it, each passed to ``keras.models.load_model``.

    Directory entries are visited in sorted order so the resulting dicts have
    a stable order between runs.

    Args:
        models_dir: Root models directory. NOTE(review): the value is not used;
            the function reads the module-level ``ML_MODELS_DIR`` and
            ``DL_MODELS_DIR``. The parameter is kept so existing calls keep
            working.

    Returns:
        dict: ``{'ML': {name: model}, 'DL': {name: model}}`` where ``name`` is
        the file name without extension (or the directory name).
    """
    models = {'ML': {}, 'DL': {}}

    # Load ML models
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that come from a trusted source.
    for model_file in sorted(os.listdir(ML_MODELS_DIR)):
        if not model_file.endswith(('.joblib', '.pkl')):
            continue
        model_name = os.path.splitext(model_file)[0]
        if model_name.endswith('_preprocessor'):
            # Companion preprocessing object, not an estimator to predict with.
            continue
        models['ML'][model_name] = joblib.load(os.path.join(ML_MODELS_DIR, model_file))

    # Load DL models
    for model_file in sorted(os.listdir(DL_MODELS_DIR)):
        model_path = os.path.join(DL_MODELS_DIR, model_file)
        if model_file.endswith(('.keras', '.h5')):
            model_name = os.path.splitext(model_file)[0]
            models['DL'][model_name] = keras.models.load_model(model_path)
        elif os.path.isdir(model_path):
            # A directory is handed to Keras as-is and keyed by its own name.
            models['DL'][model_file] = keras.models.load_model(model_path)

    # Display the number of models and their names
    ml_model_count = len(models['ML'])
    dl_model_count = len(models['DL'])
    print(f"Imported {ml_model_count} ML models: {list(models['ML'].keys())}")
    print(f"Imported {dl_model_count} DL models: {list(models['DL'].keys())}")

    return models

# Load every available model once; `models` is a dict with the keys 'ML' and
# 'DL', each mapping a model name to the loaded model object.
models = load_models(MODELS_DIR)

In [None]:
import os

# Sanity check: warn about every ML model that has no companion
# "<model_name>_preprocessor.joblib" file next to it.
for model_name in models['ML'].keys():
    expected_file = f"{model_name}_preprocessor.joblib"
    if os.path.exists(os.path.join(ML_MODELS_DIR, expected_file)):
        continue
    print(f"⚠️ Missing preprocessor for {model_name}. Did you save it after training?")

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score


def _exact_match_accuracy(y_true, predictions):
    """Return the share of rows that are predicted exactly right.

    Targets with several columns are scored row by row: a row counts as
    correct only when every column matches. ``accuracy_score`` is not used for
    that case because it rejects multiclass-multioutput targets. Every other
    case is delegated to ``accuracy_score`` unchanged.

    Raises:
        ValueError: If multi-column targets and predictions differ in shape.
    """
    truth = np.asarray(y_true)
    if truth.ndim == 2 and truth.shape[1] > 1:
        predicted = np.asarray(predictions)
        if truth.shape != predicted.shape:
            raise ValueError(
                f"ground truth has shape {truth.shape} but predictions have shape {predicted.shape}"
            )
        # Compare as Python objects so mixed dtypes still compare element by element.
        matches = truth.astype(object) == predicted.astype(object)
        return float(np.mean(np.all(matches, axis=1)))
    return accuracy_score(y_true, predictions)


def predict_with_models(test_data, models):
    """Run every ML model on every sheet of test data and collect the results.

    Only the models under ``models['ML']`` are used; ``models['DL']`` is not
    read by this function. For each model, the optional companion file
    ``<model_name>_preprocessor.joblib`` in ``ML_MODELS_DIR`` is loaded (once
    per call, then reused for the other sheets) and applied to the features
    before predicting; without it the features are passed to the model as-is.

    Prediction and evaluation are handled separately: a failure while scoring
    is reported and leaves ``evaluation`` as ``None``, but the predictions are
    still stored. A failure while preprocessing or predicting is reported and
    that model is skipped for the sheet.

    Args:
        test_data: Dict mapping sheet name to a dict with the keys ``'X'``
            (features) and ``'y'`` (targets or ``None``). Sheets lacking either
            key are skipped with a warning.
        models: Dict with an ``'ML'`` entry mapping model name to an object
            exposing ``predict``.

    Returns:
        dict: ``{sheet_name: {model_name: {'predictions': ndarray of str,
        'evaluation': float or None}}}``.
    """
    all_results = {}
    # model name -> loaded preprocessor, or None when no file exists for it
    preprocessors = {}

    for sheet_name, data in test_data.items():
        print(f"\n🔍 Processing maquette: {sheet_name}")
        # Check if 'X' and 'y' keys exist in the data dictionary
        if 'X' not in data or 'y' not in data:
            print(f"⚠️ Missing 'X' or 'y' in test data for sheet: {sheet_name}")
            continue
        X_test = data['X']
        y_test = data['y']
        sheet_results = {}

        for model_name, model in models['ML'].items():
            try:
                print(f"➡️ Predicting with ML model: {model_name}")
                # Apply preprocessing (the file is read at most once per model)
                if model_name not in preprocessors:
                    preprocessor_path = os.path.join(ML_MODELS_DIR, f"{model_name}_preprocessor.joblib")
                    preprocessors[model_name] = (
                        joblib.load(preprocessor_path) if os.path.exists(preprocessor_path) else None
                    )
                preprocessor = preprocessors[model_name]

                if preprocessor is not None:
                    X_processed = preprocessor.transform(X_test)
                    print(f"✅ Preprocessing applied using {model_name}_preprocessor.joblib")
                else:
                    X_processed = X_test  # Fallback if no preprocessor
                    print(f"⚠️ No preprocessor found for {model_name}, using raw test data")

                # Make predictions
                predictions = np.asarray(model.predict(X_processed))

                # Convert predictions to human-readable form
                readable_predictions = predictions.astype(str)  # Could be adjusted to float/int when necessary
            except Exception as e:
                # Best effort: one failing model must not stop the others.
                print(f"⚠️ Error predicting with ML model {model_name}: {str(e)}")
                continue

            print(f"📊 Predictions: {readable_predictions[:5]}... (showing first 5)")

            # Evaluate accuracy separately so that a metric error cannot
            # discard predictions that were computed successfully.
            evaluation = None
            if y_test is None:
                print("⚠️ No ground truth provided, skipping evaluation")
            else:
                try:
                    evaluation = _exact_match_accuracy(y_test, predictions)
                    print(f"📈 Accuracy: {evaluation:.4f}")
                except Exception as e:
                    evaluation = None
                    print(f"⚠️ Could not evaluate ML model {model_name}: {str(e)}")

            sheet_results[model_name] = {
                'predictions': readable_predictions,
                'evaluation': evaluation
            }

        # Store results per sheet
        all_results[sheet_name] = sheet_results

    return all_results


# Run all ML models on all sheets, then print a short summary per sheet and model.
all_results = predict_with_models(test_data, models)
for maquette, results in all_results.items():
    print(f"\n📄 Results for maquette: {maquette}")
    for model_name, result in results.items():
        preview = result['predictions'][:5]
        accuracy = result['evaluation']
        print(f"➡️ Model: {model_name}")
        print(f"   Predictions: {preview}... (showing first 5)")
        if accuracy is None:
            print("   ⚠️ No ground truth provided, skipping evaluation")
            continue
        print(f"   Accuracy: {accuracy:.4f}")

In [None]:
def save_predictions_to_excel(all_results):
    """Write one ``<sheet_name>_predictions.xlsx`` file per sheet into ``PREDICTED_DATA_DIR``.

    The predictions of all models for a sheet are stacked into one table with
    the extra columns ``Model`` and ``Evaluation_Accuracy``. One-dimensional
    predictions go into a single ``Predicted_Value`` column. Predictions with
    several columns (one per predicted target) are written as
    ``Predicted_Value_1``, ``Predicted_Value_2``, ... because forcing them into
    a single named column would raise a shape error.

    Sheets without any model result produce no file.

    Args:
        all_results: Dict shaped like ``{sheet_name: {model_name:
            {'predictions': array-like, 'evaluation': float or None}}}``.
    """
    for sheet_name, results in all_results.items():
        output_dfs = []

        for model_name, result in results.items():
            pred_df = pd.DataFrame(result['predictions'])
            if pred_df.shape[1] > 1:
                pred_df.columns = [
                    f"Predicted_Value_{position}"
                    for position in range(1, pred_df.shape[1] + 1)
                ]
            else:
                pred_df = pd.DataFrame(result['predictions'], columns=["Predicted_Value"])
            pred_df["Model"] = model_name
            pred_df["Evaluation_Accuracy"] = result['evaluation']

            output_dfs.append(pred_df)

        if output_dfs:
            combined_df = pd.concat(output_dfs)
            # Make sure the target folder exists before writing into it.
            os.makedirs(PREDICTED_DATA_DIR, exist_ok=True)
            output_path = os.path.join(PREDICTED_DATA_DIR, f"{sheet_name}_predictions.xlsx")
            combined_df.to_excel(output_path, index=False)
            print(f"✅ Saved predictions for {sheet_name} to {output_path}")

# Export the results computed in the prediction cell instead of running every
# model on every sheet a second time.
save_predictions_to_excel(all_results)