# Bim_Predict Notebook

## Importing Libraries

In [None]:
# Import libraries
import os
import pandas as pd

# Define project folder paths.
# BASE_DIR is a relative path, so every directory below is resolved against
# the working directory the notebook/script is started from.
BASE_DIR = "../../"

# Data directories
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw_data")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed_data")
PREDICTED_DATA_DIR = os.path.join(DATA_DIR, "predicting_data")
TESTING_DATA_DIR = os.path.join(DATA_DIR, "testing_data")

# Model directories
MODELS_DIR = os.path.join(BASE_DIR, "models")
ML_MODELS_DIR = os.path.join(MODELS_DIR, "SK/machine_learning")
DL_MODELS_DIR = os.path.join(MODELS_DIR, "SK/deep_learning")
OTHER_MODELS_DIR = os.path.join(MODELS_DIR, "SK/other")

# Python modules and plots directories
PYTHON_MODULES_DIR = os.path.join(BASE_DIR, "python_modules")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")

# List of directories to create.
# TESTING_DATA_DIR is included because the test-file discovery step lists its
# contents; without it a fresh checkout fails before any prediction is made.
directories = [
    RAW_DATA_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR, TESTING_DATA_DIR,
    MODELS_DIR, ML_MODELS_DIR, DL_MODELS_DIR, OTHER_MODELS_DIR,
    PYTHON_MODULES_DIR, PLOTS_DIR,
]

# Create directories if they don't exist
for directory in directories:
    if not os.path.exists(directory):
        # exist_ok guards against another process creating the directory
        # between the existence check and the makedirs call.
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")
    else:
        print(f"Directory already exists: {directory}")

In [None]:
# Names of the label columns the models predict; the first entry is used as
# the target whenever a model produces a single output.
TARGET_COLUMNS = [
    '011ec_lot',
    '012ec_ouvrage',
    '013ec_localisation',
    '014ec_mode_constructif',
]


In [None]:
# import os
# import pandas as pd
# import numpy as np
# import joblib
# import tensorflow as tf
# from tensorflow import keras
# from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.metrics import accuracy_score, classification_report

In [None]:
import os
import pandas as pd
from tqdm import tqdm

def load_test_data(file_path):
    """Load every worksheet of an Excel workbook.

    Args:
        file_path: Path of the workbook to read.

    Returns:
        A dict mapping each sheet name to its DataFrame. An empty dict is
        returned (and the error printed) when the workbook cannot be read,
        so callers can simply iterate over the result.
    """
    try:
        # The context manager closes the underlying file handle even when
        # parsing one of the sheets raises.
        with pd.ExcelFile(file_path) as xls:
            return {sheet: pd.read_excel(xls, sheet_name=sheet) for sheet in xls.sheet_names}
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return {}

def get_test_files():
    """Return the names of the Excel workbooks found in TESTING_DATA_DIR.

    Returns:
        A sorted list of file names (not full paths) ending in ``.xlsx``
        (case-insensitive). Excel lock files (``~$name.xlsx``), which appear
        while a workbook is open and are not readable workbooks, are skipped.
        An empty list is returned when the directory does not exist.
    """
    if not os.path.isdir(TESTING_DATA_DIR):
        print(f"Testing data directory not found: {TESTING_DATA_DIR}")
        return []

    # Sorted so that processing order is reproducible across runs/platforms.
    return sorted(
        f for f in os.listdir(TESTING_DATA_DIR)
        if f.lower().endswith('.xlsx') and not f.startswith('~$')
    )

In [None]:
import pandas as pd

def clean_test_data(df):
    """Return a cleaned copy of a test sheet.

    The input frame is left untouched. Exact duplicate rows are dropped and
    missing values are filled: text columns get the string ``'missing'``,
    every other column gets ``0``.

    NOTE(review): this is placeholder logic; it must mirror whatever cleaning
    was applied to the training data - confirm against the training code.

    Args:
        df: Raw DataFrame read from one worksheet.

    Returns:
        The cleaned DataFrame (original index labels are kept).
    """
    df = df.copy()
    # Your cleaning logic here (same as training)
    df = df.drop_duplicates()
    for col in df.columns:
        dtype = df[col].dtype
        # Text can be stored either as plain ``object`` or as the pandas
        # string extension dtype; filling the latter with 0 raises, so both
        # must take the 'missing' branch.
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        df[col] = df[col].fillna('missing' if is_text else 0)
    return df

def prepare_features_targets(df):
    """Split a cleaned sheet into model inputs and ground-truth labels.

    Args:
        df: Cleaned DataFrame for one worksheet.

    Returns:
        A ``(X, y)`` tuple. ``X`` is ``df`` without the target columns and
        without ``'Id'`` (absent columns are ignored). ``y`` holds the target
        columns, or is ``None`` unless every target column is present.
        ``(None, None)`` is returned if a ``KeyError`` occurs.
    """
    try:
        features = df.drop(columns=[*TARGET_COLUMNS, 'Id'], errors='ignore')
        # Ground truth is only usable when the sheet carries all targets.
        has_all_targets = set(TARGET_COLUMNS).issubset(df.columns)
        targets = df[TARGET_COLUMNS] if has_all_targets else None
    except KeyError as e:
        print(f"Missing columns: {e}")
        return None, None
    return features, targets

In [None]:
import os
import joblib
import tensorflow as tf
from tensorflow import keras

def load_ml_model(model_path):
    """Load a joblib-serialized artifact from disk.

    NOTE: joblib files are pickle-based, so only artifacts from trusted
    locations should be loaded.

    Args:
        model_path: Path of the serialized file.

    Returns:
        The deserialized object, or ``None`` (after printing the error) when
        loading fails for any reason.
    """
    try:
        artifact = joblib.load(model_path)
    except Exception as e:
        print(f"Error loading ML model {model_path}: {str(e)}")
        artifact = None
    return artifact

def load_dl_model(model_path):
    """Load a Keras model from disk.

    Args:
        model_path: Path handed to ``keras.models.load_model``.

    Returns:
        The loaded model, or ``None`` (after printing the error) when loading
        fails for any reason.
    """
    try:
        network = keras.models.load_model(model_path)
    except Exception as e:
        print(f"Error loading DL model {model_path}: {str(e)}")
        network = None
    return network

def _load_optional_artifact(path):
    """Load a joblib artifact if the file exists, otherwise return None."""
    return load_ml_model(path) if os.path.exists(path) else None


def load_all_models():
    """Load every ML and DL model together with its companion artifacts.

    ML models are ``.joblib``/``.pkl`` files in ML_MODELS_DIR; an optional
    ``<name>_preprocessor.joblib`` next to a model is attached to it.
    DL models are sub-directories of DL_MODELS_DIR; optional
    ``preprocessor.joblib`` and ``label_encoders.joblib`` files inside the
    model directory are attached to it.

    Returns:
        A list of dicts with keys ``'name'``, ``'type'`` (``'ML'`` or
        ``'DL'``), ``'model'`` and ``'preprocessor'``; DL entries also carry
        ``'label_encoders'``. Models that fail to load are skipped.
    """
    models_info = []

    # Load ML models (sorted so the processing order is reproducible)
    for model_file in sorted(os.listdir(ML_MODELS_DIR)):
        if not model_file.endswith(('.joblib', '.pkl')):
            continue

        model_name = os.path.splitext(model_file)[0]
        # Companion preprocessor files share the model extension; they are
        # attached to their model below and must not be listed as models.
        if model_name.endswith('_preprocessor'):
            continue

        model = load_ml_model(os.path.join(ML_MODELS_DIR, model_file))
        if model is None:
            continue

        preprocessor = _load_optional_artifact(
            os.path.join(ML_MODELS_DIR, f"{model_name}_preprocessor.joblib"))

        models_info.append({
            'name': model_name,
            'type': 'ML',
            'model': model,
            'preprocessor': preprocessor
        })

    # Load DL models: only directories are considered
    for model_dir in sorted(os.listdir(DL_MODELS_DIR)):
        model_path = os.path.join(DL_MODELS_DIR, model_dir)
        if not os.path.isdir(model_path):
            continue

        model = load_dl_model(model_path)
        if model is None:
            continue

        models_info.append({
            'name': model_dir,
            'type': 'DL',
            'model': model,
            'preprocessor': _load_optional_artifact(
                os.path.join(model_path, 'preprocessor.joblib')),
            'label_encoders': _load_optional_artifact(
                os.path.join(model_path, 'label_encoders.joblib'))
        })

    return models_info

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error

def predict_ml(model, preprocessor, X_test, label_encoders=None):
    """Predict with a scikit-learn style model.

    ``model.predict`` is used for every estimator, so classifiers return
    class labels (not probabilities or positional indices) and regressors
    return values.

    Args:
        model: Fitted estimator exposing ``predict``.
        preprocessor: Optional fitted transformer exposing ``transform``;
            ``None`` means the features are passed through unchanged.
        X_test: Feature table to predict on.
        label_encoders: Accepted for signature parity with ``predict_dl``;
            not used here.

    Returns:
        A 1-D array of predictions for single-output models, a dict
        ``{target_name: 1-D array}`` for multi-output models, or ``None``
        (after printing the error) when prediction fails.
    """
    try:
        # Explicit None check: some transformers define __len__, so relying on
        # truthiness could silently skip a valid preprocessor.
        if preprocessor is not None:
            X_processed = preprocessor.transform(X_test)
        else:
            X_processed = X_test

        preds = np.asarray(model.predict(X_processed))

        if preds.ndim == 2:
            if preds.shape[1] == 1:
                return preds.ravel()
            # Multi-output model: one column per target.
            # NOTE(review): assumes output columns follow TARGET_COLUMNS
            # order, as predict_dl does - confirm against training code.
            return {
                target: preds[:, i]
                for i, target in enumerate(TARGET_COLUMNS[:preds.shape[1]])
            }
        return preds
    except Exception as e:
        print(f"ML prediction error: {str(e)}")
        return None

def predict_dl(model, preprocessor, X_test, label_encoders=None):
    """Predict with a deep-learning model and decode its outputs.

    Args:
        model: Object exposing ``predict``.
        preprocessor: Optional transformer exposing ``transform``; skipped
            when falsy.
        X_test: Feature table to predict on.
        label_encoders: Optional mapping of target name to an encoder
            exposing ``inverse_transform``.

    Returns:
        * list output -> dict ``{target: predictions}``, pairing outputs with
          TARGET_COLUMNS in order;
        * 2-D output with more than one column -> arg-max class per row for
          the first target (decoded when an encoder is available);
        * anything else -> the flattened raw output;
        * ``None`` (after printing the error) when prediction fails.
    """
    try:
        features = preprocessor.transform(X_test) if preprocessor else X_test
        raw = model.predict(features)

        # Multiple outputs: one score matrix per target
        if isinstance(raw, list):
            decoded = {}
            for target, scores in zip(TARGET_COLUMNS, raw):
                if label_encoders and target in label_encoders:
                    decoded[target] = label_encoders[target].inverse_transform(
                        np.argmax(scores, axis=1))
                else:
                    decoded[target] = np.argmax(scores, axis=1)
            return decoded

        # Single output that is not a class-score matrix: treat as regression
        if not (raw.ndim > 1 and raw.shape[1] > 1):
            return raw.flatten()

        # Single classification output belongs to the first target
        if label_encoders and TARGET_COLUMNS[0] in label_encoders:
            encoder = label_encoders[TARGET_COLUMNS[0]]
            return encoder.inverse_transform(np.argmax(raw, axis=1))
        return np.argmax(raw, axis=1)
    except Exception as e:
        print(f"DL prediction error: {str(e)}")
        return None

def evaluate_predictions(y_true, y_pred, target_name):
    """Score predictions for one target.

    The target is treated as categorical when its dtype is ``object`` or it
    has fewer than 20 distinct values; otherwise it is treated as continuous.

    Args:
        y_true: Ground-truth values (must expose ``dtype``).
        y_pred: Predicted values.
        target_name: Target label, used only in the error message.

    Returns:
        ``accuracy_score`` for categorical targets, ``mean_squared_error``
        otherwise, or ``None`` (after printing the error) when scoring fails.
    """
    try:
        looks_categorical = y_true.dtype == 'object' or len(np.unique(y_true)) < 20
        scorer = accuracy_score if looks_categorical else mean_squared_error
        return scorer(y_true, y_pred)
    except Exception as e:
        print(f"Evaluation error for {target_name}: {str(e)}")
        return None

In [None]:
import os
import json
import pandas as pd
import numpy as np

def save_predictions(original_df, predictions, model_name, sheet_name, file_name):
    """Write the input rows plus prediction columns to an Excel file.

    The file is written to
    ``PREDICTED_DATA_DIR/<file_name without extension>/<sheet_name>_<model_name>.xlsx``.

    Args:
        original_df: Rows the predictions belong to (not modified).
        predictions: Either a dict ``{target: values}`` or a single sequence
            of values for the first target column.
        model_name: Model identifier used in the output file name.
        sheet_name: Source sheet name used in the output file name.
        file_name: Source workbook file name; its stem names the output folder.

    Returns:
        The path of the written file, or ``None`` (after printing the error)
        when anything fails.
    """
    try:
        # Normalise the single-target case so both shapes share one code path
        if isinstance(predictions, dict):
            per_target = predictions
        else:
            per_target = {TARGET_COLUMNS[0]: predictions}

        result_df = original_df.copy()
        for target, values in per_target.items():
            result_df[f'Predicted_{target}'] = values

        workbook_stem = os.path.splitext(file_name)[0]
        output_dir = os.path.join(PREDICTED_DATA_DIR, workbook_stem)
        os.makedirs(output_dir, exist_ok=True)

        output_path = os.path.join(output_dir, f"{sheet_name}_{model_name}.xlsx")
        result_df.to_excel(output_path, index=False)
    except Exception as e:
        print(f"Error saving predictions: {str(e)}")
        return None
    return output_path

def generate_summary_report(all_results, output_file='model_performance_summary.json'):
    """Condense per-model results into a JSON report and return it.

    Args:
        all_results: Nested dict ``{file: {sheet: {model: result}}}`` where
            each result may hold ``'evaluation'``, ``'output_path'`` and/or
            ``'error'``.
        output_file: Name of the JSON file written inside PREDICTED_DATA_DIR.

    Returns:
        Nested dict with the same file/sheet/model layout. A model entry is
        ``{'metrics', 'output_path'}`` when it has a non-empty evaluation,
        ``{'error'}`` when it only has an error, and is omitted otherwise.
        Sheets and files left without entries are omitted as well.
    """
    def summarize(outcome):
        # A non-empty evaluation takes priority over a recorded error.
        if 'evaluation' in outcome and outcome['evaluation']:
            return {
                'metrics': outcome['evaluation'],
                'output_path': outcome.get('output_path', '')
            }
        if 'error' in outcome:
            return {'error': outcome['error']}
        return None

    summary = {}
    for file_name, file_results in all_results.items():
        per_file = {}
        for sheet_name, sheet_results in file_results.items():
            per_sheet = {}
            for model_name, outcome in sheet_results.items():
                entry = summarize(outcome)
                if entry is not None:
                    per_sheet[model_name] = entry
            if per_sheet:
                per_file[sheet_name] = per_sheet
        if per_file:
            summary[file_name] = per_file

    # Save summary
    report_path = os.path.join(PREDICTED_DATA_DIR, output_file)
    with open(report_path, 'w') as f:
        json.dump(summary, f, indent=2)

    return summary

def print_performance_summary(summary):
    """Print the average metric of every evaluated model to the console.

    Args:
        summary: Nested dict ``{file: {sheet: {model: info}}}``. Only entries
            whose ``info`` contains ``'metrics'`` are printed; metric values
            of ``None`` (targets whose evaluation failed) are ignored when
            averaging.
    """
    print("\nTop performing models:")
    for file_name, file_summary in summary.items():
        print(f"\nFile: {file_name}")
        for sheet_name, sheet_summary in file_summary.items():
            print(f"\n  Sheet: {sheet_name}")
            for model_name, model_info in sheet_summary.items():
                if 'metrics' not in model_info:
                    continue
                # A failed evaluation is stored as None; averaging it would
                # raise a TypeError and abort the whole report.
                scores = [s for s in model_info['metrics'].values() if s is not None]
                if not scores:
                    print(f"    {model_name}: no valid metrics")
                    continue
                avg_score = np.mean(scores)
                print(f"    {model_name}: Average score = {avg_score:.4f}")

In [None]:
import os
from tqdm import tqdm

def _evaluate_against_truth(y_test, predictions):
    """Score predictions per target against the ground-truth columns."""
    if isinstance(predictions, dict):  # Multiple targets
        return {
            target: evaluate_predictions(y_test[target], pred, target)
            for target, pred in predictions.items()
        }
    # Single target: predictions belong to the first target column
    primary = TARGET_COLUMNS[0]
    return {primary: evaluate_predictions(y_test[primary], predictions, primary)}


def process_test_file(file_path, models_info):
    """Run every loaded model on every sheet of one test workbook.

    Args:
        file_path: Path of the Excel workbook to process.
        models_info: List of model descriptors as produced by
            ``load_all_models`` (keys ``'name'``, ``'type'``, ``'model'`` and
            optionally ``'preprocessor'`` / ``'label_encoders'``).

    Returns:
        ``{sheet_name: {model_name: result}}``. A successful result holds
        ``'predictions'``, ``'evaluation'`` (empty when the sheet has no
        ground truth) and ``'output_path'``; a failed one holds ``'error'``.
        Sheets without usable features are omitted.
    """
    results = {}
    test_data = load_test_data(file_path)
    workbook_name = os.path.basename(file_path)

    for sheet_name, df in test_data.items():
        df_clean = clean_test_data(df)
        X_test, y_test = prepare_features_targets(df_clean)

        # Nothing to predict on: skip instead of sending an empty frame
        # through every model.
        if X_test is None or X_test.empty:
            continue

        sheet_results = {}
        for model_info in models_info:
            model_name = model_info['name']
            model_type = model_info['type']
            predictions = None

            try:
                if model_type == 'ML':
                    predictions = predict_ml(
                        model_info['model'],
                        model_info.get('preprocessor'),
                        X_test,
                        model_info.get('label_encoders')
                    )
                elif model_type == 'DL':
                    predictions = predict_dl(
                        model_info['model'],
                        model_info.get('preprocessor'),
                        X_test,
                        model_info.get('label_encoders')
                    )

                # The predict helpers return None on failure; record that as
                # an error rather than writing a file with an empty column.
                if predictions is None:
                    sheet_results[model_name] = {
                        'error': 'Model returned no predictions'
                    }
                    continue

                # Evaluate if we have ground truth
                evaluation = {}
                if y_test is not None:
                    evaluation = _evaluate_against_truth(y_test, predictions)

                # Save predictions
                output_path = save_predictions(
                    df_clean,
                    predictions,
                    model_name,
                    sheet_name,
                    workbook_name
                )

                sheet_results[model_name] = {
                    'predictions': predictions,
                    'evaluation': evaluation,
                    'output_path': output_path
                }

            except Exception as e:
                print(f"Error processing {model_name} on {sheet_name}: {str(e)}")
                sheet_results[model_name] = {
                    'error': str(e)
                }

        results[sheet_name] = sheet_results

    return results

def main():
    """Run all models on all test workbooks and report their performance.

    Loads the models, processes every workbook returned by
    ``get_test_files``, writes the JSON summary into PREDICTED_DATA_DIR and
    prints the per-model averages.
    """
    # Create output directory
    os.makedirs(PREDICTED_DATA_DIR, exist_ok=True)

    # Load all models
    models_info = load_all_models()
    print(f"Loaded {len(models_info)} models")

    # Process all test files
    test_files = get_test_files()
    all_results = {}

    for test_file in tqdm(test_files, desc="Processing test files"):
        file_path = os.path.join(TESTING_DATA_DIR, test_file)
        results = process_test_file(file_path, models_info)
        all_results[test_file] = results

    # Generate and display summary; the file name is passed explicitly so the
    # message below always reports the location that was actually written.
    summary_file = 'model_performance_summary.json'
    summary = generate_summary_report(all_results, output_file=summary_file)
    summary_path = os.path.join(PREDICTED_DATA_DIR, summary_file)
    print(f"\nModel performance summary saved to {summary_path}")
    print_performance_summary(summary)


if __name__ == "__main__":
    main()