### Testing Raw Cible WB

## Part 1: Setup and Data Loading


In [31]:
### ====================
### Importing libraries
### ====================
# %matplotlib inline
# %pip install openpyxl
import openpyxl
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# import shap
import os
import warnings

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
### ====================
### Set up visualization and warnings
### ====================
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)
sns.set_palette('viridis')


In [32]:
### ====================
### File paths
###  ===================
# Create directories for model saving
models_dir = '../../models'
for model_type in ['simple_models', 'ml_models', 'dl_models']:
    models_dir = os.path.join(models_dir, model_type)
    os.makedirs(models_dir, exist_ok=True)

# Load Excel file
maquettes_path = "../../data/raw/"
maquettes= ["RawData-Cibles.xlsx"]
for maquette in maquettes:
    maquettes_path = os.path.join(maquettes_path, maquette)
sheets = ["Mur", "Sols", "Poutre", "Poteaux"]  # Adjusted based on your description


In [33]:
### ====================
### import data from the Excel file
### ====================
try:
    murs_df = pd.read_excel(maquettes_path, sheet_name='Murs')
    sols_df = pd.read_excel(maquettes_path, sheet_name='Sols')
    poutres_df = pd.read_excel(maquettes_path, sheet_name='Poutres')
    poteaux_df = pd.read_excel(maquettes_path, sheet_name='Poteaux')
    print("Data loaded successfully from the Excel file.")
except Exception as e:
    print(f"Error loading data: {e}")
    # Handle missing sheets
    available_sheets = pd.ExcelFile(maquettes_path).sheet_names
    print(f"Available sheets: {available_sheets}")
    # Try to load available sheets
    dfs = {}
    for sheet in sheets:
        if sheet in available_sheets:
            dfs[sheet] = pd.read_excel(maquettes_path, sheet_name=sheet)
        else:
            print(f"Sheet '{sheet}' not found in the Excel file.")
    mur_df = dfs.get('Murs', pd.DataFrame())
    sol_df = dfs.get('Sols', pd.DataFrame())
    poutres_df = dfs.get('Poutres', pd.DataFrame())
    poteaux_df = dfs.get('Poteaux', pd.DataFrame())

# Display basic info about each dataframe
print("Murs DataFrame Shape:", murs_df.shape)
print("Sols DataFrame Shape:", sols_df.shape)
print("Poutres DataFrame Shape:", poutres_df.shape)
print("Poteaux DataFrame Shape:", poteaux_df.shape)

# Display the first few rows of each dataframe
print("\nMurs DataFrame Preview:")
print(murs_df.columns)
print("\nSols DataFrame Preview:")
print(sols_df.columns)
print("\nPoutres DataFrame Preview:")
print(poutres_df.columns)
print("\nPoteaux DataFrame Preview:")
print(poteaux_df.columns)

Data loaded successfully from the Excel file.
Murs DataFrame Shape: (312, 96)
Sols DataFrame Shape: (107, 94)
Poutres DataFrame Shape: (246, 100)
Poteaux DataFrame Shape: (68, 87)

Murs DataFrame Preview:
Index(['Id', '011EC_Lot', '012EC_Ouvrage', '013EC_Localisation',
       '014EC_Mode Constructif', 'Nom', 'Hauteur', 'Epaisseur', 'AI', 'AS',
       'Sols en intersection', 'Sols coupés (u)', 'Sols coupés (Ids)',
       'Sols coupants (u)', 'Sols coupants (Ids)', 'Sol au-dessus',
       'Sol en-dessous', 'Fenêtres', 'Portes', 'Ouvertures', 'Murs imbriqués',
       'Mur multicouche', 'Mur empilé', 'Profil modifié', 'Image', 'Catégorie',
       'Section', 'Type prédéfini d'IFC', 'Exporter au format IFC sous',
       'Exporter au format IFC', 'IfcGUID', 'A une association',
       'Enrobage d'armature - Autres faces',
       'Enrobage d'armature - Face intérieure',
       'Enrobage d'armature - Face extérieure', 'Variantes',
       'Extension inférieure', 'Extension supérieure', 'Volume',

## Part 2: Data Preprocessing and Relationship Mapping


In [None]:
### ====================
### Define essential columns for each DataFrame
### ====================
ESSENTIAL_COLUMNS = {
    "Murs": [
        "Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Hauteur", "Epaisseur", "AI", "AS", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
        "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol en-dessous", "Fenêtres", "Portes", "Ouvertures", "Murs imbriqués",
        "Mur multicouche", "Profil modifié", "Extension inférieure", "Extension supérieure", "Volume", "Surface", "Partie inférieure attachée", "Partie supérieure attachée",
        "Décalage supérieur", "Décalage inférieur", "Matériau structurel",
    ],
    "Sols": [
        "Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Murs en intersection",
        "Murs coupés (u)", "Murs coupés (Ids)", "Murs coupants (u)", "Murs coupants (Ids)", "Poutres en intersection", "Poutres coupés (u)",
        "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)", "Poteaux en intersection",
        "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)", "Poteaux coupants (Ids)", "Volume", "Surface", "Matériau structurel",
    ],
    "Poutres": [
        "Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS", "Hauteur totale", "Hauteur", "Sols en intersection", "Sols coupés (u)",
        "Sols coupés (Ids)", "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol en-dessous", "Poteaux en intersection",
        "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)", "Matériau structurel",
        "Poteaux coupants (Ids)", "Elévation à la base", "Longueur de coupe",
    ],
    "Poteaux": [
        "Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Nom", "AI", "AS", "Hauteur", "Longueur",
        "Partie inférieure attachée", "Partie supérieure attachée", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
        "Sols coupants (u)", "Sols coupants (Ids)", "Poutres en intersection", "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)",
        "Poutres coupants (Ids)", "Matériau structurel", "Marque d'emplacement du poteau", "Décalage supérieur", "Décalage inférieur",
        "Longueur", "Sols coupés (Ids)", "Sols coupants (Ids)", "Poutres coupés (Ids)", "Poutres coupants (Ids)",
    ]
}

In [None]:
### ====================
### Data Loading and Cleaning
### ====================
def load_and_clean_data(filepath):
    """Load and clean data with robust column name handling"""
    dfs = {}

    try:
        xls = pd.ExcelFile(filepath)
        available_sheets = xls.sheet_names

        for sheet, keep_cols in ESSENTIAL_COLUMNS.items():
            if sheet in available_sheets:
                # Load and clean
                df = pd.read_excel(filepath, sheet_name=sheet)
                df.columns = (df.columns
                            .str.strip()
                            .str.replace('\s+', ' ', regex=True)
                            )

                # Select columns
                existing_cols = [col.strip() for col in keep_cols if col.strip() in df.columns]
                missing_cols = set(col.strip() for col in keep_cols) - set(existing_cols)

                if missing_cols:
                    print(f"⚠️ {sheet}: Missing {len(missing_cols)} columns: {list(missing_cols)[:3]}{'...' if len(missing_cols)>3 else ''}")

                dfs[sheet] = df[existing_cols]
                print(f"✅ {sheet}: Kept {len(existing_cols)}/{len(keep_cols)} columns | New shape: {dfs[sheet].shape}")
            else:
                print(f"⚠️ Sheet '{sheet}' not found")
                dfs[sheet] = pd.DataFrame()

    except Exception as e:
        print(f"🚨 Error: {str(e)[:100]}...")
        dfs = {sheet: pd.DataFrame() for sheet in ESSENTIAL_COLUMNS.keys()}

    return dfs

In [36]:
### ====================
### Verify that all critical columns are present
### ====================
CRITICAL_COLUMNS = ["011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif"]

for sheet, df in dataframes.items():
    missing_critical = [col for col in CRITICAL_COLUMNS if col not in df.columns]
    if missing_critical:
        print(f"🚨 Critical columns missing in {sheet}: {missing_critical}")

In [37]:
### ====================
### Cleaned DataFrames Loading
### ====================
dataframes = load_and_clean_data(maquettes_path)

# Access the cleaned DataFrames
murs_df = dataframes['Murs']
sols_df = dataframes['Sols']
poutre_df = dataframes['Poutres']
poteaux_df = dataframes['Poteaux']

# Verify the results
print("\nCleaned DataFrame Shapes:")
for name, df in dataframes.items():
    print(f"{name}: {df.shape}")

✅ Murs: Kept 31/31 columns | New shape: (312, 31)
✅ Sols: Kept 24/24 columns | New shape: (107, 24)
✅ Poutres: Kept 24/24 columns | New shape: (246, 24)
✅ Poteaux: Kept 31/31 columns | New shape: (68, 31)

Cleaned DataFrame Shapes:
Murs: (312, 31)
Sols: (107, 24)
Poutres: (246, 24)
Poteaux: (68, 31)


In [None]:
def clean_id_columns(df_dict):
    """
    Clean ID columns across all DataFrames by setting IDs to 0 when:
    - The corresponding (u) column = 0 AND
    - The (Ids) column is empty/NaN.
    """

    COLUMN_PAIRS = [
        ('Sols coupés (u)', 'Sols coupés (Ids)'),
        ('Sols coupants (u)', 'Sols coupants (Ids)'),
        ('Murs coupés (u)', 'Murs coupés (Ids)'),
        ('Murs coupants (u)', 'Murs coupants (Ids)'),
        ('Poutres coupés (u)', 'Poutres coupés (Ids)'),
        ('Poutres coupants (u)', 'Poutres coupants (Ids)'),
        ('Poteaux coupés (u)', 'Poteaux coupés (Ids)'),
        ('Poteaux coupants (u)', 'Poteaux coupants (Ids)')
    ]

    for df_name, df in df_dict.items():
        if not isinstance(df, pd.DataFrame) or df.empty:
            print(f"⚠️ {df_name}: Empty or not a DataFrame")
            continue

        print(f"\n{'='*50}\n🔍 Processing {df_name}\n{'='*50}")

        for u_col, ids_col in COLUMN_PAIRS:
            if u_col not in df.columns or ids_col not in df.columns:
                print(f"🚨 Skipping: {u_col} or {ids_col} not found in {df_name}")
                continue

            # Convert `Ids` to string safely
            df[ids_col] = df[ids_col].astype(str).replace(['nan', 'na', 'none', '', ' '], None)

            # Apply cleaning condition
            condition = (df[u_col] == 0) & (df[ids_col].isna())
            df.loc[condition, ids_col] = "0"  # Set IDs to "0" when u = 0 and Ids is empty

            # Show before/after for a few sample rows
            print(f"\n🔹 Column Pair: {u_col} → {ids_col}")
            print("BEFORE:", df.loc[condition.head(3).index, [u_col, ids_col]])
            print("AFTER:", df.loc[condition.head(3).index, [u_col, ids_col]])

    return df_dict

# Usage Example
df_dict = {
    'Murs': murs_df,
    'Sols': sols_df,
    'Poutres': poutres_df,
    'Poteaux': poteaux_df
}

cleaned_dfs = clean_id_columns(df_dict)

# Update DataFrames
murs_df = cleaned_dfs['Murs']
sols_df = cleaned_dfs['Sols']
poutres_df = cleaned_dfs['Poutres']
poteaux_df = cleaned_dfs['Poteaux']


🔍 Processing Murs

🔹 Column Pair: Sols coupés (u) → Sols coupés (Ids)
BEFORE:    Sols coupés (u) Sols coupés (Ids)
0                0                []
1                0                []
2                0                []
AFTER:    Sols coupés (u) Sols coupés (Ids)
0                0                []
1                0                []
2                0                []

🔹 Column Pair: Sols coupants (u) → Sols coupants (Ids)
BEFORE:    Sols coupants (u)          Sols coupants (Ids)
0                  3  [1788267, 1788458, 1790282]
1                  1                    [1788267]
2                  1                    [1788267]
AFTER:    Sols coupants (u)          Sols coupants (Ids)
0                  3  [1788267, 1788458, 1790282]
1                  1                    [1788267]
2                  1                    [1788267]
🚨 Skipping: Murs coupés (u) or Murs coupés (Ids) not found in Murs
🚨 Skipping: Murs coupants (u) or Murs coupants (Ids) not found in Murs
🚨 Skippin

TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
print(poutres_df.head())
print(poutres_df.dtypes)

        Id 011EC_Lot 012EC_Ouvrage 013EC_Localisation 014EC_Mode Constructif  \
0  1632051       NaN           NaN                NaN                    NaN   
1  1666165       NaN           NaN                NaN                    NaN   
2  1702917       NaN           NaN                NaN                    NaN   
3  1788246        GO        POUTRE          INTERIEUR         PREFA CHANTIER   
4  1788248        GO        POUTRE          EXTERIEUR         PREFA CHANTIER   

           Nom    AI    AS  Hauteur totale       Hauteur  \
0  Po(40x60ht) -4.32 -4.32    1.624301e-15  1.776357e-15   
1  Po(40x60ht) -4.32 -4.32    1.624301e-15  1.776357e-15   
2  Po(40x60ht) -4.32 -4.32    1.624301e-15  1.776357e-15   
3  Po(20x60ht) -0.30  0.30    6.000000e-01  6.000000e-01   
4  Po(20x60ht) -0.30  0.30    6.000000e-01  6.000000e-01   

   Sols en intersection  Sols coupés (u) Sols coupés (Ids)  Sols coupants (u)  \
0                     0                0                []                  0

In [None]:
def create_relationship_features(main_df, related_df, relation_config, prefix):
    """
    Robust relationship feature creation with:
    - Better column name handling
    - NaN/empty value protection
    - Clear validation
    """
    for relation_col, feature_cols in relation_config.items():
        # 1. Find matching column (case insensitive, handles typos)
        matching_cols = [col for col in main_df.columns
                        if relation_col.lower() in col.lower()]

        if not matching_cols:
            print(f"⚠️ No column matching '{relation_col}' found in DataFrame")
            continue

        actual_col = matching_cols[0]
        print(f"🔧 Processing {actual_col} (matched from {relation_col})")

        # 2. Clean and explode relationship IDs
        try:
            # Convert to string and clean
            main_df[actual_col] = main_df[actual_col].astype(str)
            main_df[actual_col] = (main_df[actual_col]
                                  .str.replace(r'[\[\]]', '', regex=True)
                                  .replace(['nan', 'None', 'NaN', ''], '0'))

            # Explode and convert to integers
            exploded = main_df[[actual_col]].explode(actual_col)
            exploded[actual_col] = pd.to_numeric(exploded[actual_col], errors='coerce')
            exploded = exploded.dropna()

            if exploded.empty:
                print(f"⚠️ No valid relationships in {actual_col}")
                continue

            # 3. Merge with related features
            for feature in feature_cols:
                if feature not in related_df.columns:
                    print(f"⚠️ Feature '{feature}' not in related DataFrame")
                    continue

                # Perform the merge
                merged = exploded.merge(
                    related_df[[feature]],
                    left_on=actual_col,
                    right_index=True,
                    how='left'
                )

                # Aggregate back to original
                new_col = f"{prefix}_{feature}"
                if np.issubdtype(merged[feature].dtype, np.number):
                    main_df[new_col] = merged.groupby(merged.index)[feature].mean()
                else:
                    main_df[new_col] = merged.groupby(merged.index)[feature].agg(
                        lambda x: x.mode()[0] if not x.empty else np.nan
                    )

                print(f"✅ Created {new_col}")

        except Exception as e:
            print(f"❌ Error processing {actual_col}: {str(e)}")
            continue

    return main_df

In [None]:
# Define your relationship configs (corrected for typos)
mur_relations = {
    'Sols coupés (Ids)': ['Hauteur', 'Epaisseur', 'Volume', 'Surface'],
    'Sols coupants (Ids)': ['Hauteur', 'Epaisseur', 'Volume', 'Surface']
}

# Process with corrected function
print("\nProcessing Murs relationships:")
murs_df = create_relationship_features(murs_df, sols_df, mur_relations, 'sol')

# Verify
print("\nCreated features in Murs:")
print([col for col in murs_df.columns if col.startswith('sol_')])


Processing Murs relationships:
🔧 Processing Sols coupés (Ids) (matched from Sols coupés (Ids))
⚠️ Feature 'Hauteur' not in related DataFrame
✅ Created sol_Epaisseur
✅ Created sol_Volume
✅ Created sol_Surface
🔧 Processing Sols coupants (Ids) (matched from Sols coupants (Ids))
⚠️ Feature 'Hauteur' not in related DataFrame
✅ Created sol_Epaisseur
✅ Created sol_Volume
✅ Created sol_Surface

Created features in Murs:
['sol_Epaisseur', 'sol_Volume', 'sol_Surface']


In [10]:
def full_validation():
    """Run complete validation suite with robust checks"""
    dfs = {
        'Murs': murs_df,
        'Sols': sols_df,
        'Poutres': poutres_df,
        'Poteaux': poteaux_df
    }

    # 1. Basic DataFrame verification
    print("="*50 + "\nBasic DataFrame Verification\n" + "="*50)
    for name, df in dfs.items():
        if not isinstance(df, pd.DataFrame):
            print(f"\n❌ {name}: Not a DataFrame")
            continue

        print(f"\n🔍 {name} DataFrame:")
        print(f"Shape: {df.shape}")
        print("Columns:", df.columns.tolist())

        # Check relationship columns
        rel_cols = [c for c in df.columns if 'coup' in c.lower()]
        print(f"\nRelationship columns ({len(rel_cols)}):")
        print(rel_cols)

        # Check created features
        created_features = [c for c in df.columns if any(x in c for x in ['sol_', 'mur_', 'poutre_', 'poteau_'])]
        print(f"\nCreated features ({len(created_features)}):")
        if created_features:
            print(df[created_features].head(2))
        else:
            print("No relationship features created")

    # 2. Detailed relationship validation
    print("\n" + "="*50 + "\nDetailed Relationship Validation\n" + "="*50)

    def safe_validate(main_df, related_df, relation_col, prefix):
        """Validate relationships with error handling"""
        try:
            if relation_col not in main_df.columns:
                print(f"❌ Missing relation column: {relation_col}")
                return

            created_cols = [f"{prefix}_{f}" for f in ['Hauteur', 'Epaisseur', 'Volume', 'Surface']
                          if f"{prefix}_{f}" in main_df.columns]

            if not created_cols:
                print(f"⚠️ No features created for {relation_col}")
                return

            print(f"\n✅ Validating {relation_col}:")
            print(f"Created {len(created_cols)} features")
            print("Sample values:")
            print(main_df[created_cols].head(2))

        except Exception as e:
            print(f"❌ Validation failed for {relation_col}: {str(e)}")

    # Validate each relationship type
    if isinstance(murs_df, pd.DataFrame):
        safe_validate(murs_df, sols_df, 'Sols coupés (Ids)', 'sol')
        safe_validate(murs_df, sols_df, 'Sols coupants (Ids)', 'sol')

    if isinstance(sols_df, pd.DataFrame):
        safe_validate(sols_df, murs_df, 'Murs coupés (Ids)', 'murs')
        safe_validate(sols_df, poutres_df, 'Poutres coupés (Ids)', 'poutres')

    if isinstance(poutres_df, pd.DataFrame):
        safe_validate(poutres_df, sols_df, 'Sols coupés (Ids)', 'sol')
        safe_validate(poutres_df, poteaux_df, 'Poteaux coupés (Ids)', 'poteaux')

    if isinstance(poteaux_df, pd.DataFrame):
        safe_validate(poteaux_df, sols_df, 'Sols coupés (Ids)', 'sol')
        safe_validate(poteaux_df, poutres_df, 'Poutres coupés (Ids)', 'poutres')

In [12]:
full_validation()

Basic DataFrame Verification

🔍 Murs DataFrame:
Shape: (312, 96)
Columns: ['Id', '011EC_Lot', '012EC_Ouvrage', '013EC_Localisation', '014EC_Mode Constructif', 'Nom', 'Hauteur', 'Epaisseur', 'AI', 'AS', 'Sols en intersection', 'Sols coupés (u)', 'Sols coupés (Ids)', 'Sols coupants (u)', 'Sols coupants (Ids)', 'Sol au-dessus', 'Sol en-dessous', 'Fenêtres', 'Portes', 'Ouvertures', 'Murs imbriqués', 'Mur multicouche', 'Mur empilé', 'Profil modifié', 'Image', 'Catégorie', 'Section', "Type prédéfini d'IFC", 'Exporter au format IFC sous', 'Exporter au format IFC', 'IfcGUID', 'A une association', "Enrobage d'armature - Autres faces", "Enrobage d'armature - Face intérieure", "Enrobage d'armature - Face extérieure", 'Variantes', 'Extension inférieure', 'Extension supérieure', 'Volume', 'Surface', 'Phase de démolition', 'Phase de création', 'Commentaires', 'Longueur', 'Famille et type', 'Famille', 'Type', 'Nom de la famille', 'Nom du type', 'ID du type', 'Lié au volume', 'Structure', 'Identifiant

## Part 3: Feature Engineering and Target Preparation

In [30]:
# Feature selection for Murs DataFrame
# We'll exclude the target columns and ID columns from features
excluded_features = target_columns + ['Id', 'Sols coupés (Ids)', 'Sols coupants (Ids)']
features = [col for col in mur_df.columns if col not in excluded_features]

# Separate features and targets
X = mur_df[features]
y = mur_df[target_columns]

# Handle categorical features (text with special French characters)
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=np.number).columns

print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

# Preprocessing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric transformer
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# For multi-label classification, we'll use separate models for each target
# Or we can combine them into a single target (less recommended due to different natures)
# Here we'll proceed with separate models

# Get feature names after one-hot encoding
# For numeric features
numeric_feature_names = numeric_cols.tolist()

# For categorical features
if len(categorical_cols) > 0:
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    categorical_feature_names = ohe.get_feature_names_out(categorical_cols).tolist()
    all_feature_names = numeric_feature_names + categorical_feature_names
else:
    all_feature_names = numeric_feature_names

print(f"Total features after preprocessing: {len(all_feature_names)}")

NameError: name 'target_columns' is not defined

## Part 4: Exploratory Data Analysis and Correlation Analysis

In [None]:
# Correlation analysis for each target variable
plt.figure(figsize=(15, 10))

# For numeric features only (correlation requires numeric data)
numeric_df = X[numeric_cols]

# Add targets to the numeric_df for correlation
for target in target_columns:
    if target in mur_df.columns:
        # Encode target for correlation
        le = LabelEncoder()
        encoded_target = le.fit_transform(mur_df[target])
        numeric_df[target] = encoded_target

# Compute correlation matrix
corr_matrix = numeric_df.corr()

# Plot heatmap for each target
for i, target in enumerate(target_columns, 1):
    if target in numeric_df.columns:
        plt.subplot(2, 2, i)
        target_corr = corr_matrix[target].sort_values(ascending=False)
        sns.barplot(x=target_corr.values[1:11], y=target_corr.index[1:11])
        plt.title(f'Top 10 Features Correlated with {target}')
        plt.tight_layout()

plt.show()

# Analyze distribution of target variables
plt.figure(figsize=(15, 10))
for i, target in enumerate(target_columns, 1):
    if target in mur_df.columns:
        plt.subplot(2, 2, i)
        sns.countplot(y=mur_df[target], order=mur_df[target].value_counts().index)
        plt.title(f'Distribution of {target}')
        plt.tight_layout()

plt.show()

# SHAP analysis for feature importance (sample for one target)
if '012EC_Ouvrage' in mur_df.columns:
    # Sample a subset for faster SHAP computation
    X_sample = X_processed[:1000] if X_processed.shape[0] > 1000 else X_processed

    # Train a model for this target
    y_target = mur_df['012EC_Ouvrage']
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_target)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_encoded, test_size=0.2, random_state=42)

    # Train a model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Compute SHAP values
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Plot summary
    plt.figure()
    shap.summary_plot(shap_values, X_test, feature_names=all_feature_names, class_names=le.classes_)
    plt.title('SHAP Summary for 012EC_Ouvrage Prediction')
    plt.show()

## Part 5: Model Training and Evaluation


In [None]:
# Function to train and evaluate models for a target variable
def train_evaluate_models(X, y, target_name, models):
    """
    Train and evaluate multiple models for a target variable

    Args:
        X: Features (processed)
        y: Target variable
        target_name: Name of the target variable
        models: Dictionary of models to evaluate

    Returns:
        Dictionary of model performances
    """
    # Encode target if categorical
    if y.dtype == 'object':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    results = {}

    for model_name, model in models.items():
        print(f"\nTraining {model_name} for {target_name}...")

        try:
            # Train model
            model.fit(X_train, y_train)

            # Predict
            y_pred = model.predict(X_test)

            # Evaluate
            accuracy = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, output_dict=True)

            # Store results
            results[model_name] = {
                'accuracy': accuracy,
                'precision': report['weighted avg']['precision'],
                'recall': report['weighted avg']['recall'],
                'f1': report['weighted avg']['f1-score']
            }

            print(f"{model_name} Accuracy: {accuracy:.4f}")

            # Save model based on type
            if hasattr(model, 'layers'):  # Keras model
                model_path = f"dlmodels/{target_name}_{model_name}.h5"
                model.save(model_path)
            elif 'boost' in model_name.lower() or 'forest' in model_name.lower():
                model_path = f"mlmodels/{target_name}_{model_name}.pkl"
                import joblib
                joblib.dump(model, model_path)
            else:
                model_path = f"simplemodels/{target_name}_{model_name}.pkl"
                import joblib
                joblib.dump(model, model_path)

        except Exception as e:
            print(f"Error with {model_name}: {e}")
            results[model_name] = None

    return results

# Define models to evaluate
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=42),
}

# Add a simple neural network
def create_nn_model(input_dim, output_dim):
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer=Adam(0.001),
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    return model

# For each target variable, train and evaluate models
all_results = {}

for target in target_columns:
    if target in mur_df.columns:
        print(f"\n{'='*50}")
        print(f"Training models for target: {target}")
        print(f"{'='*50}")

        y_target = mur_df[target]

        # Skip if all values are the same
        if len(y_target.unique()) == 1:
            print(f"Skipping {target} - only one class present.")
            continue

        # Add neural network to models
        output_dim = len(y_target.unique())
        nn_model = create_nn_model(X_processed.shape[1], output_dim)
        models['NeuralNetwork'] = nn_model

        # Train and evaluate
        results = train_evaluate_models(X_processed, y_target, target, models)
        all_results[target] = results

        # Remove NN for next target (to recreate with correct output dim)
        del models['NeuralNetwork']

        # Plot model comparison
        if results:
            df_results = pd.DataFrame(results).T
            df_results['accuracy'].plot(kind='bar', title=f'Model Accuracy for {target}')
            plt.ylabel('Accuracy')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

# Display all results
for target, results in all_results.items():
    print(f"\nResults for {target}:")
    if results:
        display(pd.DataFrame(results).T)

## Part 6: Model Interpretation and Deployment

In [None]:
# Function to interpret best model for each target
def interpret_best_model(target, results, X_processed, y_target):
    """
    Interpret the best model for a target using SHAP

    Args:
        target: Target variable name
        results: Dictionary of model results
        X_processed: Processed features
        y_target: Target values
    """
    if not results:
        return

    # Find best model by accuracy
    df_results = pd.DataFrame(results).T
    best_model_name = df_results['accuracy'].idxmax()
    best_model_accuracy = df_results.loc[best_model_name, 'accuracy']

    print(f"\nInterpreting best model for {target}: {best_model_name} (Accuracy: {best_model_accuracy:.4f})")

    # Load the best model
    if 'NeuralNetwork' in best_model_name:
        model_path = f"dlmodels/{target}_{best_model_name}.h5"
        best_model = tf.keras.models.load_model(model_path)

        # For neural networks, we'll use a different explainer
        # Sample data for faster computation
        X_sample = X_processed[:100] if X_processed.shape[0] > 100 else X_processed

        # Create a SHAP explainer
        explainer = shap.DeepExplainer(best_model, X_sample)
        shap_values = explainer.shap_values(X_sample)

        # Plot summary
        plt.figure()
        shap.summary_plot(shap_values, X_sample, feature_names=all_feature_names)
        plt.title(f'SHAP Summary for {target} ({best_model_name})')
        plt.show()

    else:
        if 'boost' in best_model_name.lower() or 'forest' in best_model_name.lower():
            model_path = f"mlmodels/{target}_{best_model_name}.pkl"
        else:
            model_path = f"simplemodels/{target}_{best_model_name}.pkl"

        import joblib
        best_model = joblib.load(model_path)

        # Create SHAP explainer
        X_sample = X_processed[:100] if X_processed.shape[0] > 100 else X_processed

        if hasattr(best_model, 'predict_proba'):
            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X_sample)

            # Plot summary
            plt.figure()
            shap.summary_plot(shap_values, X_sample, feature_names=all_feature_names)
            plt.title(f'SHAP Summary for {target} ({best_model_name})')
            plt.show()
        else:
            print(f"Cannot create SHAP explainer for {best_model_name}")

# Interpret best models for each target
for target, results in all_results.items():
    y_target = mur_df[target]
    interpret_best_model(target, results, X_processed, y_target)

# Final recommendations
print("\nFinal Recommendations:")
print("1. The best performing models have been saved in their respective folders (simplemodels/, mlmodels/, dlmodels/)")
print("2. SHAP analysis has been provided for model interpretability")
print("3. Consider feature engineering based on the correlation and SHAP analysis")
print("4. For deployment, use the best model for each target variable")
print("5. Monitor model performance over time as new data becomes available")

## Part 7: Learning Curves and Model Analysis

In [None]:
# Function to plot learning curves
def plot_learning_curve(model, X, y, model_name, target_name):
    """
    Plot learning curves for a model

    Args:
        model: The model to evaluate
        X: Features
        y: Target
        model_name: Name of the model
        target_name: Name of the target variable
    """
    from sklearn.model_selection import learning_curve

    # If y is categorical, encode it
    if y.dtype == 'object':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Create CV training and test scores for various training set sizes
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=5, scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 5))

    # Calculate mean and standard deviation for training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Calculate mean and standard deviation for test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot learning curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
    plt.plot(train_sizes, test_mean, 'o-', color='green', label='Cross-validation score')

    # Draw bands
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='green')

    # Create plot
    plt.title(f'Learning Curve for {model_name} ({target_name})')
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy Score')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

# Plot learning curves for best models
for target, results in all_results.items():
    if results:
        # Find best model by accuracy
        df_results = pd.DataFrame(results).T
        best_model_name = df_results['accuracy'].idxmax()

        # Load the best model
        if 'NeuralNetwork' in best_model_name:
            model_path = f"dlmodels/{target}_{best_model_name}.h5"
            best_model = tf.keras.models.load_model(model_path)
        elif 'boost' in best_model_name.lower() or 'forest' in best_model_name.lower():
            model_path = f"mlmodels/{target}_{best_model_name}.pkl"
            import joblib
            best_model = joblib.load(model_path)
        else:
            model_path = f"simplemodels/{target}_{best_model_name}.pkl"
            import joblib
            best_model = joblib.load(model_path)

        # Get target data
        y_target = mur_df[target]

        # Plot learning curve
        plot_learning_curve(best_model, X_processed, y_target, best_model_name, target)

Explanation and Next Steps
This comprehensive solution provides:

Data Loading and Preprocessing: Handles the complex relationships between different BIM elements (Murs, Sols, Poutres, Poteaux) and processes the French text data with special characters.

Feature Engineering: Creates relationship features between different BIM elements based on their intersections and cuts.

Exploratory Data Analysis: Includes correlation analysis and target distribution visualization.

Model Training: Evaluates multiple machine learning models (Logistic Regression, Random Forest, SVM, XGBoost, LightGBM) and a neural network for each target variable.

Model Interpretation: Uses SHAP values to explain model predictions and identify important features.

Model Saving: Saves the best models in appropriate folders based on their complexity (simplemodels/, mlmodels/, dlmodels/).

Learning Curves: Visualizes model performance with increasing training data size.

Next Steps:

Deploy the best models for each target variable in your BIM system.

Set up monitoring to track model performance over time.

Consider implementing an ensemble approach if prediction accuracy needs improvement.

Explore more sophisticated deep learning architectures if you have sufficient data.

Regularly update the models with new project data to maintain accuracy.

