# Bim_Predict NoteBook

## Importing Libraries

In [1]:
# Import libraries
import os
import pandas as pd

# Define project folder paths
# Data directories
BASE_DIR = "../../"
DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DATA_DIR = os.path.join(DATA_DIR, "raw_data")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed_data")
PREDICTED_DATA_DIR = os.path.join(DATA_DIR, "predicting_data")
TESTING_DATA_DIR = os.path.join(DATA_DIR, "testing_data")

# Model directories
MODELS_DIR = os.path.join(BASE_DIR, "models")
ML_MODELS_DIR = os.path.join(MODELS_DIR, "SK/machine_learning")
DL_MODELS_DIR = os.path.join(MODELS_DIR, "SK/deep_learning")
OTHER_MODELS_DIR = os.path.join(MODELS_DIR, "SK/other")

# Python modules and plots directories
PYTHON_MODULES_DIR = os.path.join(BASE_DIR, "python_modules")
PLOTS_DIR = os.path.join(BASE_DIR, "plots")

# List of directories to create
directories = [
    RAW_DATA_DIR, PROCESSED_DATA_DIR, PREDICTED_DATA_DIR,
    MODELS_DIR, ML_MODELS_DIR, DL_MODELS_DIR, OTHER_MODELS_DIR,
    PYTHON_MODULES_DIR, PLOTS_DIR
]

# Create directories if they don't exist
for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")
    else:
        print(f"Directory already exists: {directory}")

Directory already exists: ../../data/raw_data
Directory already exists: ../../data/processed_data
Directory already exists: ../../data/predicting_data
Directory already exists: ../../models
Directory already exists: ../../models/SK/machine_learning
Directory already exists: ../../models/SK/deep_learning
Directory already exists: ../../models/SK/other
Directory already exists: ../../python_modules
Directory already exists: ../../plots


<!-- ### Paths Creating && Data Importing -->

In [2]:
import os
import pandas as pd

# List all Excel files in RAW_DATA_DIR
excel_files = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith(".xlsx") or f.endswith(".xls")]

# Dictionary to store DataFrames for each file and sheet
dataframes = {}

# Process each Excel file
for file in excel_files:
    file_path = os.path.join(RAW_DATA_DIR, file)
    print(f"Loading: {file_path}")

    try:
        # Load Excel file
        excel_data = pd.ExcelFile(file_path)

        # Load all sheets dynamically
        for sheet_name in excel_data.sheet_names:
            df = excel_data.parse(sheet_name)

            # Save DataFrame with a unique identifier
            dataframes[f"{file}_{sheet_name}"] = df

    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Display summary of loaded data
print(f"\nTotal files processed: {len(dataframes)}")
for key, df in dataframes.items():
    print(f"Loaded DataFrame: {key}, Shape: {df.shape}")

Loading: ../../data/raw_data/maquette_23017.xlsx
Loading: ../../data/raw_data/maquette_23016.xlsx
Loading: ../../data/raw_data/maquette_23002.xlsx
Loading: ../../data/raw_data/maquette_23007.xlsx
Loading: ../../data/raw_data/RawData-Cibles.xlsx
Loading: ../../data/raw_data/maquette_23001.xlsx

Total files processed: 23
Loaded DataFrame: maquette_23017.xlsx_Murs, Shape: (215, 149)
Loaded DataFrame: maquette_23017.xlsx_Sols, Shape: (29, 140)
Loaded DataFrame: maquette_23017.xlsx_Poutres, Shape: (152, 136)
Loaded DataFrame: maquette_23017.xlsx_Poteaux, Shape: (72, 111)
Loaded DataFrame: maquette_23016.xlsx_Murs, Shape: (1589, 146)
Loaded DataFrame: maquette_23016.xlsx_Sols, Shape: (45, 142)
Loaded DataFrame: maquette_23016.xlsx_Poutres, Shape: (778, 136)
Loaded DataFrame: maquette_23016.xlsx_Poteaux, Shape: (215, 110)
Loaded DataFrame: maquette_23002.xlsx_Murs, Shape: (345, 94)
Loaded DataFrame: maquette_23002.xlsx_Sols, Shape: (32, 91)
Loaded DataFrame: maquette_23002.xlsx_Poutres, Shape

<!-- ### Data Cleaning && PreProcessing -->

## PreProcessing Data

In [3]:
# # Define required columns dynamically
# required_columns = {
#     "Murs": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Hauteur",
#              "Epaisseur", "AI", "AS", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
#              "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol au-dessous", "Fenêtres", "Portes",
#              "Ouvertures", "Murs imbriqués", "Mur multicouche", "Mur empilé", "Profil modifié", "Extension inférieure",
#              "Extension supérieure", "Partie inférieure attachée", "Partie supérieure attachée", "Décalage supérieur",
#              "Décalage inférieur", "Matériau structurel"],

#     "Sols": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "Murs en intersection",
#              "Murs coupés (u)", "Murs coupés (Ids)", "Murs coupants (u)", "Murs coupants (Ids)", "Poutres en intersection",
#              "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)",
#              "Poteaux en intersection", "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)",
#              "Poteaux coupants (Ids)", "Ouvertures", "Sol multicouche", "Profil modifié", "Décalage par rapport au niveau",
#              "Epaisseur", "Lié au volume", "Etude de l'élévation à la base", "Etude de l'élévation en haut",
#              "Epaisseur du porteur", "Elévation au niveau du noyau inférieur", "Elévation au niveau du noyau supérieur",
#              "Elévation en haut", "Elévation à la base", "Matériau structurel"],

#     "Poutres": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS",
#                 "Hauteur totale", "Hauteur", "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
#                 "Sols coupants (u)", "Sols coupants (Ids)", "Sol au-dessus", "Sol au-dessous", "Poteaux en intersection",
#                 "Poteaux coupés (u)", "Poteaux coupés (Ids)", "Poteaux coupants (u)", "Poteaux coupants (Ids)",
#                 "Etat de la jonction", "Valeur de décalage Z", "Justification Z", "Valeur de décalage Y", "Justification Y",
#                 "Justification YZ", "Matériau structurel", "Elévation du niveau de référence", "Elévation en haut",
#                 "Rotation de la section", "Orientation", "Décalage du niveau d'arrivée", "Décalage du niveau de départ",
#                 "Elévation à la base", "Longueur de coupe", "Longueur", "hauteur_section", "largeur_section"],

#     "Poteaux": ["Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif", "AI", "AS",
#                 "Hauteur", "Longueur", "Partie inférieure attachée", "Partie supérieure attachée", "Sols en intersection",
#                 "Sols coupés (u)", "Sols coupés (Ids)", "Sols coupants (u)", "Sols coupants (Ids)", "Poutres en intersection",
#                 "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)", "Poutres coupants (Ids)",
#                 "Matériau structurel", "Décalage supérieur", "Décalage inférieur", "Diamètre poteau", "h", "b",
#                 "hauteur_section", "largeur_section"]
# }

# # Initialize a dictionary to store filtered dataframes
# cleaned_dataframes = {}

# for df_name, df in dataframes.items():
#     print(f"\n🟢 Original shape of {df_name}: {df.shape}")

#     # Automatically detect the correct category for filtering
#     for category, columns in required_columns.items():
#         if category.lower() in df_name.lower():  # Match dynamically
#             try:
#                 filtered_df = df[columns]  # Keep only the required columns
#             except KeyError as e:
#                 missing_columns = set(columns) - set(df.columns)
#                 print(f"⚠️ Missing columns in {df_name}: {missing_columns}. Skipping this dataframe.")
#                 continue
#             cleaned_dataframes[df_name] = filtered_df
#             print(f"✅ Shape after filtering {df_name}: {filtered_df.shape}")
#             break  # Stop looping once the correct match is found
#     else:
#         print(f"⚠️ No matching category for {df_name}, skipping filtering.")

# # Add prefixes to column names based on the dataframe category and update index
# for name, df in cleaned_dataframes.items():
#     if "murs" in name.lower():
#         prefix = "murs_"
#     elif "sols" in name.lower():
#         prefix = "sols_"
#     elif "poutres" in name.lower():
#         prefix = "poutres_"
#     elif "poteaux" in name.lower():
#         prefix = "poteaux_"
#     else:
#         prefix = ""

#     # Rename columns with the prefix
#     df.rename(columns=lambda col: f"{prefix}{col}" if col.lower() != "id" else f"{prefix}id", inplace=True)

#     # Drop the existing index and set the prefixed ID column as the new index
#     id_column = f"{prefix}id"
#     if id_column in df.columns:
#         df.set_index(id_column, inplace=True)
#         print(f"✅ Set '{id_column}' as index for {name}.")
#     else:
#         print(f"⚠️ '{id_column}' column not found in {name}, skipping index setting.")

    # Update the cleaned_dataframes dictionary
    # cleaned_dataframes[df_name] = df

## Deep-Learning Section

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout

# # Define target columns
# TARGET_COLUMNS = ['011EC_Lot', '012EC_Ouvrage', '013EC_Localisation', '014EC_Mode Constructif']

# # Combine all dataframes into a single dataset
# combined_df = pd.concat(dataframes.values(), ignore_index=True)

# # Drop rows with missing target values
# combined_df = combined_df.dropna(subset=TARGET_COLUMNS)

# # Separate features and targets
# X = combined_df.drop(columns=TARGET_COLUMNS)
# y = combined_df[TARGET_COLUMNS]

# # Encode categorical target columns
# label_encoders = {}
# for col in TARGET_COLUMNS:
#     le = LabelEncoder()
#     y[col] = le.fit_transform(y[col])
#     label_encoders[col] = le

# # Standardize numerical features
# scaler = StandardScaler()
# X = scaler.fit_transform(X.select_dtypes(include=[np.number]))

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Build the deep learning model
# model = Sequential([
#     Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.3),
#     Dense(64, activation='relu'),
#     Dropout(0.3),
#     Dense(len(TARGET_COLUMNS), activation='softmax')
# ])

# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# # Convert target columns to numpy arrays for compatibility with the model
# y_train_arrays = [y_train[col].values for col in TARGET_COLUMNS]
# y_test_arrays = [y_test[col].values for col in TARGET_COLUMNS]

# # Train the model
# history = model.fit(
#     X_train,
#     y_train_arrays,
#     epochs=50,
#     batch_size=32,
#     validation_split=0.2
# )

# # Evaluate the model
# loss, accuracy = model.evaluate(X_test, [y_test[col] for col in TARGET_COLUMNS])
# print(f"Test Accuracy: {accuracy}")

i want a machine learning model that is able to detetct relation between all the feateaurs in any dataframe and these columns : TARGET_COLUMNS = ['011EC_Lot', '012EC_Ouvrage', '013EC_Localisation', '014EC_Mode Constructif']
the data is in french and might have integer and float and text maybe you could use a hybride or nlp or anything else but it should be able to predict and auto complet the missing values and even create the columns if not there ... if there are multiples functions seperat them in mutliple cells so if one of them break we are able to repair ... keep the code smart and refined and the show the model results and learning curves with the capacity to train on the data in raw_data and test its capacity at the end on the data in the testing data dir

i want every thing from A to Z  & and make every function in a different cell 

from preprocceing to the models to hyper params and visualing the models and testing on the testing data in test_dir

In [8]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

TARGET_COLUMNS = ['011EC_Lot', '012EC_Ouvrage', '013EC_Localisation', '014EC_Mode Constructif']

def ensure_target_columns(df, target_columns=TARGET_COLUMNS):
    """Ensure all target columns exist in the dataframe, create with NaN if missing."""
    for col in target_columns:
        if col not in df.columns:
            df[col] = np.nan
    return df

def preprocess_dataframe(df, target_columns=TARGET_COLUMNS, fit_encoders=None):
    """
    Preprocess a dataframe:
    - Ensures target columns exist
    - Imputes missing values
    - Encodes categorical/text features
    Returns: X, y, fitted_encoders
    """
    df = ensure_target_columns(df, target_columns)
    y = df[target_columns].copy()
    X = df.drop(columns=target_columns, errors='ignore').copy()

    # Separate types
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Impute numeric
    num_imputer = SimpleImputer(strategy='mean')
    X[num_cols] = num_imputer.fit_transform(X[num_cols])

    # Impute categorical
    cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    X[cat_cols] = cat_imputer.fit_transform(X[cat_cols])

    # Encode categorical/text columns
    encoders = fit_encoders or {}
    for col in cat_cols:
        # Use TF-IDF for long text, LabelEncoder for short categorical
        if X[col].astype(str).str.len().mean() > 20:
            # Text column
            if col not in encoders:
                encoders[col] = TfidfVectorizer(max_features=20)
                tfidf = encoders[col].fit_transform(X[col])
            else:
                tfidf = encoders[col].transform(X[col])
            tfidf_df = pd.DataFrame(tfidf.toarray(), columns=[f"{col}_tfidf_{i}" for i in range(tfidf.shape[1])])
            X = X.drop(columns=[col])
            X = pd.concat([X.reset_index(drop=True), tfidf_df], axis=1)
        else:
            # Categorical column
            if col not in encoders:
                encoders[col] = LabelEncoder()
                X[col] = encoders[col].fit_transform(X[col])
            else:
                X[col] = encoders[col].transform(X[col])
    return X, y, encoders
