In [2]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [3]:
base_dir = "../../raw_data/"


## Trying the Poteaux csv files

In [4]:
def import_csv(tableau, files):
    dfs = []

    for i in files:
        csv_path = os.path.join(base_dir, f"maquette{i}", f"{tableau}{i}.csv")
        if os.path.exists(csv_path):
            # Cherche la ligne d'en-tête réelle
            with open(csv_path, encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    if line.startswith("Id;"):
                        header_row = idx
                        break
            try:
                df = pd.read_csv(csv_path, sep=';', decimal=",", header=header_row)
                dfs.append(df)
                print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
                #print(df.head())
            except Exception as e:
                print(f"Erreur de parsing : {csv_path} -> {e}")
        else:
            print(f"Fichier non trouvé : {csv_path}")

    if dfs:
        dfs_concat = pd.concat(dfs, ignore_index=True)
        print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
    else:
        dfs_concat = pd.DataFrame()
        print("Aucun fichier murs.csv trouvé.")

    return dfs_concat


In [5]:
files = [1,2,3,4,5,6,11,12,13,14,15,16,17,18,19,20]
dfs_concat = import_csv('murs', files)

Chargé : ../../raw_data/maquette1/murs1.csv (1589 lignes, 146 colonnes)
Chargé : ../../raw_data/maquette2/murs2.csv (215 lignes, 149 colonnes)
Chargé : ../../raw_data/maquette3/murs3.csv (203 lignes, 143 colonnes)
Chargé : ../../raw_data/maquette4/murs4.csv (312 lignes, 96 colonnes)
Chargé : ../../raw_data/maquette5/murs5.csv (345 lignes, 94 colonnes)
Chargé : ../../raw_data/maquette6/murs6.csv (203 lignes, 91 colonnes)
Chargé : ../../raw_data/maquette11/murs11.csv (121 lignes, 115 colonnes)
Chargé : ../../raw_data/maquette12/murs12.csv (121 lignes, 118 colonnes)
Chargé : ../../raw_data/maquette13/murs13.csv (797 lignes, 104 colonnes)
Chargé : ../../raw_data/maquette14/murs14.csv (3518 lignes, 107 colonnes)
Chargé : ../../raw_data/maquette15/murs15.csv (1228 lignes, 113 colonnes)
Chargé : ../../raw_data/maquette16/murs16.csv (1088 lignes, 87 colonnes)
Chargé : ../../raw_data/maquette17/murs17.csv (84 lignes, 86 colonnes)
Chargé : ../../raw_data/maquette18/murs18.csv (1743 lignes, 91 co

### FEATURES SELECTION:

In [6]:
all_features = dfs_concat.columns.to_list()

In [7]:
percent_missing = dfs_concat.isnull().sum() * 100 / len(dfs_concat)

missing_value_df = pd.DataFrame({'column_name': dfs_concat.columns,
                                 'percent_missing': percent_missing})

In [8]:
feature_list = missing_value_df.sort_values('percent_missing',
                                            ascending=False)['column_name'].to_list()


In [9]:
dfs_concat.keys().to_list()

['Id',
 '011EC_Lot',
 '012EC_Ouvrage',
 '013EC_Localisation',
 '014EC_Mode Constructif',
 'Nom',
 'Hauteur',
 'Epaisseur',
 'AI',
 'AS',
 'Sols en intersection',
 'Sols coupés (u)',
 'Sols coupés (Ids)',
 'Sols coupants (u)',
 'Sols coupants (Ids)',
 'Sol au-dessus',
 'Sol en-dessous',
 'Fenêtres',
 'Portes',
 'Ouvertures',
 'Murs imbriqués',
 'Mur multicouche',
 'Mur empilé',
 'Profil modifié',
 'Image',
 'Catégorie',
 'Section',
 "Type prédéfini d'IFC",
 'Exporter au format IFC sous',
 'Exporter au format IFC',
 'IfcGUID',
 'A une association',
 "Enrobage d'armature - Autres faces",
 "Enrobage d'armature - Face intérieure",
 "Enrobage d'armature - Face extérieure",
 'Variantes',
 'Extension inférieure',
 'Extension supérieure',
 'Volume',
 'Surface',
 'Phase de démolition',
 'Phase de création',
 'Commentaires',
 'Longueur',
 'Famille et type',
 'Famille',
 'Type',
 'Nom de la famille',
 'Nom du type',
 'ID du type',
 'Lié au volume',
 'Structure',
 'Identifiant',
 'Ligne de justific

In [10]:
selected_features_list = [
    "011EC_Lot",
    "012EC_Ouvrage",
    "013EC_Localisation",
    "014EC_Mode Constructif",
    "Epaisseur",
    "Sols en intersection",
    "Sols coupés (u)",
    "Sols coupants (u)",
    "Sol au-dessus",
    "Sol en-dessous",
    "Fenêtres",
    "Portes",
    "Ouvertures",
    #"Murs imbriqués",
    "Mur multicouche",
    "Profil modifié",
    #"Extension inférieure",
    #"Extension supérieure",
    "Partie inférieure attachée",
    "Partie supérieure attachée",
    "Décalage supérieur",
    "Décalage inférieur",
    "Matériau structurel",
    "Famille et type"
]

targets = [
    "011ec_lot",
    "012ec_ouvrage",
    "013ec_localisation",
    "014ec_mode_constructif"
]

df_clean = dfs_concat[selected_features_list].dropna(axis=0, how='any')
df_clean.shape

(13508, 21)

In [11]:
#ignoring '011EC_Lot' for now / all GO
X = df_clean.drop(columns = ['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot'])
y = df_clean[['011EC_Lot']]#,'013EC_Localisation','014EC_Mode Constructif', '012EC_Ouvrage','011EC_Lot']]
X.shape, y.shape

((13508, 17), (13508, 1))

In [12]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())}')


the uniques for Epaisseur: 29
the uniques for Sols en intersection: 7
the uniques for Sols coupés (u): 2
the uniques for Sols coupants (u): 11
the uniques for Sol au-dessus: 2
the uniques for Sol en-dessous: 2
the uniques for Fenêtres: 13
the uniques for Portes: 11
the uniques for Ouvertures: 3
the uniques for Mur multicouche: 2
the uniques for Profil modifié: 2
the uniques for Partie inférieure attachée: 2
the uniques for Partie supérieure attachée: 2
the uniques for Décalage supérieur: 532
the uniques for Décalage inférieur: 343
the uniques for Matériau structurel: 25
the uniques for Famille et type: 143


In [13]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())} || They are: \n {X[feat].unique()}')


the uniques for Epaisseur: 29 || They are: 
 [0.2  0.3  0.16 0.18 0.25 0.15 0.4  0.35 0.47 0.67 0.17 1.01 0.61 1.28
 0.88 1.51 0.21 0.33 0.1  0.06 0.01 0.22 0.12 0.53 0.34 0.28 0.24 0.26
 0.5 ]
the uniques for Sols en intersection: 7 || They are: 
 [0 1 2 3 4 9 8]
the uniques for Sols coupés (u): 2 || They are: 
 [0 1]
the uniques for Sols coupants (u): 11 || They are: 
 [ 3  1  2  4  5  6  7  0  8 10  9]
the uniques for Sol au-dessus: 2 || They are: 
 [ True False]
the uniques for Sol en-dessous: 2 || They are: 
 [ True False]
the uniques for Fenêtres: 13 || They are: 
 [ 0  1 15 13  2  3  4  8  5 18  6 10  7]
the uniques for Portes: 11 || They are: 
 [ 0  1  2  9  8  3  4  7  6  5 24]
the uniques for Ouvertures: 3 || They are: 
 [0 1 2]
the uniques for Mur multicouche: 2 || They are: 
 [False  True]
the uniques for Profil modifié: 2 || They are: 
 [ True False]
the uniques for Partie inférieure attachée: 2 || They are: 
 [0 1]
the uniques for Partie supérieure attachée: 2 || They are

In [14]:
#Boolean to 0/1 for
bool_feats = ['Sol au-dessus', 'Sol en-dessous','Mur multicouche','Profil modifié']
for feat in bool_feats:
    X[feat] = X[feat].apply(lambda x: 1 if x == True else 0)

# Scaling X

In [15]:
#features to scale:
feats_to_scale = ['Epaisseur','Décalage supérieur','Décalage inférieur']

#minmax scaling X features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#std_scaler = StandardScaler()

for feat in feats_to_scale:
    minmax_scaler = MinMaxScaler()
    X[feat] = minmax_scaler.fit_transform(X[[feat]])

# Encode

In [16]:
from sklearn.preprocessing import OneHotEncoder

feats_to_encode = ['Matériau structurel', 'Famille et type']

for feat in feats_to_encode:
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(X[[feat]])
    X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
    X.drop(columns=[feat], inplace=True)


  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.t

# TARGET WORK

In [17]:
y.sample(3)

Unnamed: 0,011EC_Lot
9653,GO
12386,GO
139,GO


In [18]:
from sklearn.preprocessing import LabelEncoder

# Encode all columns in y to numerical values
y_encoded = y.copy()
label_encoders = {}

for col in y.columns:
    le = LabelEncoder()
    y_encoded[col] = le.fit_transform(y[col])
    label_encoders[col] = le

y_encoded.head()

Unnamed: 0,011EC_Lot
0,2
1,2
2,2
3,2
4,2


## DL MODELing


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded,
											test_size=0.3,
											random_state=5)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9455, 183), (4053, 183), (9455, 1), (4053, 1))

In [27]:
feat_len = len(y['011EC_Lot'].unique())

In [23]:
import tensorflow as tf
from tensorflow.keras import Sequential, Input, layers, optimizers, callbacks


2025-06-06 12:58:07.740439: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-06 12:58:07.832544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-06-06 12:58:07.832556: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-06-06 12:58:07.853138: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-06 12:58:08.443737: W tensorflow/stream_executor/platform/de

In [28]:

model = Sequential()
model.add(Input(shape=(183,)))
model.add(layers.Dense(1000, activation='relu'))
model.add(layers.Dense(500, activation='relu'))
model.add(layers.Dense(50, activation='relu'))
model.add(layers.Dense(feat_len, activation='Softmax'))

In [29]:
#### 2. COMPILATION
adam_opt = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=adam_opt,
              metrics=['accuracy'])

In [30]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 1000)              184000    
                                                                 
 dense_11 (Dense)            (None, 500)               500500    
                                                                 
 dense_12 (Dense)            (None, 50)                25050     
                                                                 
 dense_13 (Dense)            (None, 3)                 153       
                                                                 
Total params: 709,703
Trainable params: 709,703
Non-trainable params: 0
_________________________________________________________________


In [31]:
es = callbacks.EarlyStopping(patience=12, restore_best_weights=True)

model.fit(X_train, y_train,
          batch_size=16,
          epochs=80,
          validation_split=0.3,
          callbacks=[es],
          verbose=1)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80


<keras.callbacks.History at 0x749b542822c0>

In [32]:
y_pred = model.predict(X_test)



In [36]:
from keras.utils import to_categorical

y_test_cat = to_categorical(y_test['feat_len'], num_classes=3)

#y_pred_ready = pd.DataFrame(np.argmax(y_pred, axis=1))

KeyError: 'feat_len'

In [38]:
y_pred_ready = pd.DataFrame(np.argmax(y_pred, axis=1))

In [39]:
model.evaluate(X_test, y_test, verbose=1)



[0.004017095547169447, 0.9980261325836182]

In [506]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_ready))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.86      0.46      0.60       105
           2       0.00      0.00      0.00         3
           3       0.47      0.89      0.62         9
           4       0.87      0.74      0.80       283
           6       1.00      1.00      1.00       100
           7       0.60      0.18      0.27        17
           8       0.00      0.00      0.00         9
           9       0.39      1.00      0.56        11
          10       0.00      0.00      0.00         1
          11       0.76      0.94      0.84      1358
          12       0.00      0.00      0.00         6
          13       0.93      0.85      0.89      2032
          14       0.88      0.37      0.52        81
          15       0.93      0.81      0.87        32
          16       0.00      0.00      0.00         2

    accuracy                           0.85      4053
   macro avg       0.48   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
