In [1]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [2]:
base_dir = "../../raw_data/"


## Trying the Poteaux csv files

In [3]:
def import_csv(tableau, files):
    dfs = []

    for i in files:
        csv_path = os.path.join(base_dir, f"maquette{i}", f"{tableau}{i}.csv")
        if os.path.exists(csv_path):
            # Cherche la ligne d'en-tête réelle
            with open(csv_path, encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    if line.startswith("Id;"):
                        header_row = idx
                        break
            try:
                df = pd.read_csv(csv_path, sep=';', decimal=",", header=header_row)
                dfs.append(df)
                print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
                #print(df.head())
            except Exception as e:
                print(f"Erreur de parsing : {csv_path} -> {e}")
        else:
            print(f"Fichier non trouvé : {csv_path}")

    if dfs:
        dfs_concat = pd.concat(dfs, ignore_index=True)
        print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
    else:
        dfs_concat = pd.DataFrame()
        print("Aucun fichier murs.csv trouvé.")

    return dfs_concat


In [4]:
files = [1,2,3,4,5,6,11,12,13,14,15,16,17,18,19,20]
dfs_concat = import_csv('murs', files)

Chargé : ../../raw_data/maquette1/murs1.csv (1589 lignes, 146 colonnes)
Chargé : ../../raw_data/maquette2/murs2.csv (215 lignes, 149 colonnes)
Chargé : ../../raw_data/maquette3/murs3.csv (203 lignes, 143 colonnes)
Chargé : ../../raw_data/maquette4/murs4.csv (312 lignes, 96 colonnes)
Chargé : ../../raw_data/maquette5/murs5.csv (345 lignes, 94 colonnes)
Chargé : ../../raw_data/maquette6/murs6.csv (203 lignes, 91 colonnes)
Chargé : ../../raw_data/maquette11/murs11.csv (121 lignes, 115 colonnes)
Chargé : ../../raw_data/maquette12/murs12.csv (121 lignes, 118 colonnes)
Chargé : ../../raw_data/maquette13/murs13.csv (797 lignes, 104 colonnes)
Chargé : ../../raw_data/maquette14/murs14.csv (3518 lignes, 107 colonnes)
Chargé : ../../raw_data/maquette15/murs15.csv (1228 lignes, 113 colonnes)
Chargé : ../../raw_data/maquette16/murs16.csv (1088 lignes, 87 colonnes)
Chargé : ../../raw_data/maquette17/murs17.csv (84 lignes, 86 colonnes)
Chargé : ../../raw_data/maquette18/murs18.csv (1743 lignes, 91 co

### FEATURES SELECTION:

In [5]:
all_features = dfs_concat.columns.to_list()

In [6]:
percent_missing = dfs_concat.isnull().sum() * 100 / len(dfs_concat)

missing_value_df = pd.DataFrame({'column_name': dfs_concat.columns,
                                 'percent_missing': percent_missing})

In [7]:
feature_list = missing_value_df.sort_values('percent_missing',
                                            ascending=False)['column_name'].to_list()


In [8]:
selected_features_list = [
    "011EC_Lot",
    "012EC_Ouvrage",
    "013EC_Localisation",
    "014EC_Mode Constructif",
    "Epaisseur",
    "Sols en intersection",
    "Sols coupés (u)",
    "Sols coupants (u)",
    "Sol au-dessus",
    "Sol en-dessous",
    "Fenêtres",
    "Portes",
    "Ouvertures",
    #"Murs imbriqués",
    "Mur multicouche",
    "Profil modifié",
    #"Extension inférieure",
    #"Extension supérieure",
    "Partie inférieure attachée",
    "Partie supérieure attachée",
    "Décalage supérieur",
    "Décalage inférieur",
    "Matériau structurel",
    "Famille et type"
]

targets = [
    "011ec_lot",
    "012ec_ouvrage",
    "013ec_localisation",
    "014ec_mode_constructif"
]

df_clean = dfs_concat[selected_features_list].dropna(axis=0, how='any')
df_clean.shape

(13508, 21)

In [224]:
#ignoring '011EC_Lot' for now / all GO
X = df_clean.drop(columns = ['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot'])
y = df_clean[['012EC_Ouvrage']]#,'013EC_Localisation','014EC_Mode Constructif','011EC_Lot']]
y.shape, X.shape

((13508, 1), (13508, 17))

In [225]:
X.head(5)

Unnamed: 0,Epaisseur,Sols en intersection,Sols coupés (u),Sols coupants (u),Sol au-dessus,Sol en-dessous,Fenêtres,Portes,Ouvertures,Mur multicouche,Profil modifié,Partie inférieure attachée,Partie supérieure attachée,Décalage supérieur,Décalage inférieur,Matériau structurel,Famille et type
0,0.2,0,0,3,True,True,0,0,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
1,0.2,0,0,1,False,True,0,0,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
2,0.2,0,0,3,True,True,0,1,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
3,0.2,0,0,3,True,True,0,1,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
4,0.2,0,0,3,False,True,0,0,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20


In [226]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())}')


the uniques for Epaisseur: 29
the uniques for Sols en intersection: 7
the uniques for Sols coupés (u): 2
the uniques for Sols coupants (u): 11
the uniques for Sol au-dessus: 2
the uniques for Sol en-dessous: 2
the uniques for Fenêtres: 13
the uniques for Portes: 11
the uniques for Ouvertures: 3
the uniques for Mur multicouche: 2
the uniques for Profil modifié: 2
the uniques for Partie inférieure attachée: 2
the uniques for Partie supérieure attachée: 2
the uniques for Décalage supérieur: 532
the uniques for Décalage inférieur: 343
the uniques for Matériau structurel: 25
the uniques for Famille et type: 143


In [227]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())} || They are: \n {X[feat].unique()}')


the uniques for Epaisseur: 29 || They are: 
 [0.2  0.3  0.16 0.18 0.25 0.15 0.4  0.35 0.47 0.67 0.17 1.01 0.61 1.28
 0.88 1.51 0.21 0.33 0.1  0.06 0.01 0.22 0.12 0.53 0.34 0.28 0.24 0.26
 0.5 ]
the uniques for Sols en intersection: 7 || They are: 
 [0 1 2 3 4 9 8]
the uniques for Sols coupés (u): 2 || They are: 
 [0 1]
the uniques for Sols coupants (u): 11 || They are: 
 [ 3  1  2  4  5  6  7  0  8 10  9]
the uniques for Sol au-dessus: 2 || They are: 
 [ True False]
the uniques for Sol en-dessous: 2 || They are: 
 [ True False]
the uniques for Fenêtres: 13 || They are: 
 [ 0  1 15 13  2  3  4  8  5 18  6 10  7]
the uniques for Portes: 11 || They are: 
 [ 0  1  2  9  8  3  4  7  6  5 24]
the uniques for Ouvertures: 3 || They are: 
 [0 1 2]
the uniques for Mur multicouche: 2 || They are: 
 [False  True]
the uniques for Profil modifié: 2 || They are: 
 [ True False]
the uniques for Partie inférieure attachée: 2 || They are: 
 [0 1]
the uniques for Partie supérieure attachée: 2 || They are

In [228]:
#Boolean to 0/1 for
bool_feats = ['Sol au-dessus', 'Sol en-dessous','Mur multicouche','Profil modifié']
for feat in bool_feats:
    X[feat] = X[feat].apply(lambda x: 1 if x == True else 0)

# Scaling X

In [229]:
#features to scale:
feats_to_scale = ['Epaisseur','Décalage supérieur','Décalage inférieur']

#minmax scaling X features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#std_scaler = StandardScaler()

for feat in feats_to_scale:
    minmax_scaler = MinMaxScaler()
    X[feat] = minmax_scaler.fit_transform(X[[feat]])

# Encode

In [230]:
from sklearn.preprocessing import OneHotEncoder

feats_to_encode = ['Matériau structurel', 'Famille et type']

for feat in feats_to_encode:
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(X[[feat]])
    X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
    X.drop(columns=[feat], inplace=True)


  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.t

# TARGET WORK

In [231]:
y.sample(3)

Unnamed: 0,012EC_Ouvrage
2354,MUR
8182,MUR
2930,MUR


In [232]:
from sklearn.preprocessing import LabelEncoder

# Encode all columns in y to numerical values
y_encoded = y.copy()
label_encoders = {}

for col in y.columns:
    le = LabelEncoder()
    y_encoded[col] = le.fit_transform(y[col])
    label_encoders[col] = le

y_encoded.head()

Unnamed: 0,012EC_Ouvrage
0,8
1,8
2,8
3,8
4,8


In [None]:
#testing inverse transfor on target classes

y_encoded1 = y.copy()
y1_le = LabelEncoder()
y_encoded1['012EC_Ouvrage'] = y1_le.fit_transform(y['012EC_Ouvrage'])
#y_encoded1['012EC_Ouvrage'] = le

In [29]:
y_encoded1['012EC_Ouvrage'] = y1_le.inverse_transform(y_encoded1['012EC_Ouvrage'])

## DL MODELing


In [233]:
from sklearn.model_selection import train_test_split

In [234]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded,
											test_size=0.3,
											random_state=5)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9455, 183), (4053, 183), (9455, 1), (4053, 1))

In [244]:
len(y['012EC_Ouvrage'].unique())

15

In [69]:
import tensorflow as tf
from tensorflow.keras import Sequential, Input, layers, optimizers, callbacks


In [None]:
model = Sequential()
model.add(Input(shape=(183,)))
model.add(layers.Dense(500, activation='relu'))
model.add(layers.Dense(200, activation='relu'))
model.add(layers.Dense(15, activation='Softmax'))


In [246]:
#### 2. COMPILATION
adam_opt = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=adam_opt,
              metrics=['accuracy'])

In [247]:
model.summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_45 (Dense)            (None, 500)               92000     
                                                                 
 dense_46 (Dense)            (None, 200)               100200    
                                                                 
 dense_47 (Dense)            (None, 15)                3015      
                                                                 
Total params: 195,215
Trainable params: 195,215
Non-trainable params: 0
_________________________________________________________________


In [248]:
es = callbacks.EarlyStopping(patience=10, restore_best_weights=True)

model.fit(X_train, y_train,
          batch_size=16,
          epochs=20,
          validation_split=0.3,
          callbacks=[es],
          verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


<keras.callbacks.History at 0x7fed4866f9d0>

In [287]:
y_pred = model.predict(X_test).shape



In [296]:
y_pred

(4053, 15)

In [292]:
y_train.sample(1)

Unnamed: 0,012EC_Ouvrage
13005,8


In [283]:
y_to_comp = pd.DataFrame(y_test, index = range(9562,9563))
X_to_pred = pd.DataFrame(X_test, index = range(9562,9563))
y_to_comp

Unnamed: 0,012EC_Ouvrage
9562,5


In [None]:
round(pd.DataFrame(model.predict(X_to_pred)),1)



TypeError: type numpy.ndarray doesn't define __round__ method

In [294]:
from sklearn.metrics import classification_report


print(classification_report(y_test, y_pred))


ValueError: Found input variables with inconsistent numbers of samples: [4053, 2]