In [1]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [2]:
base_dir = "../../raw_data/"


## Trying the Poteaux csv files

In [9]:
def import_csv(tableau, files):
    dfs = []

    for i in files:
        csv_path = os.path.join(base_dir, f"maquette{i}", f"{tableau}{i}.csv")
        if os.path.exists(csv_path):
            # Cherche la ligne d'en-tête réelle
            with open(csv_path, encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    if line.startswith("Id;"):
                        header_row = idx
                        break
            try:
                df = pd.read_csv(csv_path, sep=';', decimal=",", header=header_row)
                dfs.append(df)
                print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
                #print(df.head())
            except Exception as e:
                print(f"Erreur de parsing : {csv_path} -> {e}")
        else:
            print(f"Fichier non trouvé : {csv_path}")

    if dfs:
        dfs_concat = pd.concat(dfs, ignore_index=True)
        print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
    else:
        dfs_concat = pd.DataFrame()
        print("Aucun fichier murs.csv trouvé.")

    return dfs_concat


In [8]:
files = [1,2,3,4,5,6,11,12,13,14,15,16,17,18,19,20]
dfs_concat = import_csv('murs', files)

Chargé : ../../raw_data/maquette1/murs1.csv (1589 lignes, 146 colonnes)
Chargé : ../../raw_data/maquette2/murs2.csv (215 lignes, 149 colonnes)
Chargé : ../../raw_data/maquette3/murs3.csv (203 lignes, 143 colonnes)
Chargé : ../../raw_data/maquette4/murs4.csv (312 lignes, 96 colonnes)
Chargé : ../../raw_data/maquette5/murs5.csv (345 lignes, 94 colonnes)
Chargé : ../../raw_data/maquette6/murs6.csv (203 lignes, 91 colonnes)
Chargé : ../../raw_data/maquette11/murs11.csv (121 lignes, 115 colonnes)
Chargé : ../../raw_data/maquette12/murs12.csv (121 lignes, 118 colonnes)
Chargé : ../../raw_data/maquette13/murs13.csv (797 lignes, 104 colonnes)
Chargé : ../../raw_data/maquette14/murs14.csv (3518 lignes, 107 colonnes)
Chargé : ../../raw_data/maquette15/murs15.csv (1228 lignes, 113 colonnes)
Chargé : ../../raw_data/maquette16/murs16.csv (1088 lignes, 87 colonnes)
Chargé : ../../raw_data/maquette17/murs17.csv (84 lignes, 86 colonnes)
Chargé : ../../raw_data/maquette18/murs18.csv (1743 lignes, 91 co

### FEATURES SELECTION:

In [19]:
all_features = dfs_concat.columns.to_list()

In [27]:
percent_missing = dfs_concat.isnull().sum() * 100 / len(dfs_concat)

missing_value_df = pd.DataFrame({'column_name': dfs_concat.columns,
                                 'percent_missing': percent_missing})

In [32]:
feature_list = missing_value_df.sort_values('percent_missing',
                                            ascending=False)['column_name'].to_list()


In [86]:
selected_features_list = [
    "011EC_Lot",
    "012EC_Ouvrage",
    "013EC_Localisation",
    "014EC_Mode Constructif",
    "Epaisseur",
    "Sols en intersection",
    "Sols coupés (u)",
    "Sols coupants (u)",
    "Sol au-dessus",
    "Sol en-dessous",
    "Fenêtres",
    "Portes",
    "Ouvertures",
    #"Murs imbriqués",
    "Mur multicouche",
    "Profil modifié",
    #"Extension inférieure",
    #"Extension supérieure",
    "Partie inférieure attachée",
    "Partie supérieure attachée",
    "Décalage supérieur",
    "Décalage inférieur",
    "Matériau structurel",
    "Famille et type"
]

targets = [
    "011ec_lot",
    "012ec_ouvrage",
    "013ec_localisation",
    "014ec_mode_constructif"
]

df_clean = dfs_concat[selected_features_list].dropna(axis=0, how='any')
df_clean.shape

(13508, 21)

In [114]:
#ignoring '011EC_Lot' for now / all GO
X = df_clean.drop(columns = ['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot'])
y = df_clean[['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot']]
y.shape, X.shape

((13508, 4), (13508, 17))

In [115]:
X.head(5)

Unnamed: 0,Epaisseur,Sols en intersection,Sols coupés (u),Sols coupants (u),Sol au-dessus,Sol en-dessous,Fenêtres,Portes,Ouvertures,Mur multicouche,Profil modifié,Partie inférieure attachée,Partie supérieure attachée,Décalage supérieur,Décalage inférieur,Matériau structurel,Famille et type
0,0.2,0,0,3,True,True,0,0,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
1,0.2,0,0,1,False,True,0,0,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
2,0.2,0,0,3,True,True,0,1,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
3,0.2,0,0,3,True,True,0,1,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20
4,0.2,0,0,3,False,True,0,0,0,False,True,0,0,0.0,0.0,ECSA - Béton Voiles,Mur de base: Voile BA 20


In [116]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())}')


the uniques for Epaisseur: 29
the uniques for Sols en intersection: 7
the uniques for Sols coupés (u): 2
the uniques for Sols coupants (u): 11
the uniques for Sol au-dessus: 2
the uniques for Sol en-dessous: 2
the uniques for Fenêtres: 13
the uniques for Portes: 11
the uniques for Ouvertures: 3
the uniques for Mur multicouche: 2
the uniques for Profil modifié: 2
the uniques for Partie inférieure attachée: 2
the uniques for Partie supérieure attachée: 2
the uniques for Décalage supérieur: 532
the uniques for Décalage inférieur: 343
the uniques for Matériau structurel: 25
the uniques for Famille et type: 143


In [117]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())} || They are: \n {X[feat].unique()}')


the uniques for Epaisseur: 29 || They are: 
 [0.2  0.3  0.16 0.18 0.25 0.15 0.4  0.35 0.47 0.67 0.17 1.01 0.61 1.28
 0.88 1.51 0.21 0.33 0.1  0.06 0.01 0.22 0.12 0.53 0.34 0.28 0.24 0.26
 0.5 ]
the uniques for Sols en intersection: 7 || They are: 
 [0 1 2 3 4 9 8]
the uniques for Sols coupés (u): 2 || They are: 
 [0 1]
the uniques for Sols coupants (u): 11 || They are: 
 [ 3  1  2  4  5  6  7  0  8 10  9]
the uniques for Sol au-dessus: 2 || They are: 
 [ True False]
the uniques for Sol en-dessous: 2 || They are: 
 [ True False]
the uniques for Fenêtres: 13 || They are: 
 [ 0  1 15 13  2  3  4  8  5 18  6 10  7]
the uniques for Portes: 11 || They are: 
 [ 0  1  2  9  8  3  4  7  6  5 24]
the uniques for Ouvertures: 3 || They are: 
 [0 1 2]
the uniques for Mur multicouche: 2 || They are: 
 [False  True]
the uniques for Profil modifié: 2 || They are: 
 [ True False]
the uniques for Partie inférieure attachée: 2 || They are: 
 [0 1]
the uniques for Partie supérieure attachée: 2 || They are

In [118]:
#Boolean to 0/1 for
bool_feats = ['Sol au-dessus', 'Sol en-dessous','Mur multicouche','Profil modifié']
for feat in bool_feats:
    X[feat] = X[feat].apply(lambda x: 1 if x == True else 0)

# Scaling X

In [119]:
#features to scale:
feats_to_scale = ['Epaisseur','Décalage supérieur','Décalage inférieur']

#minmax scaling X features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

#std_scaler = StandardScaler()

for feat in feats_to_scale:
    minmax_scaler = MinMaxScaler()
    X[feat] = minmax_scaler.fit_transform(X[[feat]])

# Encode

In [120]:
from sklearn.preprocessing import OneHotEncoder

feats_to_encode = ['Matériau structurel', 'Famille et type']

for feat in feats_to_encode:
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(X[[feat]])
    X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
    X.drop(columns=[feat], inplace=True)


  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
  X[ohe.get_feature_names_out()] = ohe.t

# TARGET WORK

In [124]:
y.sample(3)

Unnamed: 0,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif,011EC_Lot
2279,MUR,BASSINS PERIPHERIES,BANCHE,GO
13303,MUR,EXTERIEUR,BANCHE,GO
7216,ACROTERE,EDICULE,PREMUR,GO


In [177]:
y_lot_labels = {}

for i, val in enumerate(y['011EC_Lot'].unique()):
    y_lot_labels[val] = i

y_lot = y['011EC_Lot'].apply(lambda x: y_lot_labels[x])

In [178]:
y_ouvrage_labels = {}

for i, val in enumerate(y['012EC_Ouvrage'].unique()):
    y_ouvrage_labels[val] = i

y_ouvrage = y['012EC_Ouvrage'].apply(lambda x: y_ouvrage_labels[x])

In [151]:
y_local_labels = {}

for i, val in enumerate(y['013EC_Localisation'].unique()):
    y_local_labels[val] = i

y_local = y['013EC_Localisation'].apply(lambda x: y_local_labels[x])

In [152]:
y_construct_labels = {}

for i, val in enumerate(y['014EC_Mode Constructif'].unique()):
    y_construct_labels[val] = i

y_construct = y['014EC_Mode Constructif'].apply(lambda x: y_construct_labels[x])

## MODELS
### for y_lot prediction

In [168]:
from sklearn.model_selection import train_test_split

# y_lot
X_train, X_test, y_lot_train, y_lot_test = train_test_split(X, y_lot,
											test_size=0.3,
											random_state=15)
X_train.shape, X_test.shape, y_lot_train.shape, y_lot_test.shape

((9455, 183), (4053, 183), (9455,), (4053,))

In [172]:
#model for lot / SVC
from sklearn import svm

svm_lot = svm.SVC(kernel='rbf')
svm_lot.fit(X_train, y_lot_train)

pred_lot = svm_lot.predict(X_test)


In [174]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(svm_lot, X_test, y_lot_test, cv=5)

cv_results['test_score'].mean()



0.9987669543773119

In [190]:
from sklearn.metrics import classification_report

y_classes = y['011EC_Lot'].unique().tolist()

print(classification_report(y_lot_test, pred_lot, target_names=y_classes))

              precision    recall  f1-score   support

          GO       1.00      1.00      1.00      3931
      FACADE       1.00      1.00      1.00       119
    EXISTANT       1.00      1.00      1.00         3

    accuracy                           1.00      4053
   macro avg       1.00      1.00      1.00      4053
weighted avg       1.00      1.00      1.00      4053



# Pridicting y_construct

In [234]:
X_train, X_test, y_construct_train, y_construct_test = train_test_split(X, y_construct,
											test_size=0.32,
											random_state=15)
X_train.shape, X_test.shape, y_construct_train.shape, y_construct_test.shape

((9185, 183), (4323, 183), (9185,), (4323,))

In [235]:
svm_construct = svm.SVC(kernel='rbf')
svm_construct.fit(X_train, y_construct_train)

pred_construct = svm_construct.predict(X_test)

In [236]:
cv_results_construct = cross_validate(svm_construct, X_test, y_construct_test, cv=5)
cv_results_construct['test_score'].mean()



0.874390655105973

In [237]:
print(classification_report(y_construct_test, pred_construct))

              precision    recall  f1-score   support

           0       1.00      0.03      0.05        75
           1       0.87      0.98      0.92      3013
           2       0.99      0.87      0.93       308
           3       0.00      0.00      0.00         8
           4       0.89      0.59      0.71       635
           5       1.00      0.22      0.36         9
           6       1.00      1.00      1.00        17
           7       1.00      1.00      1.00       126
           8       1.00      0.55      0.71        29
           9       0.00      0.00      0.00         1
          10       0.67      0.42      0.52        71
          11       1.00      0.90      0.95        10
          12       0.00      0.00      0.00        12
          14       1.00      0.43      0.60         7
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1

    accuracy                           0.88      4323
   macro avg       0.65   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Pridicting y_local

In [228]:
X_train, X_test, y_local_train, y_local_test = train_test_split(X, y_local,
											test_size=0.33,
											random_state=15)
X_train.shape, X_test.shape, y_local_train.shape, y_local_test.shape

((9050, 183), (4458, 183), (9050,), (4458,))

In [229]:
svm_local = svm.SVC(kernel='rbf')
svm_local.fit(X_train, y_local_train)

pred_local = svm_local.predict(X_test)

In [230]:
cv_results_local = cross_validate(svm_local, X_test, y_local_test, cv=5)
cv_results_local['test_score'].mean()



0.8326649655498685

In [231]:
y_local_train.unique(), set(pred_local)

(array([ 0,  3,  1, 11, 14,  9,  2,  4,  5, 10, 15,  6, 13, 16,  7, 12,  8]),
 {0, 1, 2, 3, 4, 6, 7, 9, 11, 14})

In [232]:
print(classification_report(y_local_test, pred_local))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      2267
           1       0.83      0.81      0.82      1462
           2       0.80      0.46      0.59        84
           3       0.76      0.85      0.81       284
           4       0.72      0.70      0.71        37
           6       1.00      0.60      0.75        10
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         1
           9       0.82      0.42      0.55       134
          11       1.00      1.00      1.00       128
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00        14
          14       1.00      0.18      0.30        17
          15       0.00      0.00      0.00        13
          16       0.00      0.00      0.00         3

    accuracy                           0.85      4458
   macro avg       0.59      0.43      0.47      4458
weighted avg       0.84   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Pridicting y_ouvrage

In [213]:
X_train, X_test, y_ouvrage_train, y_ouvrage_test = train_test_split(X, y_ouvrage,
											test_size=0.33,
											random_state=5)
X_train.shape, X_test.shape, y_ouvrage_train.shape, y_ouvrage_test.shape

((9050, 183), (4458, 183), (9050,), (4458,))

In [214]:
#model for Ouvrage / SVC

svm_ouvrage = svm.SVC(kernel='rbf')
svm_ouvrage.fit(X_train, y_ouvrage_train)

pred_ouvrage = svm_ouvrage.predict(X_test)

In [210]:
cv_results_ouvrage = cross_validate(svm_ouvrage, X_test, y_ouvrage_test, cv=5)
cv_results_ouvrage['test_score'].mean()



0.9338764823187347

In [216]:
set(pred_ouvrage)

{0, 2, 3, 4, 6, 11, 14}

In [217]:
print(classification_report(y_ouvrage_test, pred_ouvrage))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      3951
           1       0.00      0.00      0.00        11
           2       1.00      0.71      0.83         7
           3       1.00      0.17      0.29       138
           4       0.95      0.47      0.63       177
           6       0.60      0.44      0.51        34
           7       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       1.00      1.00      1.00       116
          12       0.00      0.00      0.00         9
          13       0.00      0.00      0.00         6
          14       1.00      1.00      1.00         3

    accuracy                           0.94      4458
   macro avg       0.50      0.37      0.40      4458
weighted avg       0.93      0.94      0.92      4458



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
