In [9]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [10]:
base_dir = "../../raw_data/"


In [93]:
dfs = []
tableau = 'murs'
for i in range(1, 7):
    csv_path = os.path.join(base_dir, f"maquette{i}", f"{tableau}{i}.csv")
    if os.path.exists(csv_path):
        # Cherche la ligne d'en-tête réelle
        with open(csv_path, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if line.startswith("Id;"):
                    header_row = idx
                    break
        try:
            df = pd.read_csv(csv_path, sep=';', decimal=",", header=header_row)
            dfs.append(df)
            print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
            #print(df.head())
        except Exception as e:
            print(f"Erreur de parsing : {csv_path} -> {e}")
    else:
        print(f"Fichier non trouvé : {csv_path}")

if dfs:
    dfs_concat = pd.concat(dfs, ignore_index=True)
    print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
else:
    dfs_concat = pd.DataFrame()
    print("Aucun fichier murs.csv trouvé.")

dfs_concat.shape

Chargé : ../../raw_data/maquette1/murs1.csv (1589 lignes, 146 colonnes)
Chargé : ../../raw_data/maquette2/murs2.csv (215 lignes, 149 colonnes)
Chargé : ../../raw_data/maquette3/murs3.csv (203 lignes, 143 colonnes)
Chargé : ../../raw_data/maquette4/murs4.csv (312 lignes, 96 colonnes)
Chargé : ../../raw_data/maquette5/murs5.csv (345 lignes, 94 colonnes)
Chargé : ../../raw_data/maquette6/murs6.csv (203 lignes, 91 colonnes)
Total concaténé : 2867 lignes, 160 colonnes


(2867, 160)

In [None]:
colonnes_a_garder = [
    "Epaisseur",
    "Sols en intersection",
    "Sols coupés (u)",
    "Sols coupants (u)",
    "Sol au-dessus",
    "Sol en-dessous",
    "Fenêtres",
    "Portes",
    "Ouvertures",
    "Murs imbriqués",
    "Mur multicouche",
    "Profil modifié",
    "Extension inférieure",
    "Extension supérieure",
    "Partie inférieure attachée",
    "Partie supérieure attachée",
    "Décalage supérieur",
    "Décalage inférieur",
    "Matériau structurel"
]
target_features = ['011EC_Lot',
                   '012EC_Ouvrage',
                   "013EC_Localisation",
                   "014EC_Mode Constructif"]

#other exploratory stuff
to_keep = ['Hauteur','Epaisseur','Sol au-dessus','Sol en-dessous','Fenêtres','Portes','Ouvertures']
to_exp =  ['Volume', 'Surface','Hauteur', 'Longueur','Largeur']
family =  ['Famille et type', 'Famille']

In [122]:
X = dfs_concat[colonnes_a_garder + target_features]

In [123]:
X.dropna(axis=0, how = 'any', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(axis=0, how = 'any', inplace=True)


In [124]:
y = X[target_features]

In [125]:
X = X[colonnes_a_garder]
X.shape

(2865, 19)

In [128]:
X.sample(2)

Unnamed: 0,Epaisseur,Sols en intersection,Sols coupés (u),Sols coupants (u),Sol au-dessus,Sol en-dessous,Fenêtres,Portes,Ouvertures,Murs imbriqués,Mur multicouche,Profil modifié,Extension inférieure,Extension supérieure,Partie inférieure attachée,Partie supérieure attachée,Décalage supérieur,Décalage inférieur,Matériau structurel
2632,0.18,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,-0.2,0.0,<Par catégorie>
632,0.2,0,0,2,1,1,0,0,0,0,0,1,0,0,0,0,0.0,0.0,ECSA - Béton Voiles


### OHE for materials

In [127]:
X['Sol au-dessus'] = X['Sol au-dessus'].apply(lambda x: 0 if x == False else 1)
X['Sol en-dessous'] = X['Sol en-dessous'].apply(lambda x: 0 if x == False else 1)
X['Mur multicouche'] = X['Mur multicouche'].apply(lambda x: 0 if x == False else 1)
X['Profil modifié'] = X['Profil modifié'].apply(lambda x: 0 if x == False else 1)

In [129]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X[['Matériau structurel']])

In [130]:
ohe.categories_ , ohe.get_feature_names_out()

([array(['<Par catégorie>', 'Blocs béton manufacturés', 'C25/30',
         'EC - Béton', 'ECSA - Béton Voiles', 'Maçonnerie',
         'Maçonnerie - Voile BA'], dtype=object)],
 array(['Matériau structurel_<Par catégorie>',
        'Matériau structurel_Blocs béton manufacturés',
        'Matériau structurel_C25/30', 'Matériau structurel_EC - Béton',
        'Matériau structurel_ECSA - Béton Voiles',
        'Matériau structurel_Maçonnerie',
        'Matériau structurel_Maçonnerie - Voile BA'], dtype=object))

In [131]:
X[ohe.get_feature_names_out()] = ohe.transform(X[['Matériau structurel']])
X.drop(columns=['Matériau structurel'], inplace=True)

In [132]:
from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
X_scaled = minmax_scaler.fit_transform(X)

In [133]:
#no need for 011EC_Lot right now (all is GO)

y.drop(columns='011EC_Lot', inplace=True)

In [136]:
y['014EC_Mode Constructif'].unique()

array(['POUTRE VOILE', 'BANCHE', 'MACONNERIE', 'COFFRE', 'PREMUR',
       'COULE EN PLACE', 'PREFA CHANTIER'], dtype=object)

In [138]:
y_dict = {'POUTRE VOILE':0,
          'BANCHE':1,
          'MACONNERIE':2,
          'COFFRE':3,
          'PREMUR':4,
       'COULE EN PLACE':5,
       'PREFA CHANTIER':6
       }

y_labeled = y['014EC_Mode Constructif'].apply(lambda x: y_dict[x])
y_labeled

0       0
1       0
2       1
3       1
4       0
       ..
2862    1
2863    1
2864    4
2865    1
2866    4
Name: 014EC_Mode Constructif, Length: 2865, dtype: int64

In [139]:
# Train-Test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_labeled, test_size = 0.30)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2005, 25), (860, 25), (2005,), (860,))

In [140]:
from sklearn.svm import SVC
svc = SVC(kernel='linear', C=10)


In [141]:
svc.fit(X_train,y_train)

In [143]:
y_pred = svc.predict(X_test)

In [144]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(svc, X_test, y_test, cv=5)

In [146]:
cv_results['test_score'].mean()

0.8697674418604653

In [148]:
# equivalent but with SGD solver
from sklearn.linear_model import SGDClassifier

svc_bis = SGDClassifier(loss='hinge', penalty='l2', alpha=1/10)

svc_bis.fit(X_train,y_train)

In [149]:
y_pred2 = svc_bis.predict(X_test)

In [151]:
cv_results2 = cross_validate(svc_bis, X_test, y_test, cv=5)
cv_results2['test_score'].mean()

0.7151162790697675