In [1]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [2]:
base_dir = "../../raw_data/"


## Trying the Poteaux csv files

In [None]:
dfs = []
tableau = 'poteaux'
for i in range(1, 7):
    csv_path = os.path.join(base_dir, f"maquette{i}", f"{tableau}{i}.csv")
    if os.path.exists(csv_path):
        # Cherche la ligne d'en-tête réelle
        with open(csv_path, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if line.startswith("Id;"):
                    header_row = idx
                    break
        try:
            df = pd.read_csv(csv_path, sep=';', decimal=",", header=header_row)
            dfs.append(df)
            print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
            #print(df.head())
        except Exception as e:
            print(f"Erreur de parsing : {csv_path} -> {e}")
    else:
        print(f"Fichier non trouvé : {csv_path}")

if dfs:
    dfs_concat = pd.concat(dfs, ignore_index=True)
    print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
else:
    dfs_concat = pd.DataFrame()
    print("Aucun fichier murs.csv trouvé.")

dfs_concat.shape

Chargé : ../../raw_data/maquette1/poteaux1.csv (215 lignes, 110 colonnes)
Chargé : ../../raw_data/maquette2/poteaux2.csv (72 lignes, 111 colonnes)
Chargé : ../../raw_data/maquette3/poteaux3.csv (115 lignes, 113 colonnes)
Chargé : ../../raw_data/maquette4/poteaux4.csv (68 lignes, 87 colonnes)
Fichier non trouvé : ../../raw_data/maquette5/poteaux5.csv
Chargé : ../../raw_data/maquette6/poteaux6.csv (115 lignes, 83 colonnes)
Total concaténé : 2867 lignes, 160 colonnes


(585, 144)

In [30]:
dfs_concat

Unnamed: 0,Id,011EC_Lot,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif,Nom,Image,Style de poteau,Catégorie,Type prédéfini d'IFC,...,Batiment,NIVEAU_STRUCTURE,Nature_Ouvrage,Diamètre poteau,hauteur_section,largeur_section,Décalage de l'attachement à la base,Justification de l'attachement en bas,Décalage de l'attachement en haut,Justification de l'attachement en haut
0,846158,GO,POTEAU RECTANGULAIRE,,BANCHE,110x30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
1,846164,GO,POTEAU RECTANGULAIRE,,BANCHE,110x30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
2,846166,GO,POTEAU RECTANGULAIRE,,BANCHE,110x30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
3,846168,GO,POTEAU RECTANGULAIRE,,BANCHE,110x30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
4,846170,GO,POTEAU RECTANGULAIRE,,BANCHE,110x30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,385396,GO,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE,Ø 30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
581,386518,GO,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE,Ø 30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
582,386528,GO,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE,Ø 30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,
583,386542,GO,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE,Ø 30,<Aucun>,0,Poteaux porteurs,,...,,,,,,,,,,


### FEATURES SELECTION:

In [None]:

all_features = [
        "Id", "011EC_Lot", "012EC_Ouvrage", "013EC_Localisation", "014EC_Mode Constructif",
        "Nom", "AI", "AS", "Hauteur", "Longueur",
        "Partie inférieure attachée", "Partie supérieure attachée",
        "Sols en intersection", "Sols coupés (u)", "Sols coupés (Ids)",
        "Sols coupants (u)", "Sols coupants (Ids)", "Poutres en intersection",
        "Poutres coupés (u)", "Poutres coupés (Ids)", "Poutres coupants (u)",
        "Poutres coupants (Ids)", "Matériau structurel", "Marque d'emplacement du poteau",
        "Décalage supérieur", "Décalage inférieur",
        "Longueur", "Sols coupés (Ids)", "Sols coupants (Ids)",
        "Poutres coupés (Ids)", "Poutres coupants (Ids)",
]

df = dfs_concat[all_features]
df.shape

(585, 31)

In [None]:
uniq = []
for feature in all_features:
    feats = dfs_concat[feature].unique()
    uniq.append(f'{feature} : {len(feats)} +++ : {feats}' )


In [39]:
uniq

['Id : 470 +++ : [ 846158  846164  846166  846168  846170  846172  846174  846176  846178\n  846180  846182  846184  846186  846188  846190  846192  846194  846196\n  846198  846200  846202  846224  846226  846228  846230  846232  846234\n  846266 1165467 1165469 1165471 1165473 1165475 1165477 1165479 1165481\n 1165483 1165485 1165487 1165489 1165491 1165493 1165495 1165497 1165499\n 1165501 1165503 1165505 1165507 1165509 1165511 1165513 1165515 1165517\n 1165519 1165521 1165523 1165525 1165529 1165539 1165541 1165543 1165545\n 1165547 1165549 1165551 1165553 1238585 1238587 1238589 1238591 1238593\n 1238595 1238597 1238599 1238601 1238603 1238605 1238607 1238609 1238611\n 1238613 1238615 1238617 1238619 1238621 1238623 1238625 1238627 1238629\n 1238631 1238633 1238635 1238637 1238639 1238641 1238643 1238645 1238647\n 1238649 1238651 1238653 1238655 1400695 1401284 1401540 1401730 1473399\n 1473401 1473403 1473405 1473407 1473409 1473411 1473413 1473415 1473417\n 1473419 1473421 1473

In [48]:
percent_missing = dfs_concat[all_features].isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

In [55]:
feature_list = missing_value_df.sort_values('percent_missing', ascending=False)['column_name'].to_list()

In [84]:
selected_feature_list =  [ '013EC_Localisation',
                            'Décalage inférieur',
                            '012EC_Ouvrage',
                            '014EC_Mode Constructif',
                            'Matériau structurel',
                            'Décalage supérieur',
                            'Nom',
                            '011EC_Lot',
                            'Longueur',
                            'Famille et type'
                            ]
dfs_concat[selected_feature_list].sample(2)

Unnamed: 0,013EC_Localisation,Décalage inférieur,012EC_Ouvrage,014EC_Mode Constructif,Matériau structurel,Décalage supérieur,Nom,011EC_Lot,Longueur,Famille et type
286,INTERIEUR,0.0,POTEAU RECTANGULAIRE,BANCHE,EC - Béton,-0.3,25x60,GO,2.72,Poteau béton - Rectangulaire: 25x60
561,EXTERIEUR,0.0,POTEAU RECTANGULAIRE,CARTON,"Concrete, Cast-in-Place gray",0.0,400x400,GO,12.237533,EIF_STR - Poteau rect: 400x400


In [118]:
df_clean = dfs_concat[selected_feature_list].dropna(axis=0, how='any')
df_clean.shape

(519, 10)

In [142]:
#ignoring '011EC_Lot' for now / all GO
X = df_clean.drop(columns = ['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot'])
y = df_clean[['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif']]
y.shape, X.shape

((519, 3), (519, 6))

In [143]:
y

Unnamed: 0,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif
22,POTEAU RECTANGULAIRE,INTERIEUR,BANCHE
67,POTEAU RECTANGULAIRE,EXTERIEUR,PREFA CHANTIER
68,POTEAU RECTANGULAIRE,EXTERIEUR,PREFA CHANTIER
69,POTEAU RECTANGULAIRE,EXTERIEUR,PREFA CHANTIER
70,POTEAU RECTANGULAIRE,EXTERIEUR,PREFA CHANTIER
...,...,...,...
580,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE
581,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE
582,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE
583,POTEAU CIRCULAIRE,EXTERIEUR,BANCHE


In [144]:
from sklearn.preprocessing import OneHotEncoder

for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())}')


the uniques for Décalage inférieur: 13
the uniques for Matériau structurel: 6
the uniques for Décalage supérieur: 36
the uniques for Nom: 29
the uniques for Longueur: 62
the uniques for Famille et type: 30


In [145]:
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X[['Matériau structurel']])

In [146]:
ohe.categories_
ohe.get_feature_names_out()


array(['Matériau structurel_<Par catégorie>',
       'Matériau structurel_C35/45',
       'Matériau structurel_Concrete, Cast-in-Place gray',
       'Matériau structurel_EC - Béton',
       'Matériau structurel_ECSA - Béton Poteaux',
       'Matériau structurel_Pin'], dtype=object)

In [147]:
feats = ['Matériau structurel','Famille et type']
for feat in feats:
    print(f'Transforming {feat}: {len(X[feat].unique())} values')
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(X[[feat]])
    X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
    X.drop(columns=[feat], inplace=True)


Transforming Matériau structurel: 6 values
Transforming Famille et type: 30 values


In [148]:
X.shape

(519, 40)

In [149]:
y['012EC_Ouvrage'].unique()

array(['POTEAU RECTANGULAIRE', 'POTEAU PARTICULIER', 'POTEAU CIRCULAIRE',
       'MUR'], dtype=object)

In [150]:
labeler2 = {'POTEAU RECTANGULAIRE':0,
           'POTEAU PARTICULIER':1,
           'POTEAU CIRCULAIRE':2,
           'MUR':3,
}
y['ouvrage'] = y['012EC_Ouvrage'].apply(lambda x: labeler2[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['ouvrage'] = y['012EC_Ouvrage'].apply(lambda x: labeler2[x])


In [151]:
y['013EC_Localisation'].unique()

array(['INTERIEUR', 'EXTERIEUR', 'COURANT', 'BASSINS PERIPHERIES'],
      dtype=object)

In [152]:
labeler2 = {'INTERIEUR':0,
           'EXTERIEUR':1,
           'COURANT':2,
           'BASSINS PERIPHERIES':3,
}
y['local'] = y['013EC_Localisation'].apply(lambda x: labeler2[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['local'] = y['013EC_Localisation'].apply(lambda x: labeler2[x])


In [153]:
y['014EC_Mode Constructif'].unique()

array(['BANCHE', 'PREFA CHANTIER', 'PREFA USINE', 'CARTON'], dtype=object)

In [154]:
y['014EC_Mode Constructif'].unique()
labeler1 = {'BANCHE':0,
           'PREFA CHANTIER':1,
           'PREFA USINE':2,
           'CARTON':3,
       }
y['const'] = y['014EC_Mode Constructif'].apply(lambda x: labeler1[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['const'] = y['014EC_Mode Constructif'].apply(lambda x: labeler1[x])


In [139]:
y.sample(1)

Unnamed: 0,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif,local,Ouvrage,construc
515,POTEAU RECTANGULAIRE,INTERIEUR,CARTON,0,0,3


In [164]:
from sklearn.dummy import DummyClassifier


dummy_clf_local = DummyClassifier(strategy="stratified")
dummy_clf_local.fit(X, y.local)


In [165]:
set(dummy_clf_local.predict(X))


{0, 1, 2, 3}

In [None]:
dummy_clf_local.score(X, y.local)

0.49710982658959535

In [191]:
dummy_clf_construc = DummyClassifier(strategy="stratified")
dummy_clf_construc.fit(X, y.const)

In [192]:
set(dummy_clf_construc.predict(X))

{0, 1, 2, 3}

In [193]:
dummy_clf_construc.score(X, y.const)

0.35645472061657035

In [180]:
dummy_clf_ouvrage = DummyClassifier(strategy="stratified")
dummy_clf_ouvrage.fit(X, y.ouvrage)

In [187]:
y_pred = dummy_clf_ouvrage.predict(X)

In [189]:
dummy_clf_ouvrage.score(y_pred, y.const)

0.35260115606936415