In [1]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [2]:
base_dir = "../../raw_data/"


## Trying the Poteaux csv files

In [9]:
def import_csv(tableau, files):
    dfs = []

    for i in files:
        csv_path = os.path.join(base_dir, f"maquette{i}", f"{tableau}{i}.csv")
        if os.path.exists(csv_path):
            # Cherche la ligne d'en-tête réelle
            with open(csv_path, encoding="utf-8") as f:
                for idx, line in enumerate(f):
                    if line.startswith("Id;"):
                        header_row = idx
                        break
            try:
                df = pd.read_csv(csv_path, sep=';', decimal=",", header=header_row)
                dfs.append(df)
                print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
                #print(df.head())
            except Exception as e:
                print(f"Erreur de parsing : {csv_path} -> {e}")
        else:
            print(f"Fichier non trouvé : {csv_path}")

    if dfs:
        dfs_concat = pd.concat(dfs, ignore_index=True)
        print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
    else:
        dfs_concat = pd.DataFrame()
        print("Aucun fichier murs.csv trouvé.")

    return dfs_concat


In [8]:
files = [1,2,3,4,5,6,11,12,13,14,15,16,17,18,19,20]
dfs_concat = import_csv('murs', files)

Chargé : ../../raw_data/maquette1/murs1.csv (1589 lignes, 146 colonnes)
Chargé : ../../raw_data/maquette2/murs2.csv (215 lignes, 149 colonnes)
Chargé : ../../raw_data/maquette3/murs3.csv (203 lignes, 143 colonnes)
Chargé : ../../raw_data/maquette4/murs4.csv (312 lignes, 96 colonnes)
Chargé : ../../raw_data/maquette5/murs5.csv (345 lignes, 94 colonnes)
Chargé : ../../raw_data/maquette6/murs6.csv (203 lignes, 91 colonnes)
Chargé : ../../raw_data/maquette11/murs11.csv (121 lignes, 115 colonnes)
Chargé : ../../raw_data/maquette12/murs12.csv (121 lignes, 118 colonnes)
Chargé : ../../raw_data/maquette13/murs13.csv (797 lignes, 104 colonnes)
Chargé : ../../raw_data/maquette14/murs14.csv (3518 lignes, 107 colonnes)
Chargé : ../../raw_data/maquette15/murs15.csv (1228 lignes, 113 colonnes)
Chargé : ../../raw_data/maquette16/murs16.csv (1088 lignes, 87 colonnes)
Chargé : ../../raw_data/maquette17/murs17.csv (84 lignes, 86 colonnes)
Chargé : ../../raw_data/maquette18/murs18.csv (1743 lignes, 91 co

### FEATURES SELECTION:

In [19]:
all_features = dfs_concat.columns.to_list()

In [27]:
percent_missing = dfs_concat.isnull().sum() * 100 / len(dfs_concat)

missing_value_df = pd.DataFrame({'column_name': dfs_concat.columns,
                                 'percent_missing': percent_missing})

In [32]:
feature_list = missing_value_df.sort_values('percent_missing',
                                            ascending=False)['column_name'].to_list()


In [49]:
selected_feature_list =  [ '013EC_Localisation',
                            'Décalage inférieur',
                            '012EC_Ouvrage',
                            '014EC_Mode Constructif',
                            'Matériau structurel',
                            'Décalage supérieur',
                            'Nom',
                            '011EC_Lot',
                            'Longueur',
                            #'Famille et type'
                            ]
dfs_concat[selected_feature_list].sample(2)

Unnamed: 0,013EC_Localisation,Décalage inférieur,012EC_Ouvrage,014EC_Mode Constructif,Matériau structurel,Décalage supérieur,Nom,011EC_Lot,Longueur
9525,CES,7.623e-13,CES,CES,TPFI_BOIS,0.02,TPFI-FOB-34,FACADE,2.22036
7014,INTERIEUR,0.0,MUR,MACONNERIE,.EC.R Béton TOUS chantier,-0.22,EIF_STR_Voile non-porteur 18cm,GO,1.379994


In [50]:
df_clean = dfs_concat[selected_feature_list].dropna(axis=0, how='any')
df_clean.shape

(13508, 9)

In [51]:
#ignoring '011EC_Lot' for now / all GO
X = df_clean.drop(columns = ['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot'])
y = df_clean[['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif']]
y.shape, X.shape

((13508, 3), (13508, 5))

In [52]:
X

Unnamed: 0,Décalage inférieur,Matériau structurel,Décalage supérieur,Nom,Longueur
0,0.0,ECSA - Béton Voiles,0.0,Voile BA 20,6.200000
1,0.0,ECSA - Béton Voiles,0.0,Voile BA 20,2.907734
2,0.0,ECSA - Béton Voiles,0.0,Voile BA 20,3.780000
3,0.0,ECSA - Béton Voiles,0.0,Voile BA 20,3.780000
4,0.0,ECSA - Béton Voiles,0.0,Voile BA 20,4.850000
...,...,...,...,...,...
13628,0.0,<Par catégorie>,0.0,ECRL_MOB_EP15cm,12.790926
13629,0.0,<Par catégorie>,0.0,ECRL_MOB_EP15cm,14.375674
13630,0.0,<Par catégorie>,0.0,ECRL_MUR_EP20cm,7.319774
13631,0.0,<Par catégorie>,0.0,ECRL_MUR_EP20cm,3.199963


In [53]:
y

Unnamed: 0,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif
0,MUR,INTERIEUR,POUTRE VOILE
1,MUR,INTERIEUR,POUTRE VOILE
2,MUR,INTERIEUR,BANCHE
3,MUR,INTERIEUR,BANCHE
4,MUR,INTERIEUR,POUTRE VOILE
...,...,...,...
13628,MUR,EXTERIEUR,MOB
13629,MUR,EXTERIEUR,MOB
13630,MUR,INTERIEUR,MACONNERIE
13631,MUR,EXTERIEUR,MACONNERIE


In [54]:
for feat in X.columns:
    print(f'the uniques for {feat}: {len(X[feat].unique())}')


the uniques for Décalage inférieur: 343
the uniques for Matériau structurel: 25
the uniques for Décalage supérieur: 532
the uniques for Nom: 143
the uniques for Longueur: 7750


In [55]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
ohe.fit(X[['Matériau structurel']])
ohe.categories_

[array(['.EC.R Béton TOUS chantier', '<Par catégorie>', 'BRIQUES',
        'BRIQUES RAL 1013', 'Blocs béton manufacturés', 'BÉTON', 'C25/30',
        'C25/30 - béton standard', 'C25/30 - peaux extérieures prémurs',
        'C25/30 - éléments préfa', 'EC - Béton', 'ECSA - Béton Voiles',
        'EIF_STR - BETON', 'GOE-Béton Coulé sur Place',
        'GOE-Béton Coulé sur Place-VNP',
        'GOE-Béton Coulé sur place-1ere PHASE', 'GOE-Parpaings Creux',
        'GOE-Parpaings Pleins', 'Maçonnerie',
        'Maçonnerie - Parpaing creux', 'Maçonnerie - Parpaing plein',
        'Maçonnerie - Voile BA', 'TPFI_BA', 'TPFI_BOIS', 'Verre'],
       dtype=object)]

In [147]:
feats = ['Matériau structurel','Famille et type']
for feat in feats:
    print(f'Transforming {feat}: {len(X[feat].unique())} values')
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(X[[feat]])
    X[ohe.get_feature_names_out()] = ohe.transform(X[[feat]])
    X.drop(columns=[feat], inplace=True)


Transforming Matériau structurel: 6 values
Transforming Famille et type: 30 values


In [148]:
X.shape

(519, 40)

In [149]:
y['012EC_Ouvrage'].unique()

array(['POTEAU RECTANGULAIRE', 'POTEAU PARTICULIER', 'POTEAU CIRCULAIRE',
       'MUR'], dtype=object)

In [150]:
labeler2 = {'POTEAU RECTANGULAIRE':0,
           'POTEAU PARTICULIER':1,
           'POTEAU CIRCULAIRE':2,
           'MUR':3,
}
y['ouvrage'] = y['012EC_Ouvrage'].apply(lambda x: labeler2[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['ouvrage'] = y['012EC_Ouvrage'].apply(lambda x: labeler2[x])


In [151]:
y['013EC_Localisation'].unique()

array(['INTERIEUR', 'EXTERIEUR', 'COURANT', 'BASSINS PERIPHERIES'],
      dtype=object)

In [152]:
labeler2 = {'INTERIEUR':0,
           'EXTERIEUR':1,
           'COURANT':2,
           'BASSINS PERIPHERIES':3,
}
y['local'] = y['013EC_Localisation'].apply(lambda x: labeler2[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['local'] = y['013EC_Localisation'].apply(lambda x: labeler2[x])


In [153]:
y['014EC_Mode Constructif'].unique()

array(['BANCHE', 'PREFA CHANTIER', 'PREFA USINE', 'CARTON'], dtype=object)

In [154]:
y['014EC_Mode Constructif'].unique()
labeler1 = {'BANCHE':0,
           'PREFA CHANTIER':1,
           'PREFA USINE':2,
           'CARTON':3,
       }
y['const'] = y['014EC_Mode Constructif'].apply(lambda x: labeler1[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['const'] = y['014EC_Mode Constructif'].apply(lambda x: labeler1[x])


In [139]:
y.sample(1)

Unnamed: 0,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif,local,Ouvrage,construc
515,POTEAU RECTANGULAIRE,INTERIEUR,CARTON,0,0,3


In [164]:
from sklearn.dummy import DummyClassifier


dummy_clf_local = DummyClassifier(strategy="stratified")
dummy_clf_local.fit(X, y.local)


In [165]:
set(dummy_clf_local.predict(X))


{0, 1, 2, 3}

In [None]:
dummy_clf_local.score(X, y.local)

0.49710982658959535

In [191]:
dummy_clf_construc = DummyClassifier(strategy="stratified")
dummy_clf_construc.fit(X, y.const)

In [192]:
set(dummy_clf_construc.predict(X))

{0, 1, 2, 3}

In [193]:
dummy_clf_construc.score(X, y.const)

0.35645472061657035

In [180]:
dummy_clf_ouvrage = DummyClassifier(strategy="stratified")
dummy_clf_ouvrage.fit(X, y.ouvrage)

In [187]:
y_pred = dummy_clf_ouvrage.predict(X)

In [189]:
dummy_clf_ouvrage.score(y_pred, y.const)

0.35260115606936415