In [1]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [2]:
# Root folder of the raw CSV exports (relative path, resolved against the
# notebook's working directory). import_csv() looks for
# "maquette<i>/<table><i>.csv" beneath this folder.
base_dir = "../../raw_data/"


In [3]:
def import_csv(tableau, files, base_path=None):
    """Load and concatenate the per-model CSV exports of one table.

    For every identifier ``i`` in ``files`` the file
    ``<root>/maquette{i}/{tableau}{i}.csv`` is read. The files may start with
    preamble lines; the real header is taken to be the first line that starts
    with ``"Id;"``. Files are semicolon-separated with a comma as decimal
    mark.

    Parameters
    ----------
    tableau : str
        Table name used as the CSV file-name prefix (e.g. ``"murs"``).
    files : iterable
        Model identifiers; each one selects a ``maquette{i}`` folder.
    base_path : str or path-like, optional
        Root folder holding the ``maquette{i}`` folders. Defaults to the
        notebook-level ``base_dir``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (with a fresh index) of every file that could
        be loaded, or an empty DataFrame when none could.

    Notes
    -----
    Missing files, files without an ``"Id;"`` header line and files that fail
    to parse are reported on stdout and skipped; they never abort the load.
    """
    root = base_dir if base_path is None else base_path
    dfs = []

    for i in files:
        csv_path = os.path.join(root, f"maquette{i}", f"{tableau}{i}.csv")
        if not os.path.exists(csv_path):
            print(f"Fichier non trouvé : {csv_path}")
            continue

        try:
            # Locate the real header line. The index is recomputed for every
            # file so a value found in a previous file can never leak into
            # this one. "utf-8-sig" strips an optional BOM that would
            # otherwise hide an "Id;" header sitting on the very first line.
            with open(csv_path, encoding="utf-8-sig") as f:
                header_row = next(
                    (idx for idx, line in enumerate(f) if line.startswith("Id;")),
                    None,
                )
            if header_row is None:
                print(f"En-tête 'Id;' introuvable : {csv_path}")
                continue

            # skiprows counts physical lines (blank ones included), exactly
            # like enumerate() above; header=N would ignore blank lines and
            # could therefore point at the wrong row.
            df = pd.read_csv(
                csv_path,
                sep=';',
                decimal=",",
                encoding="utf-8-sig",
                skiprows=header_row,
                header=0,
            )
        except Exception as e:
            # Best-effort load: report the problem and move on to the next file.
            print(f"Erreur de parsing : {csv_path} -> {e}")
            continue

        dfs.append(df)
        print(f"Chargé : {csv_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")

    if dfs:
        dfs_concat = pd.concat(dfs, ignore_index=True)
        print(f"Total concaténé : {dfs_concat.shape[0]} lignes, {dfs_concat.shape[1]} colonnes")
    else:
        dfs_concat = pd.DataFrame()
        print(f"Aucun fichier {tableau}.csv trouvé.")

    return dfs_concat


In [None]:
# Model identifiers to load: 1 to 6 and 11 to 20 (7 to 10 are not included).
files = [*range(1, 7), *range(11, 21)]
dfs_concat = import_csv('murs', files)

In [None]:
# Share of missing values per column, expressed as a percentage of the rows.
percent_missing = dfs_concat.isna().sum().mul(100).div(len(dfs_concat))

# One row per column: its name and how incomplete it is.
missing_value_df = pd.DataFrame(
    {
        'column_name': dfs_concat.columns,
        'percent_missing': percent_missing,
    }
)

In [None]:
# Column names ordered from the most to the least incomplete.
feature_list = (
    missing_value_df
    .sort_values(by='percent_missing', ascending=False)
    .loc[:, 'column_name']
    .tolist()
)

In [None]:
# Columns kept for the rest of the notebook: the four "01xEC_*" label
# columns first, then the wall descriptors. Entries that are commented out
# are deliberately left out of the selection.
selected_features_list = [
    "011EC_Lot",
    "012EC_Ouvrage",
    "013EC_Localisation",
    "014EC_Mode Constructif",
    "Epaisseur",
    "Sols en intersection",
    "Sols coupés (u)",
    "Sols coupants (u)",
    "Sol au-dessus",
    "Sol en-dessous",
    "Fenêtres",
    "Portes",
    "Ouvertures",
    # "Murs imbriqués",
    "Mur multicouche",
    "Profil modifié",
    # "Extension inférieure",
    # "Extension supérieure",
    "Partie inférieure attachée",
    "Partie supérieure attachée",
    "Décalage supérieur",
    "Décalage inférieur",
    "Matériau structurel",
    "Famille et type",
]

# NOTE(review): these snake_case names do not match the raw column headers
# above; presumably they are meant for a later renaming step. To confirm.
targets = [
    "011ec_lot",
    "012ec_ouvrage",
    "013ec_localisation",
    "014ec_mode_constructif",
]

# Keep only the selected columns, then discard every row that has at least
# one missing value among them.
df_clean = (
    dfs_concat
    .loc[:, selected_features_list]
    .dropna(axis="index", how="any")
)
df_clean.shape

In [None]:
# Separate the label columns (y) from the descriptive features (X).
# Original note: '011EC_Lot' is ignored for now ("all GO"); it is removed
# from X and still carried along in y.
y = df_clean[['012EC_Ouvrage','013EC_Localisation','014EC_Mode Constructif','011EC_Lot']]
X = df_clean.drop(columns=y.columns)
y.shape, X.shape

In [None]:
# Cardinality of every feature column.
for feat in X:
    n_unique = X[feat].unique().size
    print(f'the uniques for {feat}: {n_unique}')

In [None]:
# Cardinality of every feature column, followed by the distinct values.
for feat in X:
    distinct = X[feat].unique()
    print(f'the uniques for {feat}: {len(distinct)} || They are: \n {distinct}')

In [None]:
# Encode the boolean-like columns as integers: 1 where the value compares
# equal to True, 0 for anything else.
bool_feats = ['Sol au-dessus', 'Sol en-dessous','Mur multicouche','Profil modifié']
for feat in bool_feats:
    X[feat] = X[feat].map(lambda value: int(value == True))

# SCALING

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Numeric columns rescaled to the [0, 1] range.
feats_to_scale = ['Epaisseur','Décalage supérieur','Décalage inférieur']

# Alternative kept from the original cell, currently disabled:
# std_scaler = StandardScaler()

# Each column gets its own scaler, fitted on that column alone; after the
# loop `minmax_scaler` refers to the scaler of the last column.
for feat in feats_to_scale:
    minmax_scaler = MinMaxScaler()
    minmax_scaler.fit(X[[feat]])
    X[feat] = minmax_scaler.transform(X[[feat]])