In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [9]:
import os

base_dir = "/Users/fabiancordenod/code/fqbq69/BIMpredict-/raw_data"

dfs = []
for i in range(1, 7):
    murs_path = os.path.join(base_dir, f"maquette{i}", f"murs{i}.csv")
    if os.path.exists(murs_path):
        with open(murs_path, encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if line.startswith("Id;"):
                    header_row = idx
                    break
        try:
            df = pd.read_csv(murs_path, sep=';', header=header_row)
            dfs.append(df)
            print(f"Chargé : {murs_path} ({df.shape[0]} lignes, {df.shape[1]} colonnes)")
            print(df.head())
        except Exception as e:
            print(f"Erreur de parsing : {murs_path} -> {e}")
    else:
        print(f"Fichier non trouvé : {murs_path}")

if dfs:
    murs_concat = pd.concat(dfs, ignore_index=True)
    print(f"Total concaténé : {murs_concat.shape[0]} lignes, {murs_concat.shape[1]} colonnes")
else:
    murs_concat = pd.DataFrame()
    print("Aucun fichier murs.csv trouvé.")

murs_concat.head()

Chargé : /Users/fabiancordenod/code/fqbq69/BIMpredict-/raw_data/maquette1/murs1.csv (1589 lignes, 146 colonnes)
       Id 011EC_Lot 012EC_Ouvrage 013EC_Localisation 014EC_Mode Constructif  \
0  779920        GO           MUR          INTERIEUR           POUTRE VOILE   
1  780111        GO           MUR          INTERIEUR           POUTRE VOILE   
2  780661        GO           MUR          INTERIEUR                 BANCHE   
3  783007        GO           MUR          INTERIEUR                 BANCHE   
4  783554        GO           MUR          INTERIEUR           POUTRE VOILE   

           Nom           Hauteur Epaisseur                            AI  \
0  Voile BA 20  2,88000000000138       0,2  -0,0000000000013788124467113   
1  Voile BA 20  2,88000000000138       0,2  -0,0000000000013788124467113   
2  Voile BA 20  2,88000000000138       0,2  -0,0000000000013788124467113   
3  Voile BA 20  2,88000000000138       0,2  -0,0000000000013788124467113   
4  Voile BA 20  2,88000000000138 

Unnamed: 0,Id,011EC_Lot,012EC_Ouvrage,013EC_Localisation,014EC_Mode Constructif,Nom,Hauteur,Epaisseur,AI,AS,...,EIF_STR - Impact,EC_Type de Mur,ID MONTAGE,Désignation système,Réf DT,Nature_Ouvrage,Batiment,Mur armé,Affichage poteau,NIVEAU_STRUCTURE
0,779920,GO,MUR,INTERIEUR,POUTRE VOILE,Voile BA 20,288000000000138,2,-13788124467113,288,...,,,,,,,,,,
1,780111,GO,MUR,INTERIEUR,POUTRE VOILE,Voile BA 20,288000000000138,2,-13788124467113,288,...,,,,,,,,,,
2,780661,GO,MUR,INTERIEUR,BANCHE,Voile BA 20,288000000000138,2,-13788124467113,288,...,,,,,,,,,,
3,783007,GO,MUR,INTERIEUR,BANCHE,Voile BA 20,288000000000138,2,-13788124467113,288,...,,,,,,,,,,
4,783554,GO,MUR,INTERIEUR,POUTRE VOILE,Voile BA 20,288000000000138,2,-13788124467113,288,...,,,,,,,,,,


In [None]:
target_col = "011EC_Lot"
X = df.drop(columns=[target_col])
y = df[target_col]

# 3. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Détection automatique des types de variables
num_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object','category']).columns.tolist()

# 5. Préprocessing
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

# 6. Choix du modèle
# --> Basculer entre classification et regression ici :
task_type = "classification"  # ou "regression"

if task_type == "classification":
    model = RandomForestClassifier(random_state=42)
    scoring = accuracy_score
else:
    model = RandomForestRegressor(random_state=42)
    scoring = mean_squared_error

# 7. Pipeline complet
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)
])

# 8. Entraînement
pipeline.fit(X_train, y_train)

# 9. Évaluation
y_pred = pipeline.predict(X_test)

if task_type == "classification":
    score = scoring(y_test, y_pred)
    print(f"Accuracy: {score:.4f}")
else:
    score = scoring(y_test, y_pred, squared=False)  # RMSE
    print(f"RMSE: {score:.4f}")

KeyError: "['nom_de_la_cible'] not found in axis"