A réaliser : 
- Une analyse descriptive des données, y compris une explication du sens des colonnes gardées, des arguments derrière la suppression de lignes ou de colonnes, des statistiques descriptives et des visualisations pertinentes.

## Import des modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:

# Load the post-EDA dataset; the unnamed first CSV column holds the saved index.
eda_csv_path = "data/bc_after_eda.csv"
bc_after_eda = pd.read_csv(eda_csv_path, index_col='Unnamed: 0')
bc_after_eda

## Import des modules 

In [None]:
#Selection
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV, 
    cross_validate,
)
from sklearn.metrics import r2_score, mean_absolute_error , root_mean_squared_error, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline

#Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler,FunctionTransformer

#Modèles
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


## Feature Engineering

A réaliser : Enrichir le jeu de données actuel avec de nouvelles features issues de celles existantes. 

### Préparation des features pour la modélisation

A réaliser :
* Si ce n'est pas déjà fait, supprimer toutes les colonnes peu pertinentes pour la modélisation.
* Tracer la distribution de la cible pour vous familiariser avec l'ordre de grandeur. En cas d'outliers, mettez en place une démarche pour les supprimer.
* Débarrassez-vous des features redondantes en utilisant une matrice de corrélation.
* Réalisez différents graphiques pour comprendre le lien entre vos features et la target (boxplots, scatterplots, pairplot si votre nombre de features numériques n'est pas très élevé).
* Séparez votre jeu de données en un Pandas DataFrame X (ensemble de features) et une Pandas Series y (votre target).
* Si vous avez des features catégorielles, il faut les encoder pour que votre modèle fonctionne.

#### Modes énergétiques

In [None]:
# Boolean flag per energy source: True when a consumption value is present
# and different from zero (NaN != 0 is True, hence the explicit notna()).
for _flag, _consumption_col in (
    ('UseGas', 'NaturalGas(kBtu)'),
    ('UseSteam', 'SteamUse(kBtu)'),
    ('UseElectricity', 'Electricity(kBtu)'),
):
    _consumption = bc_after_eda[_consumption_col]
    bc_after_eda[_flag] = _consumption.notna() & (_consumption != 0)


#### Distance au centre-ville

In [None]:
def haversine(lat1, lon1, lat2, lon2, radius=3958.8):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float, numpy array or pandas Series
        Coordinates in decimal degrees. Array-likes are processed
        element-wise and broadcast against scalars.
    radius : float, default 3958.8
        Sphere radius; the result is expressed in the same unit. The default
        is the Earth's mean radius in miles.

    Returns
    -------
    Same kind as the inputs (scalar, array or Series): the distance(s).
    """
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c


In [None]:
# Reference point: downtown Seattle. The previous longitude (-122.5049456)
# lies roughly 8 miles west of downtown, in Puget Sound, so distances were
# not measured from the city centre.
seattle_lat, seattle_lon = 47.6062, -122.3321

# Distance of every building to the city centre, rounded to 2 decimals
# (unit follows the radius used by haversine, miles by default).
bc_after_eda['CityDistance'] = haversine(
    bc_after_eda['Latitude'], bc_after_eda['Longitude'],
    seattle_lat, seattle_lon
).round(2)
bc_after_eda['CityDistance'].describe()

#### Utilisation multiple

In [None]:
# Number of declared use types per property = number of separators + 1.
# The previous pattern counted the letter 's', which is unrelated to the
# number of listed items.
# Assumes the column holds a comma-separated list - TODO confirm against the
# EDA output. Commas inside parentheses, e.g. "(Vehicle, Shoe, etc)", belong
# to a single use type and are therefore not counted.
use_type_separator = r',(?![^()]*\))'
bc_after_eda['MultipleUseType'] = (
    bc_after_eda['ListOfAllPropertyUseTypes'].str.count(use_type_separator) + 1
)
bc_after_eda['MultipleUseType'].value_counts()

In [None]:
# Display the column labels of the frame, including the features added above.
bc_after_eda.columns

## Split train/test

In [None]:
# Column to predict and the explanatory columns fed to the preprocessing.
target = 'SiteEnergyUse(kBtu)'
predict_values = [
    '3LargestGFA',
    'FirstUseType',
    'SecondLargestPropertyUseType',
    'MultipleUseType',
    'UseSteam',
    'UseElectricity',
    'UseGas',
    'NumberofFloors',
    'NumberofBuildings',
    'CityDistance',
    'Neighborhood',
    'YearBuilt',
]

X, y = bc_after_eda[predict_values], bc_after_eda[target]

# 80 % train / 20 % test, reproducible through the fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Sanity check: the training features and training target share the same index.
print("Index X_train avant pipeline :", X_train.index.equals(y_train.index))


## Finalisation des features

In [None]:
# ========================
# ÉTAPE 1: PREPROCESSING PERSONNALISÉ 
# ========================

def _cut_with_cached_quantiles(values, cache_attr, q, labels):
    """Bin *values* on quantile edges learned once and cached on the entry point.

    The edges are computed with ``pd.qcut`` the first time the attribute
    ``cache_attr`` is missing on ``fix_floors_and_discretize`` and reused on
    every later call.
    """
    if not hasattr(fix_floors_and_discretize, cache_attr):
        _, learned_edges = pd.qcut(values, q=q, retbins=True, duplicates='drop')
        setattr(fix_floors_and_discretize, cache_attr, learned_edges)

    # Work on a copy so the cached edges stay untouched.
    edges = np.array(getattr(fix_floors_and_discretize, cache_attr), dtype=float)
    # Open both ends: values outside the range seen when the edges were
    # learned fall into the first/last category instead of becoming NaN.
    edges[0], edges[-1] = -np.inf, np.inf
    # duplicates='drop' may leave fewer intervals than labels; pd.cut requires
    # exactly one label per interval, so only the first labels are kept.
    return pd.cut(values, bins=edges, labels=labels[:len(edges) - 1],
                  include_lowest=True)


def fix_floors_and_discretize(df):
    """Clean ``NumberofFloors`` and add the discretised features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'NumberofFloors', 'NumberofBuildings', 'YearBuilt' and
        '3LargestGFA'. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with corrected 'NumberofFloors' and the added columns
        'AgeProperty', 'AgeCategory', 'EnergyEra', 'PropertySize' and
        'HeightCategory'.

    Notes
    -----
    The quantile edges for 'PropertySize' and 'HeightCategory' are learned on
    the FIRST call and stored as the function attributes ``size_bins`` and
    ``floor_bins``; later calls reuse them. Delete those attributes (or
    redefine the function) to learn them again on other data.
    """
    df = df.copy()

    # 1. Replace invalid floor counts (< 1) by the rounded mean floor count
    #    of single-building properties.
    invalid_floors = df['NumberofFloors'] < 1
    if invalid_floors.any():
        mean_floors = df.loc[df['NumberofBuildings'] == 1, 'NumberofFloors'].mean()
        # The mean is NaN when no single-building row exists: fall back to 1.
        # max(1, ...) guarantees the replacement is itself a valid count.
        replacement = 1 if pd.isna(mean_floors) else max(1, int(round(mean_floors)))
        df.loc[invalid_floors, 'NumberofFloors'] = replacement

    # 2. Age of the property (reference year 2016) and its category.
    #    The last edge is open-ended: using the observed maximum age breaks
    #    pd.cut (non-increasing / duplicated edges) whenever that maximum is
    #    70 or less.
    df['AgeProperty'] = 2016 - df['YearBuilt']
    df['AgeCategory'] = pd.cut(df['AgeProperty'],
                               bins=[0, 20, 40, 70, np.inf],
                               labels=['Neuf', 'Récent', 'Ancien', 'Historique'],
                               include_lowest=True)

    # 3. Construction era, from fixed year boundaries.
    df['EnergyEra'] = pd.cut(df['YearBuilt'],
                             bins=[1900, 1980, 2000, 2016],
                             labels=['Pre-Crisis', 'Modern', 'Contemporary'],
                             include_lowest=True)

    # 4. PropertySize: quartiles of '3LargestGFA'.
    df['PropertySize'] = _cut_with_cached_quantiles(
        df['3LargestGFA'], 'size_bins', 4, ['Small', 'Mid', 'Large', 'XLarge'])

    # 5. HeightCategory: terciles of the corrected 'NumberofFloors'.
    df['HeightCategory'] = _cut_with_cached_quantiles(
        df['NumberofFloors'], 'floor_bins', 3, ['Low', 'Mid', 'High'])
    return df

# ========================
# STEP 2: FULL PIPELINE
# ========================

# Columns available after the custom preprocessing step.
categorical_features = ['FirstUseType', 'SecondLargestPropertyUseType', 'PropertySize',
                       'Neighborhood','AgeCategory','EnergyEra','HeightCategory']

numerical_features = ['3LargestGFA',
                     'CityDistance', 'MultipleUseType', 'NumberofFloors','NumberofBuildings']

# Raw year/age columns are only needed to build AgeCategory / EnergyEra.
# They are dropped INSIDE the pipeline so that every consumer of the pipeline
# (cross-validation, final frames) sees the same feature set; previously they
# were removed from the final DataFrames only.
dropped_features = ['YearBuilt', 'AgeProperty']

full_pipeline = Pipeline([
    # Custom preprocessing (floor fix + discretisation)
    ('preprocessing', FunctionTransformer(fix_floors_and_discretize, validate=False)),

    # One-hot encoding + scaling; every other column is passed through as-is
    ('encoder', ColumnTransformer([
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
         categorical_features),
        ('num', StandardScaler(), numerical_features),
        ('unused', 'drop', dropped_features),
    ], remainder='passthrough'))
])

# ========================
# STEP 3: APPLICATION
# ========================

# Fit on the training split only, then transform both splits.
X_train_transformed = full_pipeline.fit_transform(X_train)
X_test_transformed = full_pipeline.transform(X_test)

# ========================
# STEP 4: BUILD THE *_final DataFrames
# ========================

# Output column names, in ColumnTransformer order: one-hot, scaled, passthrough.
onehot = full_pipeline.named_steps['encoder'].named_transformers_['cat']
onehot_names = onehot.get_feature_names_out(categorical_features)
num_names = [f"scaled_{col}" for col in numerical_features]

# Passthrough columns keep their order of appearance in the preprocessed frame.
all_cols_after_preprocessing = fix_floors_and_discretize(X_train).columns
handled_cols = set(categorical_features + numerical_features + dropped_features)
remaining_cols = [col for col in all_cols_after_preprocessing
                  if col not in handled_cols]

final_feature_names = list(onehot_names) + num_names + remaining_cols

X_train_final = pd.DataFrame(X_train_transformed, columns=final_feature_names, index=X_train.index)
X_test_final = pd.DataFrame(X_test_transformed, columns=final_feature_names, index=X_test.index)

# The stacked array may have object dtype (boolean passthrough columns next to
# floats): coerce every column to a numeric dtype.
X_train_final = X_train_final.apply(pd.to_numeric, errors='coerce')
X_test_final = X_test_final.apply(pd.to_numeric, errors='coerce')

print(f"✅ Pipeline terminé!")
print(f"Shape finale: Train {X_train_final.shape}, Test {X_test_final.shape}")


In [None]:
# Sanity check: the transformed training frame still shares the target's index.
print("Index X_train_final après pipeline :", X_train_final.index.equals(y_train.index))

In [None]:
# Print the summary (columns, non-null counts, dtypes) of the final training frame.
X_train_final.info()

In [None]:
# MODEL COMPARISON
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Candidate regressors, evaluated with their default hyper-parameters.
models = dict(
    DummyRegressor=DummyRegressor(strategy='mean'),
    LinearRegression=LinearRegression(),
    SVR=SVR(),
    GradientBoosting=GradientBoostingRegressor(random_state=42),
    RandomForest=RandomForestRegressor(random_state=42, n_jobs=-1),
)

scoring = ['neg_root_mean_squared_error','r2','neg_mean_absolute_error']
cv_results = {}

print("=== COMPARAISON MODÈLES AVEC PIPELINE COMPLET ===")
for name, model in models.items():
    # Preprocessing steps of full_pipeline followed by the candidate model.
    full_estimator = Pipeline(steps=[
        ('preprocess', full_pipeline.named_steps['preprocessing']),
        ('encode_scale', full_pipeline.named_steps['encoder']),
        ('model', model),
    ])
    scores = cross_validate(
        full_estimator, X_train, y_train,
        scoring=scoring, cv=5, n_jobs=-1,
    )
    # Error metrics come back negated from scikit-learn: flip their sign.
    rmse = -scores['test_neg_root_mean_squared_error'].mean()
    r2 = scores['test_r2'].mean()
    mae = -scores['test_neg_mean_absolute_error'].mean()
    cv_results[name] = {'RMSE': rmse, 'R2': r2, 'MAE': mae}
    print(f"{name} → R²: {r2:.3f}, RMSE: {rmse:.0f}, MAE: {mae:.0f}")
    
    

In [None]:
# Metric used to rank the models: R2 is maximised, error metrics are minimised.
choice = "MAE"
pick_best = max if choice == "R2" else min
best_model_name = pick_best(cv_results, key=lambda model_name: cv_results[model_name][choice])
print(f"\n🏆 Meilleur modèle selon {choice}: {best_model_name}")

### Optimisation et interprétation du modèle

A réaliser :
* Reprenez le meilleur algorithme que vous avez sécurisé via l'étape précédente, et réalisez une GridSearch de petite taille sur au moins 3 hyperparamètres.
* Si le meilleur modèle fait partie de la famille des modèles à arbres (RandomForest, GradientBoosting) alors utilisez la fonctionnalité feature importance pour identifier les features les plus impactantes sur la performance du modèle. Sinon, utilisez la méthode Permutation Importance de sklearn. 

In [None]:
# Assumes best_model_name was set by the model-selection cell.
print(f"=== OPTIMISATION de {best_model_name} ===")

# Supported models: name -> (estimator factory, hyper-parameter grid).
search_spaces = {
    'RandomForest': (
        lambda: RandomForestRegressor(random_state=42, n_jobs=-1),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
    ),
    'GradientBoosting': (
        lambda: GradientBoostingRegressor(random_state=42),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.1, 0.05, 0.01],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0],
        },
    ),
    'LinearRegression': (
        lambda: LinearRegression(),
        {
            'fit_intercept': [True, False],
            'positive': [False, True],
        },
    ),
    'SVR': (
        lambda: SVR(),
        {
            'kernel': ['rbf', 'linear', 'poly'],
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto'],
        },
    ),
}

if best_model_name not in search_spaces:
    raise ValueError(f"Modèle non supporté : {best_model_name}")

build_estimator, param_grid = search_spaces[best_model_name]
estimator = build_estimator()

# Exhaustive 5-fold search, ranked on (negated) RMSE.
gs = GridSearchCV(
    estimator,
    param_grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
gs.fit(X_train_final, y_train)

print("Meilleurs paramètres :", gs.best_params_)
print("Meilleur score CV (RMSE) :", -gs.best_score_)

# Final evaluation of the refitted best estimator on the held-out test split.
final_model = gs.best_estimator_
y_pred = final_model.predict(X_test_final)

test_rmse = root_mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_mape = mean_absolute_percentage_error(y_test, y_pred)

print("\n=== PERFORMANCE FINALE sur TEST ===")
print(f"RMSE: {test_rmse:.0f}")
print(f"R²: {test_r2:.4f}")
print(f"MAE: {test_mae:.0f}")
print(f"MAPE: {test_mape:.4f}")


=== OPTIMISATION de LinearRegression ===  
Fitting 5 folds for each of 4 candidates, totalling 20 fits  
Meilleurs paramètres : {'fit_intercept': False, 'positive': True}  
Meilleur score CV (RMSE) : 15216932.25998593  
  
=== PERFORMANCE FINALE sur TEST ===  
RMSE: 9128882  
R²: 0.7541  
MAE: 4289626  
MAPE: 1.4691  

In [None]:
best_model = gs.best_estimator_
feature_names = X_train_final.columns

if hasattr(best_model, 'feature_importances_'):
    # Tree-based models expose impurity-based importances directly.
    importances = best_model.feature_importances_
else:
    # Other models (e.g. LinearRegression, SVR) have no feature_importances_
    # attribute: measure the score drop when each feature is shuffled on the
    # held-out split instead.
    permutation_result = permutation_importance(
        best_model, X_test_final, y_test,
        n_repeats=10, random_state=42, n_jobs=-1,
    )
    importances = permutation_result.importances_mean

# Show the 30 most important features, most important first.
sorted_idx = importances.argsort()[::-1]
print("Top 30 features by importance:")
for idx in sorted_idx[:30]:
    print(f"- {feature_names[idx]}: {importances[idx]:.4f}")

# Sauvegarde BentoML

In [None]:
import bentoml
from sklearn.base import clone

# full_pipeline stops at the encoder and has no estimator, so saving it under
# a "predict" signature yields a model that cannot predict. Save an
# end-to-end pipeline instead: the same preprocessing steps followed by the
# tuned model, refitted together on the raw training split so that the
# estimator matches exactly what the preprocessing outputs.
deployable_pipeline = Pipeline(
    clone(full_pipeline).steps + [('model', clone(final_model))]
)
deployable_pipeline.fit(X_train, y_train)

bentoml.sklearn.save_model(
    name="building_energy_rf_pipeline",
    model=deployable_pipeline,
    signatures={"predict": {"batchable": True}},
    metadata={
        "author": "day811",
        "feature_names": list(feature_names),
        # Raw columns expected by predict().
        "input_columns": list(X_train.columns),
    },
)

In [None]:
# Print the column names of the final training frame.
print(list(X_train_final.columns))