# 02 - Feature Engineering

**Étapes:**
1. Charger les données brutes
2. Nettoyer (supprimer les NaN et les valeurs saturées)
3. Créer les features (temporelles + ratios)
4. Encoder la saison (one-hot)
5. Préparer le dataset submission

In [None]:
import sys
sys.path.append('..')

from src.paths import (
    WATER_QUALITY_FILE, LANDSAT_FILE, TERRACLIMATE_FILE,
    SUBMISSION_TEMPLATE, LANDSAT_SUBMISSION_FILE, TERRACLIMATE_SUBMISSION_FILE
)
from src.config import TARGETS, ALL_FEATURES, LANDSAT_FEATURES
from src.data.load_data import load_all, load_submission
from src.features import prepare_training, prepare_submission, MODEL_FEATURES, select_model_features
from src.visualization import plot_correlation

## 1. Charger les données brutes

In [None]:
# Stringify the three raw source paths up front, then load them in one call.
# Only the fourth returned object is kept; fill_na=False is forwarded as-is
# (presumably missing values are left unfilled at this stage -- TODO confirm).
raw_source_paths = [
    str(path)
    for path in (WATER_QUALITY_FILE, LANDSAT_FILE, TERRACLIMATE_FILE)
]
_, _, _, df_raw = load_all(*raw_source_paths, features=ALL_FEATURES, fill_na=False)

## 2. Préparer les données training

In [None]:
# Training pipeline (per the original note: cleans, creates features,
# encodes season). Both returned values are kept: the prepared frame and
# `medians`, which is passed to the submission pipeline later in the notebook.
training_result = prepare_training(df_raw)
df_train, medians = training_result

# Report the row count and the resulting column names.
print(
    f"\nDataset training: {len(df_train)} lignes",
    f"Colonnes: {list(df_train.columns)}",
    sep="\n",
)

## 3. Corrélations

In [None]:
# Correlation of the Landsat features vs the targets
plot_correlation(df_train, LANDSAT_FEATURES)

In [None]:
# Correlation of the newly created features vs the targets.
# The call stays the last expression of the cell so any returned object
# is still displayed by the notebook.
engineered_features = ['day_of_year', 'nir_green_ratio', 'swir_ratio']
plot_correlation(df_train, engineered_features)

## 4. Préparer les données submission

In [None]:
# Load the raw submission data; only the second returned object is used.
submission_source_paths = [
    str(path)
    for path in (
        SUBMISSION_TEMPLATE,
        LANDSAT_SUBMISSION_FILE,
        TERRACLIMATE_SUBMISSION_FILE,
    )
]
_, df_sub_raw = load_submission(
    *submission_source_paths, features=ALL_FEATURES, fill_na=False
)

# Submission pipeline (per the original note: imputes, creates features,
# encodes season), fed with the medians returned by the training pipeline.
df_submission = prepare_submission(df_sub_raw, medians)

print(f"\nDataset submission: {len(df_submission)} lignes")

## 5. Résumé

In [None]:
# Recap: the model feature list, then the size of each prepared dataset.
print("Features pour le modèle:", MODEL_FEATURES, sep="\n")

print(f"\nTraining: {len(df_train)} lignes")
print(f"Submission: {len(df_submission)} lignes")