In [2]:
import pandas as pd
import numpy as np
import joblib
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# ================== Chargement des données ==================
data = pd.read_csv("/Users/hadilneji/Downloads/pollution.csv", sep=';', parse_dates=['date'])
data = data.drop(columns=['CO', 'NOX', 'date'])

# ================== Normalisation des noms de zones ==================
def normalize_text(text):
    # Corrige les encodages bizarres et supprime accents
    text = str(text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    text = text.strip().upper()
    return text

data['ville'] = data['ville'].apply(normalize_text)

# Vérif des noms uniques après normalisation
print("\nNoms de zones normalisés :")
print(data['ville'].unique())

# ================== Encodage LabelEncoder ==================
le = LabelEncoder()
data['ville'] = le.fit_transform(data['ville'])

# Sauvegarde du LabelEncoder
joblib.dump(le, "ville_encoder.pkl")

# ================== Création colonnes cibles ==================
data['AQI_3h'] = data['AQI'].shift(-3)
data['AQI_6h'] = data['AQI'].shift(-6)
data = data.dropna()

# ================== Features & targets ==================
features = [
    'PM10','PM25','NO2','NO','ville',
    'year','month','day','heure',
    'temperature','vent','pluie','humidite','weekend'
]
X = data[features]
y_3h = data['AQI_3h']
y_6h = data['AQI_6h']

# ================== MODELE 3h ==================
X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y_3h, test_size=0.2, random_state=42)
model_3h = RandomForestRegressor(n_estimators=200, random_state=42)
model_3h.fit(X_train3, y_train3)
pred_3h = model_3h.predict(X_test3)

# Sauvegarde modèle en .pkl
joblib.dump(model_3h, "aqi_model_3h.pkl")

# ================== MODELE 6h ==================
X_train6, X_test6, y_train6, y_test6 = train_test_split(X, y_6h, test_size=0.2, random_state=42)
model_6h = RandomForestRegressor(n_estimators=200, random_state=42)
model_6h.fit(X_train6, y_train6)
pred_6h = model_6h.predict(X_test6)

# Sauvegarde modèle en .pkl
joblib.dump(model_6h, "aqi_model_6h.pkl")

# Exemple de prédiction future
sample = X_test3.iloc[[0]]
print("\nExemple de prédictions :")
print("  AQI prévu dans 3h :", model_3h.predict(sample)[0])
print("  AQI prévu dans 6h :", model_6h.predict(sample)[0])



Noms de zones normalisés :
['AUTOROUTE A1 - SAINT-DENIS' 'AVENUE DES CHAMPS ELYSEES'
 'BOULEVARD HAUSSMANN' 'BOULEVARD PERIPHERIQUE EST' 'GENNEVILLIERS'
 'PARIS 18EME' 'ROUTE NATIONALE 6 - MELUN']

Exemple de prédictions :
  AQI prévu dans 3h : 24.542347734709367
  AQI prévu dans 6h : 48.90993514434685
