In [14]:
import pandas as pd
import numpy as np 
file1 = r"data_after_feature_eng_v1.csv"
# Read CSV files into DataFrames

# Read CSV files into DataFrames
df_full = pd.read_csv(file1)
# 3. Tri pour le groupby
df_full.sort_values(['variant_id', 'checkout_completed_at'], inplace=True)


In [2]:
# 1) Nom de la colonne contenant la date
DATE_COL = "checkout_completed_at"   # adapte si ta colonne s’appelle autrement, p. ex. "date"

# 2) Conversion explicite en datetime (sécurise le type)
df_full[DATE_COL] = pd.to_datetime(df_full[DATE_COL], errors="coerce")  # errors="coerce" mettra NaT si format invalide

# 3) Extraction des bornes
date_min = df_full[DATE_COL].min()
date_max = df_full[DATE_COL].max()

print("📅 Date minimale :", date_min)
print("📅 Date maximale :", date_max)


📅 Date minimale : 2024-01-01 00:00:00
📅 Date maximale : 2025-07-20 00:00:00


### Feature Engineering: Creating predictive variables from raw data


In [3]:
import pandas as pd
import numpy as np
from functools import reduce

# -----------------------------
# 0. Paramètres généraux
# -----------------------------
START = pd.Timestamp('2024-07-20')
END   = pd.Timestamp('2025-07-20')

# -----------------------------
# 1. Préparer / filtrer df_full
# -----------------------------
df_full['checkout_completed_at'] = pd.to_datetime(df_full['checkout_completed_at'])

df_full = (
    df_full
      .loc[(df_full['checkout_completed_at'] >= START) &
           (df_full['checkout_completed_at'] <= END)]
      .copy()
      .sort_values(['variant_id', 'checkout_completed_at'])
)

# -----------------------------
# 2. Calendrier des événements
# -----------------------------
period = pd.date_range(START, END, freq='D')          # index journalier

event_masks = {
    'is_ramadan'       : period.isin(pd.date_range('2025-03-01','2025-03-30')),
    'is_aid_el_fitr'   : period == pd.Timestamp('2025-03-30'),
    'is_aid_el_adha'   : period == pd.Timestamp('2025-06-06'),
    'is_public_holiday': period.isin(pd.to_datetime([
        '2024-07-30','2024-08-20','2024-11-18',
        '2025-01-01','2025-01-11','2025-05-01'
    ])),
    'is_back_to_school': period.isin(pd.date_range('2024-08-25','2024-09-15'))
}

events_df = pd.DataFrame(event_masks, index=period).astype(int)

# Merge sur la date
df_full = (
    df_full
      .merge(events_df, how='left',
             left_on='checkout_completed_at',
             right_index=True)
      .fillna(0)
)

df_full['has_event'] = df_full[list(event_masks)].sum(axis=1).gt(0).astype(int)

# -----------------------------
# 3. Saisonnalité (jour de l’année)
# -----------------------------
df_full['day_of_year'] = df_full['checkout_completed_at'].dt.dayofyear
df_full['sin_doy'] = np.sin(2*np.pi*df_full['day_of_year']/365)
df_full['cos_doy'] = np.cos(2*np.pi*df_full['day_of_year']/365)

# -----------------------------
# 4. Lags, rolling & changements
# -----------------------------
# Lags classiques
for lag in [1, 2, 3, 7, 14]:
    df_full[f'lag_{lag}'] = (
        df_full.groupby('variant_id')['total_demand']
               .shift(lag)
    )

# Rolling means (après un décalage d’un jour pour éviter la fuite d’info)
shifted_td = df_full.groupby('variant_id')['total_demand'].shift(1)

for window in [7, 14, 28]:
    df_full[f'rolling_mean_{window}'] = (
        shifted_td
          .rolling(window)
          .mean()
          .reset_index(level=0, drop=True)
    )

# Écart‑type & médiane sur 7 jours
df_full['rolling_std_7'] = (
    shifted_td.rolling(7).std().reset_index(level=0, drop=True)
)
df_full['rolling_median_7'] = (
    shifted_td.rolling(7).median().reset_index(level=0, drop=True)
)

# Pourcentages de changement
df_full['pct_change_1'] = (
    df_full.groupby('variant_id')['total_demand']
           .pct_change(1, fill_method=None)
)
df_full['pct_change_7'] = (
    df_full.groupby('variant_id')['total_demand']
           .pct_change(7, fill_method=None)
)

# -----------------------------
# 5. Moyennes historiques jour/mois
# -----------------------------
df_full['day_of_week'] = df_full['checkout_completed_at'].dt.dayofweek
df_full['month']       = df_full['checkout_completed_at'].dt.month

df_full['dow_demand_avg'] = (
    df_full.groupby(['variant_id','day_of_week'])['total_demand']
           .transform('mean')
)
df_full['month_demand_avg'] = (
    df_full.groupby(['variant_id','month'])['total_demand']
           .transform('mean')
)

# -----------------------------
# 6. Ratio de zéros sur 7 jours
# -----------------------------
df_full['zero_demand_7d'] = (
    df_full.groupby('variant_id')['total_demand']
           .shift(1).rolling(7)
           .apply(lambda x: np.mean(x==0), raw=True)
           .reset_index(level=0, drop=True)
)

# Moyenne glissante 7 jours (min 3 obs.)
df_full['demand_7day_avg'] = (
    df_full.groupby('variant_id')['total_demand']
           .transform(lambda x: x.shift(1).rolling(7, min_periods=3).mean())
)

# -----------------------------
# 7. Détection d’outliers (IQR)
# -----------------------------
def detect_outliers_iqr(df, columns):
    outlier_indices = {}
    for col in columns:
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower, upper = Q1 - 1.5*IQR, Q3 + 1.5*IQR
        outliers = df[(df[col] < lower) | (df[col] > upper)]
        outlier_indices[col] = outliers.index.tolist()
        print(f"🔍 {col}: {len(outliers)} outliers détectés")
    return outlier_indices

numeric_cols = [
    'total_demand','demand_7day_avg',
    'lag_1','lag_2','lag_3','lag_7','lag_14',
    'pct_change_1','pct_change_7',
    'rolling_mean_7','rolling_mean_14','rolling_mean_28'
]

outliers_detected = detect_outliers_iqr(df_full, numeric_cols)

all_outlier_indices = reduce(
    set.union,
    (set(v) for v in outliers_detected.values()),
    set()
)
print(f"🗑️ Total lignes à supprimer (outliers) : {len(all_outlier_indices)}")


🔍 total_demand: 104143 outliers détectés
🔍 demand_7day_avg: 69469 outliers détectés
🔍 lag_1: 73115 outliers détectés
🔍 lag_2: 70446 outliers détectés
🔍 lag_3: 68124 outliers détectés
🔍 lag_7: 60855 outliers détectés
🔍 lag_14: 62983 outliers détectés
🔍 pct_change_1: 74751 outliers détectés
🔍 pct_change_7: 55739 outliers détectés
🔍 rolling_mean_7: 26474 outliers détectés
🔍 rolling_mean_14: 21408 outliers détectés
🔍 rolling_mean_28: 14412 outliers détectés
🗑️ Total lignes à supprimer (outliers) : 293431


In [4]:
# 2. FILTRE magasin n°25
df_full = df_full[df_full['store_id'] == 25].copy()


In [6]:
df=df_full

In [7]:
import pandas as pd
import numpy as np 


# 0) LIBRAIRIES
# ---------------------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping



# Vérification essentielle
required_cols = ["variant_id", "checkout_completed_at",
                 "total_demand", "stockout_occurred"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Colonnes manquantes : {missing}")

# ---------------------------------------------
# 2) AGRÉGATION JOURNALIÈRE + FEATURES
# ---------------------------------------------
df["date"] = df["checkout_completed_at"].dt.date
daily = (df.groupby(["variant_id", "date"])
           .agg(total_demand=("total_demand", "sum"),
                rupture=("stockout_occurred", "max"))
           .reset_index()
           .sort_values(["variant_id", "date"]))

feat_all = []
for vid, g in daily.groupby("variant_id"):
    g = g.copy()
    g["demand"] = g["total_demand"]
    g["rupture_past"] = g["rupture"].shift(1, fill_value=0)
    g["taux_rupture_7j"] = g["rupture"].rolling(7, 1).mean()
    moy = g["demand"].rolling(7, 1).mean()
    g["tendance_demand"] = g["demand"] / (moy + 1)
    feat_all.append(g)

data = pd.concat(feat_all, ignore_index=True)
feature_cols = ["demand_scaled", "rupture_past",
                "taux_rupture_7j", "tendance_demand"]

# ---------------------------------------------
# 3) SÉQUENCES (fenêtre 14 j, horizon 7 j)
# ---------------------------------------------
seq_len, horizon = 14, 7
X, y = [], []
scalers = {}

for vid, g in data.groupby("variant_id"):
    if len(g) < seq_len + horizon:
        continue
    scaler = MinMaxScaler()
    g["demand_scaled"] = scaler.fit_transform(g[["demand"]])
    scalers[vid] = scaler

    for i in range(len(g) - seq_len - horizon + 1):
        X_seq = g.iloc[i:i+seq_len][feature_cols].values
        futur = g.iloc[i+seq_len:i+seq_len+horizon]["rupture"].values
        X.append(X_seq)
        y.append(1 if futur.max() else 0)

X, y = np.array(X), np.array(y)
print(f"{len(X):,} séquences créées | % rupture = {y.mean():.1%}")

# ---------------------------------------------
# 4) MODÈLE LSTM LÉGER
# ---------------------------------------------
model = Sequential([
    LSTM(32, input_shape=(seq_len, len(feature_cols))),
    Dropout(0.3),
    Dense(16, activation="relu"),
    Dense(1,  activation="sigmoid")
])
model.compile(optimizer=Adam(1e-3),
              loss="binary_crossentropy",
              metrics=["accuracy"])

# Gestion du déséquilibre
cw = compute_class_weight("balanced", classes=np.unique(y), y=y)
class_w = {0: cw[0], 1: cw[1]}

early = EarlyStopping(patience=2, monitor="val_loss",
                      restore_best_weights=True)

model.fit(X, y,
          epochs=12,
          batch_size=128,
          validation_split=0.2,
          class_weight=class_w,
          shuffle=True,
          callbacks=[early],
          verbose=1)

# ---------------------------------------------
# 5) PRÉDICTION 1-7 juillet 2025
# ---------------------------------------------
start = pd.Timestamp("2025-07-11").date()
preds = []

for vid, g in data.groupby("variant_id"):
    if vid not in scalers or len(g) < seq_len:
        continue
    last = g.tail(seq_len).copy()
    last["demand_scaled"] = scalers[vid].transform(last[["demand"]])
    X_pred = last[feature_cols].values.reshape(1, seq_len, len(feature_cols))
    p = float(model.predict(X_pred, verbose=0)[0][0])

    if p >= .7:  lvl = "TRES_HAUT"
    elif p >= .5: lvl = "HAUT"
    elif p >= .3: lvl = "MOYEN"
    else:         lvl = "FAIBLE"

    for d in range(horizon):
        preds.append({"variant_id": vid,
                      "date": start + pd.Timedelta(days=d),
                      "rupture_probability": round(p, 4),
                      "risk_level": lvl})

pred_df = pd.DataFrame(preds)


104,362 séquences créées | % rupture = 46.1%


  super().__init__(**kwargs)


Epoch 1/12
[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 17ms/step - accuracy: 0.6297 - loss: 0.6500 - val_accuracy: 0.6746 - val_loss: 0.5904
Epoch 2/12
[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.6486 - loss: 0.6315 - val_accuracy: 0.6746 - val_loss: 0.5901
Epoch 3/12
[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.6508 - loss: 0.6288 - val_accuracy: 0.6755 - val_loss: 0.5868
Epoch 4/12
[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.6528 - loss: 0.6277 - val_accuracy: 0.6779 - val_loss: 0.5869
Epoch 5/12
[1m653/653[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.6514 - loss: 0.6299 - val_accuracy: 0.6768 - val_loss: 0.5892


In [11]:
# ---------------------------------------------
# 5) PRÉDICTION 1-7 juillet 2025
# ---------------------------------------------
start = pd.Timestamp("2025-07-21").date()
preds = []

for vid, g in data.groupby("variant_id"):
    if vid not in scalers or len(g) < seq_len:
        continue
    last = g.tail(seq_len).copy()
    last["demand_scaled"] = scalers[vid].transform(last[["demand"]])
    X_pred = last[feature_cols].values.reshape(1, seq_len, len(feature_cols))
    p = float(model.predict(X_pred, verbose=0)[0][0])

    if p >= .7:  lvl = "TRES_HAUT"
    elif p >= .5: lvl = "HAUT"
    elif p >= .3: lvl = "MOYEN"
    else:         lvl = "FAIBLE"

    for d in range(horizon):
        preds.append({"variant_id": vid,
                      "date": start + pd.Timedelta(days=d),
                      "rupture_probability": round(p, 4),
                      "risk_level": lvl})

pred_df = pd.DataFrame(preds)

In [13]:
pred_df.to_csv("predictions_rup_20__juillet.csv", index=False)
print("✅ Fichier enregistré : predictions_rup_1_7_juillet.csv")

✅ Fichier enregistré : predictions_rup_1_7_juillet.csv
