<a href="https://colab.research.google.com/github/emokid1337/Ai_project/blob/main/code_ai_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import datetime as dt

import plotly.express as px
import plotly.graph_objects as go
from IPython.display import HTML

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score,
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit

print("Python:", sys.version)
print("TensorFlow:", tf.__version__)

# ============================================================
# CONFIG
# ============================================================

# Single dictionary of settings read by every later section of the script.
CONFIG = {
    # Input file and the names of the columns the script requires in it.
    "data_filename": "Bakery.csv",
    "date_col": "DateTime",
    "item_col": "Items",
    "txn_col": "TransactionNo",

    # Optional inclusive date bounds; None leaves that side unfiltered.
    "start_date": None,
    "end_date": None,

    "time_freq": "h",            # hourly aggregation; 'h' rather than 'H'
    "min_count": 0,              # (time bucket, item) groups with fewer rows are dropped

    # Feature toggles.
    "use_hour": True,
    "use_dayofweek": True,
    "use_month": True,
    "use_daypart": True,
    "use_daytype": True,
    "use_item_one_hot": True,

    # When True the model is trained on log1p(y) and predictions are mapped
    # back with expm1 before metrics are computed.
    "target_log1p": True,

    "model_type": "dense",       # "dense" or "lstm"
    "hidden_units": [128, 64],
    "dropout_rate": 0.2,
    "learning_rate": 1e-3,

    "batch_size": 512,
    "epochs": 30,
    "early_stopping_patience": 5,

    # window for the LSTM (number of time steps in one sequence)
    "sequence_length": 24,

    "test_size": 0.15,           # share of rows at the end of the series held out as test
    "project_title": "Прогноз спроса на товары пекарни по дате и времени",
    "html_report_name": "bakery_demand_forecasting_report.html",
}

# ============================================================
# DATA LOADING
# ============================================================

# Fail early with an explicit message when the CSV is missing.
if not os.path.exists(CONFIG["data_filename"]):
    raise FileNotFoundError(
        f"Файл {CONFIG['data_filename']} не найден.\n"
        f"Поместите Bakery.csv рядом с main.py."
    )

df_raw = pd.read_csv(CONFIG["data_filename"])
print("Первые строки исходного датасета:")
print(df_raw.head())
# NOTE: DataFrame.info() prints its report itself and returns None, so this
# line also prints a trailing "None".
print(df_raw.info())

# The transaction, item and timestamp columns are mandatory.
required_cols = [CONFIG["txn_col"], CONFIG["item_col"], CONFIG["date_col"]]
for c in required_cols:
    if c not in df_raw.columns:
        raise ValueError(f"В файле нет обязательного столбца '{c}'.")

df = df_raw.copy()

# Unparseable timestamps become NaT and those rows are then dropped.
df[CONFIG["date_col"]] = pd.to_datetime(df[CONFIG["date_col"]], errors="coerce")
df = df.dropna(subset=[CONFIG["date_col"]])

# Optional inclusive date-range filter.
if CONFIG["start_date"] is not None:
    df = df[df[CONFIG["date_col"]] >= pd.to_datetime(CONFIG["start_date"])]
if CONFIG["end_date"] is not None:
    df = df[df[CONFIG["date_col"]] <= pd.to_datetime(CONFIG["end_date"])]

print("Размер после фильтрации:", df.shape)

# ============================================================
# DEMAND AGGREGATION
# ============================================================

# Floor every timestamp to the configured frequency; "ds" is the time bucket.
freq = CONFIG["time_freq"]
df["ds"] = df[CONFIG["date_col"]].dt.floor(freq)

# Demand "y" is the number of source rows per (time bucket, item) pair.
# Pairs with no rows are simply absent, so there are no explicit zero rows.
grouped = (
    df.groupby(["ds", CONFIG["item_col"]])
      .size()
      .reset_index(name="y")
)
# Keep only pairs that reach the configured minimum count.
grouped = grouped[grouped["y"] >= CONFIG["min_count"]].reset_index(drop=True)

print("Агрегированные данные (по времени и товару):")
print(grouped.head())

# ============================================================
# FEATURE ENGINEERING
# ============================================================

g = grouped.copy()

# Calendar components of the bucket timestamp. "year" and "day" are never
# added to the feature list below, but they stay in the frame.
g["year"] = g["ds"].dt.year
g["month"] = g["ds"].dt.month
g["day"] = g["ds"].dt.day
g["dayofweek"] = g["ds"].dt.dayofweek
g["hour"] = g["ds"].dt.hour

feature_cols_num = []
# Compare case-insensitively so the legacy hourly alias "H" does not silently
# drop the hour feature.
if CONFIG["use_hour"] and freq.lower() == "h":
    feature_cols_num.append("hour")
if CONFIG["use_dayofweek"]:
    feature_cols_num.append("dayofweek")
if CONFIG["use_month"]:
    feature_cols_num.append("month")

# Optional categorical source columns: (column name, CONFIG flag, dummy prefix).
optional_cat_sources = [
    ("Daypart", "use_daypart", "daypart"),
    ("DayType", "use_daytype", "daytype"),
]
group_keys = ["ds", CONFIG["item_col"]]

# Attach the first observed value of each optional column per (bucket, item).
# `df` already carries the floored "ds" column, so it is used directly instead
# of copying the frame and flooring the timestamps a second time.
for source_col, flag, _ in optional_cat_sources:
    if source_col in df.columns and CONFIG[flag]:
        first_values = df.groupby(group_keys)[source_col].first().reset_index()
        g = g.merge(first_values, on=group_keys, how="left")

# One-hot encode the categorical columns; cat_cols collects the dummy names in
# the order daypart -> daytype -> item.
cat_cols = []
for source_col, flag, prefix in optional_cat_sources:
    if CONFIG[flag] and source_col in g.columns:
        dummies = pd.get_dummies(g[source_col], prefix=prefix)
        g = pd.concat([g, dummies], axis=1)
        cat_cols.extend(list(dummies.columns))

if CONFIG["use_item_one_hot"]:
    dummies = pd.get_dummies(g[CONFIG["item_col"]], prefix="item")
    g = pd.concat([g, dummies], axis=1)
    cat_cols.extend(list(dummies.columns))

all_feature_cols = feature_cols_num + cat_cols

for col in feature_cols_num:
    g[col] = g[col].astype("float32")

# Optionally train on log1p(y); predictions are mapped back with expm1 later.
if CONFIG["target_log1p"]:
    g["y_transformed"] = np.log1p(g["y"])
    target_col = "y_transformed"
else:
    target_col = "y"

print("Используемые признаки:", all_feature_cols)
print("Целевая переменная:", target_col)

# ============================================================
# train / val / test split by time (no shuffling)
# ============================================================

# Rows are ordered by time bucket. Several items can share one bucket, so a
# split boundary may fall between rows that have the same "ds".
g = g.sort_values("ds").reset_index(drop=True)

# The last test_size share of the rows is the test set.
n_total = len(g)
n_test = int(n_total * CONFIG["test_size"])
n_train_val = n_total - n_test

df_train_val = g.iloc[:n_train_val].copy()
df_test = g.iloc[n_train_val:].copy()

# for train/val we use TimeSeriesSplit (but effectively only the last split is taken as validation)
tscv = TimeSeriesSplit(n_splits=5)
last_train_index, last_val_index = None, None
# Each iteration overwrites the previous pair, so only the final fold survives.
for train_index, val_index in tscv.split(df_train_val):
    last_train_index, last_val_index = train_index, val_index

df_train = df_train_val.iloc[last_train_index].copy()
df_val = df_train_val.iloc[last_val_index].copy()

print("Размеры выборок:",
      "train:", df_train.shape,
      "val:", df_val.shape,
      "test:", df_test.shape)

# Feature matrices and target vectors as float32 arrays.
X_train = df_train[all_feature_cols].values.astype("float32")
X_val = df_val[all_feature_cols].values.astype("float32")
X_test = df_test[all_feature_cols].values.astype("float32")

y_train = df_train[target_col].values.astype("float32")
y_val = df_val[target_col].values.astype("float32")
y_test = df_test[target_col].values.astype("float32")

# ============================================================
# FEATURE NORMALISATION (StandardScaler)
# ============================================================

# The scaler is fitted on the training rows only and then applied unchanged to
# validation and test. It standardises every column in all_feature_cols,
# the one-hot columns included.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# ============================================================
# Подготовка данных для LSTM (окна по времени)
# ============================================================

def build_lstm_sequences(X, y, seq_len):
    """Slice a feature matrix into overlapping fixed-length windows.

    Window ``i`` covers rows ``i .. i + seq_len - 1`` of ``X`` and is paired
    with ``y[i + seq_len - 1]``, the target of the window's last row.

    Args:
        X: Array-like of shape ``(n_rows, n_features)``.
        y: Array-like holding at least one target per row of ``X``.
        seq_len: Number of consecutive rows per window; must be >= 1.

    Returns:
        Tuple ``(X_seq, y_seq)`` of float32 arrays with shapes
        ``(n_windows, seq_len, n_features)`` and ``(n_windows,)``, where
        ``n_windows = max(n_rows - seq_len + 1, 0)``. With too few rows the
        arrays are empty but keep their full rank, so callers can still read
        ``X_seq.shape[2]``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1 or ``y`` has fewer
            entries than ``X`` has rows.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    X = np.asarray(X, dtype="float32")
    y = np.asarray(y, dtype="float32")
    if len(y) < len(X):
        raise ValueError(
            f"y has {len(y)} entries but X has {len(X)} rows"
        )

    n_windows = max(len(X) - seq_len + 1, 0)
    if n_windows == 0:
        # np.array([]) would collapse to shape (0,); keep the window and
        # feature axes so downstream shape lookups keep working.
        empty_X = np.empty((0, seq_len) + X.shape[1:], dtype="float32")
        return empty_X, np.empty((0,), dtype="float32")

    X_seq = np.stack([X[start:start + seq_len] for start in range(n_windows)])
    # Targets of the last row of every window; copy so the result does not
    # alias the caller's array.
    y_seq = y[seq_len - 1:seq_len - 1 + n_windows].copy()
    return X_seq, y_seq

sequence_length = CONFIG["sequence_length"]

# Pick the model inputs: windowed sequences for the LSTM, the scaled rows
# as-is for any other model type.
if CONFIG["model_type"] == "lstm":
    # temporal order matters here, so the already time-sorted train/val/test sets are used.
    # Windows are built separately for each split, so no window spans two splits.
    X_train_input, y_train_input = build_lstm_sequences(X_train_scaled, y_train, sequence_length)
    X_val_input, y_val_input = build_lstm_sequences(X_val_scaled, y_val, sequence_length)
    X_test_input, y_test_input = build_lstm_sequences(X_test_scaled, y_test, sequence_length)
else:
    X_train_input, y_train_input = X_train_scaled, y_train
    X_val_input, y_val_input = X_val_scaled, y_val
    X_test_input, y_test_input = X_test_scaled, y_test

# ============================================================
# МОДЕЛИ
# ============================================================

def build_dense_model(input_dim: int) -> keras.Model:
    """Build a feed-forward regressor with one linear output unit.

    One ReLU ``Dense`` layer is created per entry of
    ``CONFIG["hidden_units"]``. Each is followed by a ``Dropout`` layer when
    ``CONFIG["dropout_rate"]`` is positive.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        An uncompiled Keras model named ``dense_bakery_demand``.
    """
    model_input = keras.Input(shape=(input_dim,))
    hidden = model_input
    for width in CONFIG["hidden_units"]:
        hidden = layers.Dense(width, activation="relu")(hidden)
        rate = CONFIG["dropout_rate"]
        if rate > 0:
            hidden = layers.Dropout(rate)(hidden)
    prediction = layers.Dense(1, activation="linear")(hidden)
    return keras.Model(
        inputs=model_input,
        outputs=prediction,
        name="dense_bakery_demand",
    )

def build_lstm_model_correct(input_dim: int, sequence_length: int = 24) -> keras.Model:
    """Build a stacked-LSTM regressor over fixed-length feature windows.

    Layer widths come from ``CONFIG["hidden_units"]`` and the dropout rate
    from ``CONFIG["dropout_rate"]``, the same settings the dense builder
    reads. With ``hidden_units=[128, 64]`` and ``dropout_rate=0.2`` the
    network is LSTM(128) -> LSTM(64) -> Dropout(0.2) -> Dense(1).

    Args:
        input_dim: Number of features per time step.
        sequence_length: Number of time steps in each input window.

    Returns:
        An uncompiled Keras model named ``lstm_bakery_demand``.

    Raises:
        ValueError: If ``CONFIG["hidden_units"]`` is empty.
    """
    hidden_units = list(CONFIG["hidden_units"])
    if not hidden_units:
        raise ValueError("CONFIG['hidden_units'] не должен быть пустым для LSTM-модели.")

    inputs = keras.Input(shape=(sequence_length, input_dim))
    x = inputs
    last_idx = len(hidden_units) - 1
    for idx, units in enumerate(hidden_units):
        # Every LSTM except the last must emit the full sequence so the next
        # LSTM layer receives 3-D input.
        x = layers.LSTM(units, return_sequences=idx < last_idx)(x)
    if CONFIG["dropout_rate"] > 0:
        x = layers.Dropout(CONFIG["dropout_rate"])(x)
    outputs = layers.Dense(1, activation="linear")(x)
    return keras.Model(inputs=inputs, outputs=outputs, name="lstm_bakery_demand")

# Build the model selected in CONFIG. The feature count is the last axis of the
# prepared input: axis 1 for the 2-D dense input, axis 2 for the 3-D LSTM input.
if CONFIG["model_type"] == "dense":
    input_dim = X_train_input.shape[1]
    model = build_dense_model(input_dim)
elif CONFIG["model_type"] == "lstm":
    input_dim = X_train_input.shape[2]
    model = build_lstm_model_correct(input_dim, sequence_length=sequence_length)
else:
    raise ValueError("model_type должен быть 'dense' или 'lstm'.")

# MSE loss with MAE reported as an extra metric; both are measured in the
# (possibly log1p-transformed) target space.
optimizer = keras.optimizers.Adam(learning_rate=CONFIG["learning_rate"])
model.compile(optimizer=optimizer, loss="mse", metrics=["mae"])

print("Сводка модели:")
model.summary()

# ============================================================
# TRAINING
# ============================================================

# Stop once val_loss has not improved for `early_stopping_patience` epochs and
# roll the model back to the weights of the best epoch.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=CONFIG["early_stopping_patience"],
        restore_best_weights=True,
    )
]

history = model.fit(
    X_train_input,
    y_train_input,
    validation_data=(X_val_input, y_val_input),
    epochs=CONFIG["epochs"],
    batch_size=CONFIG["batch_size"],
    callbacks=callbacks,
    verbose=1,
)

# ============================================================
# EVALUATION AND METRICS
# ============================================================

# Flatten the (n, 1) model output into a 1-D prediction vector.
y_test_pred = model.predict(X_test_input).flatten()

# for the LSTM y_test_input has already been shifted to the window ends, and those are the values to use
y_test_model_space = y_test_input

# Undo the log1p transform so metrics are reported in sales counts.
if CONFIG["target_log1p"]:
    y_test_real = np.expm1(y_test_model_space)
    y_pred_real = np.expm1(y_test_pred)
else:
    y_test_real = y_test_model_space
    y_pred_real = y_test_pred

def calculate_all_metrics(y_true, y_pred):
    """Compute the regression metrics reported for the test set.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        Dict with keys ``"MAE"``, ``"MSE"``, ``"RMSE"``, ``"MAPE"`` and
        ``"R2"``, in that order. MAPE is scaled to percent (the scikit-learn
        fraction multiplied by 100).
    """
    # Compute MSE once and derive RMSE from it instead of calling
    # mean_squared_error twice on the same arrays.
    mse = mean_squared_error(y_true, y_pred)
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred) * 100,
        "R2": r2_score(y_true, y_pred),
    }

metrics = calculate_all_metrics(y_test_real, y_pred_real)
print("Метрики на тесте:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

# ============================================================
# COLLECT RESULTS INTO df_test_results (by the last time steps)
# ============================================================

# For Dense it is simple: one to one. For LSTM predictions must be matched to the last dates.
if CONFIG["model_type"] == "dense":
    df_test_results = df_test.copy()
    df_test_results["y_true"] = y_test_real
    df_test_results["y_pred"] = y_pred_real
else:
    # for the LSTM X_test_input[i] corresponds to the window ending at df_test.iloc[i + seq_len - 1]
    seq_len = sequence_length
    # The first seq_len - 1 test rows have no complete window, so they are dropped.
    df_test_trimmed = df_test.iloc[seq_len - 1:].copy()
    df_test_results = df_test_trimmed.copy()
    df_test_results["y_true"] = y_test_real
    df_test_results["y_pred"] = y_pred_real

# Persist the per-row actual and predicted values next to their features.
df_test_results.to_csv("bakery_test_predictions.csv", index=False)

# ============================================================
# VISUALISATIONS
# ============================================================

# 1) Training curve
# history.history holds one value per epoch for every tracked quantity.
hist_df = pd.DataFrame(history.history)
hist_df["epoch"] = np.arange(len(hist_df))

fig_loss = px.line(
    hist_df,
    x="epoch",
    y=["loss", "val_loss"],
    title="Динамика функции потерь (MSE) на обучении/валидации",
    labels={"value": "Loss", "variable": "Тип", "epoch": "Эпоха"},
)

# 2) Top item: actual vs forecast
# Items ranked by total actual sales in the test period, highest first.
top_items = (
    df_test_results.groupby(CONFIG["item_col"])["y_true"]
    .sum()
    .sort_values(ascending=False)
    .index
    .tolist()
)

focus_item = top_items[0] if len(top_items) > 0 else None

if focus_item is not None:
    df_item = df_test_results[df_test_results[CONFIG["item_col"]] == focus_item]
    fig_item = go.Figure()
    fig_item.add_trace(go.Scatter(
        x=df_item["ds"],
        y=df_item["y_true"],
        name=f"{focus_item} (факт)",
        mode="lines+markers"
    ))
    fig_item.add_trace(go.Scatter(
        x=df_item["ds"],
        y=df_item["y_pred"],
        name=f"{focus_item} (прогноз)",
        mode="lines+markers"
    ))
    fig_item.update_layout(
        title=f"Факт vs прогноз спроса для товара: {focus_item}",
        xaxis_title="Время",
        yaxis_title="Количество продаж"
    )
else:
    # No items in the test results: keep an empty figure so later code still works.
    fig_item = go.Figure()

# 3) Average demand by hour
if CONFIG["time_freq"] == "h":
    # Overwrites the "hour" column inherited from the feature frame with the
    # hour taken directly from "ds".
    df_test_results["hour"] = df_test_results["ds"].dt.hour
    df_hour = (
        df_test_results.groupby("hour")[["y_true", "y_pred"]]
        .mean()
        .reset_index()
    )
    fig_hour = go.Figure()
    fig_hour.add_trace(go.Bar(
        x=df_hour["hour"],
        y=df_hour["y_true"],
        name="Факт"
    ))
    fig_hour.add_trace(go.Bar(
        x=df_hour["hour"],
        y=df_hour["y_pred"],
        name="Прогноз"
    ))
    fig_hour.update_layout(
        barmode="group",
        title="Средний спрос по часам суток: факт vs прогноз",
        xaxis_title="Час",
        yaxis_title="Среднее количество продаж"
    )
else:
    # Not hourly data: keep an empty placeholder figure.
    fig_hour = go.Figure()

# 4) Error distribution
# Residuals are actual minus forecast, in sales-count space.
fig_errors = px.histogram(
    x=y_test_real - y_pred_real,
    title="Распределение ошибок прогнозирования",
    labels={"x": "Ошибка (факт - прогноз)"}
)

# 5) Analysis of the top-5 items
# Two traces per item: solid line for actual values, dashed line for forecast.
fig_all_items = go.Figure()
for item in top_items[:5]:
    df_i = df_test_results[df_test_results[CONFIG["item_col"]] == item]
    fig_all_items.add_trace(go.Scatter(
        x=df_i["ds"],
        y=df_i["y_true"],
        name=f"{item} (факт)",
        mode="lines"
    ))
    fig_all_items.add_trace(go.Scatter(
        x=df_i["ds"],
        y=df_i["y_pred"],
        name=f"{item} (прогноз)",
        mode="lines",
        line=dict(dash="dash")
    ))

fig_all_items.update_layout(
    title="Факт vs прогноз по топ‑5 товарам",
    xaxis_title="Время",
    yaxis_title="Количество продаж"
)

# (optional) show the figures when the script is run
fig_loss.show()
fig_item.show()
fig_hour.show()
fig_errors.show()
fig_all_items.show()

# ============================================================
# HTML‑ОТЧЁТ
# ============================================================

def fig_to_html_div(fig):
    """Render a figure as an embeddable HTML fragment.

    Calls ``fig.to_html`` with ``full_html=False`` and
    ``include_plotlyjs=False``, so the fragment is not a full page and does
    not bundle plotly.js; the page that embeds it must load plotly.js.

    Args:
        fig: Object exposing a Plotly-style ``to_html`` method.

    Returns:
        Whatever ``fig.to_html`` returns for those options.
    """
    fragment = fig.to_html(full_html=False, include_plotlyjs=False)
    return fragment

# The first embedded figure carries the plotly.js <script> tag itself.
# include_plotlyjs="cdn" makes plotly emit a CDN reference for the plotly.js
# version bundled with the installed plotly package. The previous
# "plotly-latest.min.js" URL is frozen at plotly.js v1.58.5 and does not match
# figures produced by current plotly releases. loss_div must stay the first
# figure in the page body, because the later fragments rely on that script.
loss_div = fig_loss.to_html(include_plotlyjs="cdn", full_html=False)
item_div = fig_to_html_div(fig_item)
hour_div = fig_to_html_div(fig_hour)
errors_div = fig_to_html_div(fig_errors)
all_items_div = fig_to_html_div(fig_all_items)

html_title = CONFIG["project_title"]
# Test metrics rendered as an HTML bullet list with four decimals.
html_metrics = "<ul>" + "".join(
    f"<li><b>{k}</b>: {v:.4f}</li>" for k, v in metrics.items()
) + "</ul>"

html_text_intro = """
<h2>Описание задачи</h2>
<p>
Прогнозирование количества продаж товаров пекарни по временным и календарным признакам.
</p>
"""

html_text_features = f"""
<h2>Инженерия признаков и нормализация</h2>
<ul>
  <li>Календарные признаки: месяц, день недели, час.</li>
  <li>Категориальные: Daypart, DayType, товар.</li>
  <li>One‑hot кодирование категориальных признаков.</li>
  <li>StandardScaler для всех признаков (ноль‑среднее, единичное стандартное отклонение).</li>
  <li>Цель: агрегированное количество продаж за интервал ({CONFIG["time_freq"]}), лог‑трансформация log1p при обучении.</li>
</ul>
"""

html_text_model = f"""
<h2>Модель и валидация</h2>
<ul>
  <li>Тип модели: {CONFIG["model_type"].upper()} (Dense или LSTM с оконным представлением).</li>
  <li>Валидация по времени: TimeSeriesSplit (без перемешивания).</li>
  <li>Ранняя остановка по val_loss с patience={CONFIG["early_stopping_patience"]}.</li>
</ul>
"""

# Full page. Doubled braces in the <style> block are literal CSS braces inside
# the f-string.
html_body = f"""
<html>
<head>
  <meta charset="utf-8">
  <title>{html_title}</title>
  <style>
    body {{ font-family: Arial, sans-serif; margin: 20px; }}
    h1 {{ color: #333; }}
    h2 {{ color: #444; margin-top: 30px; }}
  </style>
</head>
<body>
  <h1>{html_title}</h1>
  {html_text_intro}
  {html_text_features}
  {html_text_model}
  <h2>Качество модели на тесте</h2>
  {html_metrics}
  <h2>График обучения (loss)</h2>
  {loss_div}
  <h2>Прогноз по популярному товару</h2>
  {item_div}
  <h2>Средний спрос по часам суток</h2>
  {hour_div}
  <h2>Распределение ошибок</h2>
  {errors_div}
  <h2>Факт vs прогноз для топ‑5 товаров</h2>
  {all_items_div}
</body>
</html>
"""

report_path = CONFIG["html_report_name"]
with open(report_path, "w", encoding="utf-8") as f:
    f.write(html_body)

print(f"HTML‑отчёт сохранён в файл: {report_path}")


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
TensorFlow: 2.19.0
Первые строки исходного датасета:
   TransactionNo          Items             DateTime  Daypart  DayType
0              1          Bread  2016-10-30 09:58:11  Morning  Weekend
1              2   Scandinavian  2016-10-30 10:05:34  Morning  Weekend
2              2   Scandinavian  2016-10-30 10:05:34  Morning  Weekend
3              3  Hot chocolate  2016-10-30 10:07:57  Morning  Weekend
4              3            Jam  2016-10-30 10:07:57  Morning  Weekend
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20507 entries, 0 to 20506
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TransactionNo  20507 non-null  int64 
 1   Items          20507 non-null  object
 2   DateTime       20507 non-null  object
 3   Daypart        20507 non-null  object
 4   DayType        20507 non-null  object
dtypes: int64(1), object(4)
memory usage: 801.2+ KB
N

Epoch 1/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 79ms/step - loss: 1.1877 - mae: 0.8046 - val_loss: 0.6550 - val_mae: 0.4297
Epoch 2/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.5159 - mae: 0.5195 - val_loss: 0.3724 - val_mae: 0.3319
Epoch 3/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.3437 - mae: 0.4228 - val_loss: 0.1366 - val_mae: 0.2679
Epoch 4/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.3200 - mae: 0.4025 - val_loss: 0.1230 - val_mae: 0.2638
Epoch 5/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 0.2759 - mae: 0.3758 - val_loss: 0.1302 - val_mae: 0.2566
Epoch 6/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.2385 - mae: 0.3582 - val_loss: 0.1331 - val_mae: 0.2625
Epoch 7/30
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 0

HTML‑отчёт сохранён в файл: bakery_demand_forecasting_report.html
