# PBM: кластеризация на эталоне и прогноз по префиксу

Этот ноутбук строит карту поведения и кластеры на эталонной выборке скважин, а затем относит новые скважины к кластерам и прогнозирует хвост профиля только по соседям выбранного кластера.

In [None]:
import os
import glob
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tools.preprocessing import preprocess_profiles, PreprocConfig
from tools.manifold import embed_umap_euclid, ManifoldConfig
from tools.clustering import cluster_hdbscan, ClusterConfig
from tools.forecast import (
    build_prefix_scaled_channel,
    make_matrices,
    vote_cluster_by_prefix,
    knn_forecast,
    evaluate_forecasts,
)

# Notebook-wide display defaults: cap how many rows/columns pandas prints
# and select the matplotlib style sheet used by every plot below.
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 40)
plt.style.use("seaborn-v0_8")

## Шаг 0. Конфигурация входных данных и параметров

In [None]:
# Folder that is scanned for "*.csv" files in the loading step.
data_folder = "data/wells"

# To pin a specific list of wells to forecast, list them here.
forecast_wells_manual = []  # e.g. ["WELL_001", "WELL_002"]
holdout_fraction = 0.10  # if the list is empty, this fraction of wells is drawn at random for forecasting

T_total = 100  # horizon after preprocessing
T_pref = 20    # prefix length used for the forecast

# Settings passed to the project's UMAP and HDBSCAN wrappers.
umap_cfg = ManifoldConfig(n_neighbors=30, min_dist=0.05, n_components=2, random_state=43)
cluster_cfg = ClusterConfig(min_cluster_size=50, min_samples=12)
K_vote = 7      # number of neighbours used to vote for a cluster by prefix
K_forecast = 15  # number of neighbours used to forecast the tail

# Seeded generator so the random holdout draw is reproducible between runs.
rng = np.random.default_rng(42)

## Шаг 1. Загрузка и первичная очистка данных

In [None]:
# Collect every CSV in the data folder; stop early when there is nothing to load.
all_csv = sorted(glob.glob(os.path.join(data_folder, "*.csv")))
if not all_csv:
    raise FileNotFoundError(f"В папке {data_folder!r} не найдено CSV-файлов.")

# Stack all files into one table ordered by well and date.
df_raw = pd.concat(
    (pd.read_csv(csv_path) for csv_path in all_csv),
    ignore_index=True,
)
df_raw = df_raw.sort_values(["well_name", "date"])
df_raw = df_raw.reset_index(drop=True)

# Keep only rows where oil, gas and water are all non-negative
# (rows with a missing value in any of them fail the comparison and are dropped too).
oil_ok = df_raw["oil"].ge(0)
gas_ok = df_raw["gas"].ge(0)
water_ok = df_raw["water"].ge(0)
df_raw = df_raw[oil_ok & gas_ok & water_ok]
print(f"Raw dataframe shape: {df_raw.shape}")
df_raw.head()

## Шаг 1а. Предобработка профилей

In [None]:
# Run the project preprocessing on the raw table with the requested horizon T_total.
preproc_cfg = PreprocConfig(T=T_total)
out = preprocess_profiles(df_raw, preproc_cfg)

# Unpack the pieces of the result dict that the later cells use.
panel_long = out["panel_long"]
X_tensor = out["X"]
wells_used = out["wells_used"]
tensor_channels = out["tensor_channels"]
# Re-read the horizon from the returned config so later cells use the value preprocessing reports.
T_total = int(out["config"]["T"])

print(f"Всего скважин после фильтрации: {len(wells_used)}")
panel_long.head()

## Шаг 2. Деление на эталонную и прогнозную выборки

In [None]:
# Split the preprocessed wells into a forecast (holdout) set and a reference set.
known_wells = set(wells_used)
if forecast_wells_manual:
    requested_wells = set(forecast_wells_manual)
    # A manually requested well may be absent from wells_used (e.g. dropped by
    # preprocessing). Keeping it would inflate the count printed below and make
    # later lookups by name in wells_used fail, so report it and leave it out.
    missing_wells = sorted(requested_wells - known_wells)
    if missing_wells:
        print(
            "Предупреждение: скважины из forecast_wells_manual отсутствуют "
            f"после предобработки и пропущены: {missing_wells}"
        )
    forecast_wells = sorted(requested_wells & known_wells)
else:
    if len(wells_used) == 0:
        raise ValueError("Нет скважин для разделения на эталонную и прогнозную выборки.")
    # At least one well is always held out, even for a tiny data set.
    n_forecast = max(1, int(len(wells_used) * holdout_fraction))
    forecast_wells = sorted(rng.choice(wells_used, size=n_forecast, replace=False))

# Set lookup keeps the split linear in the number of wells.
forecast_set = set(forecast_wells)
reference_wells = [w for w in wells_used if w not in forecast_set]

print(f"Reference wells: {len(reference_wells)}")
print(f"Forecast wells: {len(forecast_wells)}")
if len(reference_wells) < 10:
    print("Предупреждение: очень мало эталонных скважин — проверьте параметры выборки.")

## Шаг 3. Manifold UMAP на эталонных скважинах

In [None]:
# Positions (within wells_used) of the reference wells. A set makes the
# membership test O(1) per well; the explicit dtype keeps an empty result
# an integer index array instead of numpy's default float.
reference_set = set(reference_wells)
ref_indices = np.array(
    [i for i, w in enumerate(wells_used) if w in reference_set],
    dtype=int,
)
if ref_indices.size == 0:
    raise ValueError("Нет эталонных скважин для построения manifold.")

# Rows of the preprocessed tensor that belong to the reference wells.
X_ref = X_tensor[ref_indices]

# Fit the embedding on the reference wells only.
# NOTE(review): umap_cfg.channels is not set in the configuration cell —
# presumably it comes from ManifoldConfig's defaults; confirm.
Z_ref, umap_model = embed_umap_euclid(
    X_ref,
    tensor_channels,
    channels=umap_cfg.channels,
    n_neighbors=umap_cfg.n_neighbors,
    min_dist=umap_cfg.min_dist,
    n_components=umap_cfg.n_components,
    random_state=umap_cfg.random_state,
)

# Well names in the same order as the rows of Z_ref.
wells_ref = [wells_used[i] for i in ref_indices]
df_umap = pd.DataFrame({"well_name": wells_ref, "x": Z_ref[:, 0], "y": Z_ref[:, 1]})
df_umap.head()

In [None]:
# Scatter plot of the first two embedding coordinates of the reference wells.
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(df_umap["x"], df_umap["y"], s=20, alpha=0.7)
ax.set_title("UMAP на эталонной выборке")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
ax.grid(True, alpha=0.2)
plt.show()

## Шаг 4. Кластеризация HDBSCAN

In [None]:
# Cluster the reference embedding and report the quality scores returned by the helper.
cluster_res = cluster_hdbscan(Z_ref, wells_ref, cfg=cluster_cfg)
df_map = cluster_res["df_map"]

silhouette_value = cluster_res["silhouette"]
dbcv_value = cluster_res["dbcv"]
print(f"Silhouette={silhouette_value:.3f}, DBCV={dbcv_value:.3f}")
df_map.head()

In [None]:
# Number of reference wells per cluster label, largest clusters first.
wells_per_cluster = df_map.groupby("cluster").size()
cluster_counts = wells_per_cluster.rename("count").reset_index()
cluster_counts = cluster_counts.sort_values("count", ascending=False)
cluster_counts.head()

## Шаг 5. Префикс-канал и матрицы для прогноза

In [None]:
# Horizon arguments shared by both helper calls below.
horizon_kwargs = {"T": T_total, "T_pref": T_pref}

# Add the prefix-scaled oil-rate channel to the long panel.
panel_long_pref = build_prefix_scaled_channel(
    panel_long,
    wells_used,
    rate_col="r_oil_s",
    out_col="r_oil_pref_norm",
    **horizon_kwargs,
)

# Build the prefix features and the suffix/full target matrices for all wells.
matrices = make_matrices(
    panel_long_pref,
    wells_used,
    channel="r_oil_pref_norm",
    target_col="r_oil_s",
    **horizon_kwargs,
)
X_pref, Y_suffix_true, Y_full = matrices

print(f"X_pref shape: {X_pref.shape}, Y_full shape: {Y_full.shape}")

## Шаг 6. Отнесение новых скважин к кластерам по префиксу

In [None]:
# Positions (within wells_used) of the wells to forecast; the set keeps the
# membership test O(1) per well instead of scanning the list each time.
forecast_set = set(forecast_wells)
forecast_indices = [i for i, w in enumerate(wells_used) if w in forecast_set]

# Assign each forecast well to a cluster of the reference map by voting among
# its K_vote nearest neighbours in prefix space.
vote_df, vote_details = vote_cluster_by_prefix(
    X_pref,
    wells_used,
    df_map,
    target_indices=forecast_indices,
    K_vote=K_vote,
    allow_noise=True,
)
vote_df

In [None]:
# One row per well, in wells_used order, tagged as reference or forecast.
cluster_summary = pd.DataFrame({"well_name": wells_used})
cluster_summary["role"] = np.where(
    cluster_summary["well_name"].isin(reference_wells), "reference", "forecast"
)

# Attach the reference clustering and the prefix vote.
# validate="many_to_one" rejects duplicate well names on the right-hand side:
# later cells index this table by position in wells_used, so a merge that
# duplicated rows would silently misalign it.
reference_columns = df_map[["well_name", "cluster", "prob"]].rename(
    columns={"cluster": "cluster_ref", "prob": "cluster_prob"}
)
cluster_summary = cluster_summary.merge(
    reference_columns, on="well_name", how="left", validate="many_to_one"
)
cluster_summary = cluster_summary.merge(
    vote_df, on="well_name", how="left", validate="many_to_one"
)

# Nullable integers keep "no cluster" as <NA> instead of turning labels into floats.
for col in ("cluster_ref", "cluster_vote"):
    cluster_summary[col] = pd.to_numeric(
        cluster_summary[col], errors="coerce"
    ).astype("Int64")

# Reference wells keep their own cluster/probability; forecast wells take the
# voted cluster and vote confidence. Series.where does this selection column-wise.
is_reference = cluster_summary["role"].eq("reference")
cluster_summary["cluster_final"] = cluster_summary["cluster_ref"].where(
    is_reference, cluster_summary["cluster_vote"]
)
cluster_summary["cluster_final"] = pd.to_numeric(
    cluster_summary["cluster_final"], errors="coerce"
).astype("Int64")
cluster_summary["confidence"] = cluster_summary["cluster_prob"].where(
    is_reference, cluster_summary["vote_conf"]
)
cluster_summary.head()

In [None]:
# Well counts per (role, final cluster); dropna=False keeps the group of wells with no cluster assigned.
cluster_summary.groupby(["role", "cluster_final"], dropna=False).size()

## Шаг 7. Пулы соседей по кластерам

In [None]:
# Wells whose whole horizon is observed (every value in the row is finite).
mask_full = np.isfinite(Y_full).all(axis=1)

# Only reference wells may serve as neighbours. In holdout mode the forecast
# wells also have complete profiles; admitting them to the pools would leak
# their true tails into the forecasts that are evaluated afterwards.
reference_set = set(reference_wells)
mask_reference = np.array([w in reference_set for w in wells_used], dtype=bool)
train_idx = np.where(mask_full & mask_reference)[0]

# Final cluster label per well, addressed by position in wells_used.
final_clusters = cluster_summary["cluster_final"]

# cluster label -> positions of the training wells in that cluster.
cluster_to_train = {}
for idx in train_idx:
    cluster_val = final_clusters.iloc[idx]
    if pd.notna(cluster_val):
        cluster_to_train.setdefault(int(cluster_val), []).append(int(idx))

# well position -> training pool of its final cluster; wells with no cluster
# or with an empty pool get no entry.
candidate_pools = {}
for idx, cluster_val in enumerate(final_clusters):
    if pd.isna(cluster_val):
        continue
    pool = cluster_to_train.get(int(cluster_val), [])
    if pool:
        candidate_pools[int(idx)] = pool

print(f"Всего тренировочных скважин с полным горизонтом: {len(train_idx)}")

## Шаг 8. Прогноз хвоста по соседям выбранного кластера

In [None]:
# Forecast targets: the full-horizon training wells together with the forecast wells.
target_indices = sorted({*train_idx.tolist(), *forecast_indices})

# Forecast each target's tail from neighbours in its candidate pool.
forecast_result = knn_forecast(
    X_pref,
    Y_full,
    T_pref=T_pref,
    K=K_forecast,
    target_indices=target_indices,
    candidate_pools=candidate_pools,
)
Y_pred_knn, knn_info = forecast_result

# Compare the predicted tails with the true ones and print the metrics as JSON.
metrics_knn = evaluate_forecasts(Y_suffix_true, Y_pred_knn)
print("KNN metrics:")
print(json.dumps(metrics_knn, ensure_ascii=False, indent=2))

## Шаг 9. Просмотр прогнозов для новых скважин

In [None]:
# One column per forecast step, labelled m{T_pref + 1} .. m{T_total}.
suffix_cols = ["m" + str(step) for step in range(T_pref + 1, T_total + 1)]

# Predicted tails of the forecast wells, with the well name as the first column.
forecast_names = [wells_used[i] for i in forecast_indices]
forecast_table = pd.DataFrame(data=Y_pred_knn[forecast_indices], columns=suffix_cols)
forecast_table.insert(loc=0, column="well_name", value=forecast_names)
forecast_table.head()

In [None]:
def plot_forecast(well_name: str):
    """Plot the observed prefix, the true suffix (if any) and the kNN forecast of one well.

    Reads the notebook-level ``wells_used``, ``Y_full``, ``Y_pred_knn``,
    ``T_total`` and ``T_pref``.

    Args:
        well_name: name of a well present in ``wells_used``.

    Raises:
        ValueError: if ``well_name`` is not in ``wells_used``.
    """
    if well_name not in wells_used:
        raise ValueError("Unknown well name")
    row = wells_used.index(well_name)

    steps = np.arange(T_total)
    prefix_steps, suffix_steps = steps[:T_pref], steps[T_pref:]
    observed = Y_full[row]
    observed_tail = observed[T_pref:]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(prefix_steps, observed[:T_pref], label="observed prefix", linewidth=2)
    # The true suffix is drawn only when at least one of its values is finite.
    if np.isfinite(observed_tail).any():
        ax.plot(suffix_steps, observed_tail, label="true suffix", alpha=0.6)
    ax.plot(suffix_steps, Y_pred_knn[row], label="knn forecast", linewidth=2)
    # Dashed line at the last prefix step marks where the forecast begins.
    ax.axvline(T_pref - 1, color="gray", linestyle="--", linewidth=1)
    ax.set_title(f"Well: {well_name}")
    ax.set_xlabel("month index")
    ax.set_ylabel("oil rate")
    ax.legend()
    ax.grid(True, alpha=0.2)
    plt.show()

# Example: plot the first forecast well that is actually present in wells_used.
# forecast_wells can contain manually requested names that are missing from
# wells_used, and plot_forecast raises ValueError for those, so skip them.
first_plottable = next((w for w in forecast_wells if w in wells_used), None)
if first_plottable is not None:
    plot_forecast(first_plottable)

In [None]:
# Show the vote details and the neighbours used for the first forecast well, if any.
if forecast_indices:
    idx = forecast_indices[0]
    well_name = wells_used[idx]
    print(f"Cluster vote for {well_name}:")
    print(vote_details.get(idx, {}))
    # The blank line must be written as the "\n" escape: a raw line break inside
    # a single-quoted string literal is a syntax error.
    print("\nNeighbors used in forecast:")
    print(knn_info["neighbors"].get(idx, {}))