In [1]:
import pandas as pd
import glob
import numpy as np
import os
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# NOTE(review): looks like the opt-in switch for SciPy's array API support; presumably it
# has to be set before SciPy is first imported — confirm against the SciPy docs.
os.environ["SCIPY_ARRAY_API"] = "1"

In [2]:
# Folder that holds the per-file CSV exports
folder = "data"

# Collect every CSV file. glob returns matches in arbitrary (filesystem-dependent)
# order, so sort them to make the concatenation order reproducible between runs.
all_csv = sorted(glob.glob(os.path.join(folder, "*.csv")))
if not all_csv:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder!r}")

# Read every file and stack them into a single frame
df_0 = pd.concat((pd.read_csv(f) for f in all_csv), ignore_index=True)
print(f"Считано файлов: {len(all_csv)}")
# Rename the columns to short names for convenience
df_0 = df_0.rename(columns={"BBLS_OIL_COND":"oil", "MCF_GAS": 'gas', "BBLS_WTR":"water", "API_WellNo":"well_name", "RptDate":"date", "DAYS_PROD":"days_prod"})
# Convert the report date column (RptDate, renamed to "date") to datetime
df_0["date"] = pd.to_datetime(df_0["date"])

Считано файлов: 4485


In [3]:
# Drop the Lease_Unit and Formation columns, then report the resulting shape
df = df_0.drop(["Lease_Unit", "Formation"], axis=1)
print(df.shape)

(1212779, 6)


In [4]:
# Within each well the rows must follow strict chronological order;
# ignore_index=True renumbers the rows 0..n-1 after sorting.
df = df.sort_values(["well_name", "date"], ignore_index=True)

In [5]:
# Some records carry negative rates. Drop them, since only producing wells are analysed.
# A row is kept only when oil, gas and water are all >= 0 (a missing value fails the
# comparison, so such rows are dropped as well).
df = df.loc[df[["oil", "gas", "water"]].ge(0).all(axis=1)]

In [6]:
# Display the filtered frame (the rich repr is truncated to the first and last rows)
df

Unnamed: 0,well_name,date,oil,gas,water,days_prod
0,25-003-05000-00-00,1991-02-28,0,0,0,0.0
1,25-003-05001-00-00,1991-01-31,0,0,0,0.0
2,25-003-05001-00-00,1991-03-31,0,0,0,0.0
3,25-003-05001-00-00,1991-04-30,0,0,0,0.0
4,25-003-05001-00-00,1991-05-31,0,0,0,0.0
...,...,...,...,...,...,...
1212774,25-111-21271-00-00,2025-03-31,0,0,0,0.0
1212775,25-111-21271-00-00,2025-04-30,0,0,0,0.0
1212776,25-111-21271-00-00,2025-05-31,60,16,5505,16.0
1212777,25-111-21271-00-00,2025-06-30,0,0,0,0.0


In [7]:
# Step 1: Preprocessing
# NOTE(review): later cells read `out` (described there as the result of
# preprocess_profiles), but no call that assigns it is visible in this cell —
# confirm the call was not lost, otherwise a fresh-kernel run will fail.
from pbm.preprocess import PreprocConfig, preprocess_profiles


Обнаружен df в окружении. Запускаю предобработку с дефолтным конфигом...
Preprocess complete.
  В итог попало скважин: 2142
  Отброшено (по причинам):


Unnamed: 0,reason,count
0,no_start_detected,1930
1,too_short(<12),51


             well_name       date  oil_sum  water_sum  gas_sum  days_sum  \
0   25-003-05007-00-00 1986-01-01    252.0     2552.0      0.0      31.0   
1   25-003-05007-00-00 1986-02-01    239.0     2295.0      0.0      28.0   
2   25-003-05007-00-00 1986-03-01    253.0     2697.0      0.0      31.0   
3   25-003-05007-00-00 1986-04-01    249.0     1883.0      0.0      30.0   
4   25-003-05007-00-00 1986-05-01    278.0     2631.0      0.0      31.0   
5   25-003-05007-00-00 1986-06-01    244.0     2086.0      0.0      30.0   
6   25-003-05007-00-00 1986-07-01    271.0     2322.0      0.0      31.0   
7   25-003-05007-00-00 1986-09-01    195.0     1625.0      0.0       0.0   
8   25-003-05007-00-00 1986-10-01    224.0     1867.0      0.0       0.0   
9   25-003-05007-00-00 1986-11-01    209.0     1742.0      0.0       0.0   
10  25-003-05007-00-00 1986-12-01    224.0     1867.0      0.0       0.0   
11  25-003-05007-00-00 1987-01-01    221.0     2066.0      0.0       0.0   

    valid_d

In [8]:
# Steps 2-3: features and manifold
from pbm.features import compute_side_features, scale_features
from pbm.embedding import ManifoldConfig, embed_umap_euclid, embed_umap_fastdtw


Side features shape: (2142, 16)


Recompute DTW for 64583 pairs (radius=6): 100%|██████████| 64583/64583 [12:43<00:00, 84.55it/s] 


DTW-UMAP: (2142, 2) subset size: 2142


Unnamed: 0,well_name,x,y
0,25-003-05007-00-00,7.190036,9.426781
1,25-003-05009-00-00,7.079275,10.082273
2,25-003-05057-00-00,7.511279,10.024635
3,25-003-05067-00-00,6.941536,9.197872
4,25-003-05068-00-00,7.686371,10.157837


In [9]:
# Step 4: clustering and anomalies
from pbm.clustering import (cluster_hdbscan, cluster_gmm_bic, lof_anomaly_scores,
    distance_to_medoid, build_cluster_prototypes, summarize_clusters, assign_anomaly_scores, ClusterConfig)


Silhouette: 0.6330978870391846  DBCV: nan
   cluster  size     share  prob_median
0       -1   662  0.309057     0.000000
1        0    97  0.045285     1.000000
2        1    54  0.025210     1.000000
3        2    85  0.039683     1.000000
4        3    96  0.044818     1.000000
5        4   186  0.086835     0.876985
6        5   120  0.056022     0.942701
7        6   338  0.157796     0.815296
8        7   169  0.078898     0.903724
9        8   335  0.156396     1.000000


In [10]:
# Step 5: visualisation and reports
from pbm.export import (save_pbm_map, save_cluster_distribution_plot,
    save_cluster_prototype_plots, export_csv_summaries, build_html_report)


Готов отчёт: ./pbm_report_exports/PBM_report.html


In [11]:
# Build the PBM report artefacts (plots, CSV summaries, HTML page) under out_dir.
# NOTE(review): `out`, `res`, `Z_dtw` and `protos` are not assigned anywhere in the
# visible cells — confirm they exist before this cell when run on a fresh kernel.
panel_long = out["panel_long"]
T = out["config"]["T"]
df_map = res["df_map"]  # from HDBSCAN/GMM
out_dir = "./pbm_report_exports"

map_png   = save_pbm_map(Z_dtw, df_map, out_dir)              # PBM map
sizes_png = save_cluster_distribution_plot(df_map, out_dir)   # cluster sizes
proto_pngs = save_cluster_prototype_plots(
    panel_long, df_map, protos, channels=("r_oil_s","wc","gor","r_oil_norm"), T=T, out_dir=out_dir
)

# Per-cluster summary table and CSV exports (top_anoms=50 is passed to the exporter)
summary = summarize_clusters(df_map)
csvs = export_csv_summaries(df_map, summary, out_dir, top_anoms=50)

# Assemble the HTML report from the artefacts above and print the value returned by the builder
report = build_html_report(out_dir, map_png, sizes_png, proto_pngs, df_map, summary, title="PBM Report")
print("Отчёт:", report)

Отчёт: ./pbm_report_exports/PBM_report.html



# Шаг 6. Прогноз профиля по префиксу (20 → 100)

**Цель:** прогнозировать месяцы 21–100 для каждой скважины, используя только первые 20 месяцев и без утечки информации.

Подходы в этом шаге:
1. **KNN-достройка по префиксу** с амплитудным выравниванием соседей (по МНК на префиксе).
2. **Мультивыходная ElasticNet-регрессия** на компактных признаках префикса.

Метрики: RMSE и sMAPE на окне 21–100.  
Отчёты и артефакты сохраняются в `./forecast_exports`.


In [16]:
# Forecast helper functions
from pbm.forecast import (build_prefix_scaled_channel, make_matrices, knn_forecast,
    multioutput_forecast, evaluate_forecasts)


In [17]:
# === Step 6: Profile forecast from a prefix (20 → 100) ===
import os, numpy as np, pandas as pd, json
from datetime import datetime
import matplotlib.pyplot as plt
from pbm.forecast_utils import save_predictions_csv, plot_example

# Fail early with a readable message if the Step 1 result is missing from the kernel
assert 'out' in globals(), "Требуется объект 'out' из Шага 1 (preprocess_profiles)."
panel_long = out["panel_long"].copy()  # work on a copy of the Step 1 panel
wells_used = out["wells_used"]
T = int(out["config"]["T"])
T_pref = 20  # could be moved into the config

# 6.1. Build the prefix-normalised channel without leakage
panel_long = build_prefix_scaled_channel(panel_long, wells_used, T=T, T_pref=T_pref,
                                         q=0.90, rate_col="r_oil_s", out_col="r_oil_pref_norm")

# 6.2. Matrices X_pref, Y_suffix, Y_full
X_pref, Y_suffix_true, Y_full = make_matrices(panel_long, wells_used, T=T, T_pref=T_pref,
                                              channel="r_oil_pref_norm", target_col="r_oil_s")

# 6.3. KNN completion
# NOTE(review): both forecasters receive Y_full for all wells and are scored below on
# the same wells; no train/test split is visible here — confirm the helpers hold out
# each well internally, otherwise the metrics are optimistic.
Y_pred_knn, knn_info = knn_forecast(X_pref, Y_full, T_pref=T_pref, K=15)

# 6.4. Multi-output regression
Y_pred_lr, lr_info = multioutput_forecast(panel_long, wells_used, T=T, T_pref=T_pref, Y_full=Y_full, random_state=43)

# 6.5. Quality evaluation (the returned dicts are read for "rmse", "smape" and "n_eval")
m_knn = evaluate_forecasts(Y_suffix_true, Y_pred_knn)
m_lr  = evaluate_forecasts(Y_suffix_true, Y_pred_lr)
print("KNN   → RMSE={rmse:.4f}, sMAPE={smape:.4f}, N={n_eval}".format(**m_knn))
print("ENet  → RMSE={rmse:.4f}, sMAPE={smape:.4f}, N={n_eval}".format(**m_lr))

# 6.6. Save the forecasts and the metrics report
out_dir = "./forecast_exports"
os.makedirs(out_dir, exist_ok=True)

# Raw arrays: ground-truth suffix and both model forecasts
np.save(os.path.join(out_dir, "Y_suffix_true.npy"), Y_suffix_true)
np.save(os.path.join(out_dir, "Y_pred_knn.npy"), Y_pred_knn)
np.save(os.path.join(out_dir, "Y_pred_enet.npy"), Y_pred_lr)

# Metrics table: one row per model
metrics_df = pd.DataFrame([
    {"model": "knn", "rmse": m_knn["rmse"], "smape": m_knn["smape"], "n_eval": m_knn["n_eval"]},
    {"model": "elasticnet", "rmse": m_lr["rmse"], "smape": m_lr["smape"], "n_eval": m_lr["n_eval"]},
])
metrics_csv = os.path.join(out_dir, "metrics.csv")
metrics_df.to_csv(metrics_csv, index=False)

# Per-model prediction tables (presumably written as pred_<model>.csv in out_dir,
# matching the file names listed in the HTML report — verify in pbm.forecast_utils).
# The leftover fragments of the old inline helpers (an indented to_csv line, a
# three-argument save_predictions_csv call and a bare `return`) were removed:
# they made the whole cell a syntax error.
save_predictions_csv(Y_pred_knn, wells_used, "knn", out_dir, T_pref, T)
save_predictions_csv(Y_pred_lr,  wells_used, "elasticnet", out_dir, T_pref, T)

# 6.7. Example plots: up to three random wells from the fitted set, restricted to
# rows whose KNN forecast is finite in every column. One KNN and one ElasticNet
# figure is produced per selected well.
rng = np.random.default_rng(43)
I = np.flatnonzero(np.isfinite(Y_pred_knn).all(axis=1))
if len(I):
    show = rng.choice(I, size=min(3, len(I)), replace=False)
else:
    show = []
example_imgs = []
for i in show:
    for tag, Y_hat in (("knn_example", Y_pred_knn), ("enet_example", Y_pred_lr)):
        example_imgs.append(plot_example(i, tag, Y_suffix_true, Y_hat, out_dir, T_pref, T))

# 6.8. Simple HTML report
# datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC "now" instead
# and strip tzinfo, so the rendered text stays "<ISO timestamp>Z" exactly as before.
from datetime import timezone

html = f"""
<html><head><meta charset='utf-8'><title>Forecast Report</title></head><body>
<h2>Forecast evaluation (prefix {T_pref} → total {T})</h2>
<p>Generated: {datetime.now(timezone.utc).replace(tzinfo=None).isoformat()}Z</p>
<table border='1' cellspacing='0' cellpadding='6'>
<tr><th>Model</th><th>RMSE</th><th>sMAPE</th><th>N eval wells</th></tr>
<tr><td>KNN</td><td>{m_knn['rmse']:.4f}</td><td>{m_knn['smape']:.4f}</td><td>{m_knn['n_eval']}</td></tr>
<tr><td>ElasticNet</td><td>{m_lr['rmse']:.4f}</td><td>{m_lr['smape']:.4f}</td><td>{m_lr['n_eval']}</td></tr>
</table>
<h3>Files</h3>
<ul>
  <li>metrics.csv</li>
  <li>pred_knn.csv</li>
  <li>pred_elasticnet.csv</li>
</ul>
<h3>Examples</h3>
{''.join(f"<img src='{os.path.basename(p)}' style='max-width:640px;display:block;margin-bottom:10px;'/>" for p in example_imgs)}
</body></html>
"""
# Images are referenced by base name only, so the report expects them next to it in out_dir
report_path = os.path.join(out_dir, "forecast_report.html")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(html)

print("Saved:", metrics_csv, "and", report_path)


KNN   → RMSE=20.3636, sMAPE=0.4884, N=1964
ENet  → RMSE=36.7395, sMAPE=1.1923, N=1964
Saved: ./forecast_exports/metrics.csv and ./forecast_exports/forecast_report.html
