# In [None]:
# visualization_advanced.py
import os

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Human-readable names for the short codes stored in the housing-type column;
# codes that are not listed here are shown unchanged by _with_type_labels.
TYPE_LABELS: dict[str, str] = {"h": "House", "u": "Unit/Apartment", "t": "Townhouse"}


def _require_columns(df: pd.DataFrame, cols: list[str]) -> None:
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")


def _with_type_labels(df: pd.DataFrame, type_col: str = "Type") -> pd.DataFrame:
    if type_col not in df.columns:
        return df.copy()
    d = df.copy()
    s = d[type_col].astype(str)
    d["TypeLabel"] = s.map(TYPE_LABELS).fillna(s)
    return d


def _ensure_price_per_m2(df: pd.DataFrame, price_col: str = "Price", area_col: str = "BuildingArea") -> pd.DataFrame:
    """Если PricePerM2 отсутствует — попытаемся вычислить."""
    d = df.copy()
    if "PricePerM2" in d.columns:
        return d

    if price_col in d.columns and area_col in d.columns:
        p = pd.to_numeric(d[price_col], errors="coerce")
        a = pd.to_numeric(d[area_col], errors="coerce").where(lambda x: x > 0)
        d["PricePerM2"] = p / a
        return d

    d["PricePerM2"] = np.nan
    return d


def _segment_market_by_quantiles(df: pd.DataFrame, ppm2_col: str = "PricePerM2") -> pd.DataFrame:
    d = df.copy()
    s = pd.to_numeric(d[ppm2_col], errors="coerce")
    s = s.replace([np.inf, -np.inf], np.nan)

    q1 = s.quantile(0.33)
    q2 = s.quantile(0.66)

    # Если данных мало/все одинаковые — fallback
    if not np.isfinite(q1) or not np.isfinite(q2) or q1 == q2:
        d["MarketSegment"] = pd.Series(["Unknown"] * len(d), index=d.index)
        return d

    d["MarketSegment"] = pd.cut(
        s,
        bins=[-np.inf, q1, q2, np.inf],
        labels=["Budget", "Mid", "Premium"],
        include_lowest=True,
    ).astype("object").fillna("Unknown")

    return d


def create_scatter_area_price(df, area_col="BuildingArea", price_col="Price", x_max=800, y_max=8_000_000):
    """Scatter plot of price against area, with a least-squares trend line.

    Both columns are coerced to numbers; rows where either value is missing
    or not strictly positive are dropped, then the optional caps are applied.

    Args:
        df: Source DataFrame containing ``area_col`` and ``price_col``.
        area_col: Column plotted on the x axis.
        price_col: Column plotted on the y axis.
        x_max: Inclusive upper bound on the area; ``None`` disables it.
        y_max: Inclusive upper bound on the price; ``None`` disables it.

    Returns:
        A Plotly figure with the "Data" marker trace and, when at least two
        distinct area values remain, a "Trend" line trace.

    Raises:
        ValueError: If a required column is missing (from ``_require_columns``).
    """
    _require_columns(df, [area_col, price_col])

    d = df[[area_col, price_col]].copy()
    d[area_col] = pd.to_numeric(d[area_col], errors="coerce")
    d[price_col] = pd.to_numeric(d[price_col], errors="coerce")
    d = d.dropna()
    d = d[(d[area_col] > 0) & (d[price_col] > 0)]

    if x_max is not None:
        d = d[d[area_col] <= x_max]
    if y_max is not None:
        d = d[d[price_col] <= y_max]

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=d[area_col],
            y=d[price_col],
            mode="markers",
            marker=dict(size=6, opacity=0.6),
            hovertemplate=f"{area_col}: %{{x:.1f}}<br>{price_col}: $%{{y:,.0f}}<extra></extra>",
            name="Data",
        )
    )

    x = d[area_col].to_numpy(dtype=float)
    y = d[price_col].to_numpy(dtype=float)
    # A straight-line fit needs two distinct x values: with a single distinct
    # area the least-squares problem is rank-deficient and the "line" would
    # collapse to one point, so the trend is skipped in that case.
    if x.size >= 2 and x.min() < x.max():
        k, b = np.polyfit(x, y, 1)
        x_line = np.linspace(float(x.min()), float(x.max()), 100)
        y_line = k * x_line + b
        fig.add_trace(go.Scatter(x=x_line, y=y_line, mode="lines", name="Trend"))

    # Titles follow the columns actually plotted; with the default column
    # names they are identical to the previously hard-coded text.
    fig.update_layout(
        title=f"Зависимость стоимости от площади ({area_col} → {price_col})",
        xaxis_title=f"{area_col} (м²)",
        yaxis_title=f"{price_col} ($)",
        height=650,
        template="plotly_white",
        showlegend=False,
    )
    return fig


def create_box_area_by_type(df, type_col="Type", area_col="BuildingArea", y_max=750):
    """Box plot of the building area for each housing type.

    Args:
        df: Source DataFrame containing ``type_col`` and ``area_col``.
        type_col: Column with the housing type codes.
        area_col: Column with the area; coerced to numbers, and only
            strictly positive values are plotted.
        y_max: Upper end of the displayed y range (the axis then starts at
            0); ``None`` leaves the range automatic. Data is not filtered.

    Returns:
        A Plotly Express box figure.

    Raises:
        ValueError: If a required column is missing (from ``_require_columns``).
    """
    _require_columns(df, [type_col, area_col])

    labelled = _with_type_labels(df, type_col=type_col)
    frame = labelled[["TypeLabel", area_col]].copy()
    frame[area_col] = pd.to_numeric(frame[area_col], errors="coerce")

    # Keep rows that are complete and have a strictly positive area.
    complete = frame.notna().all(axis=1)
    positive = frame[area_col] > 0
    frame = frame[complete & positive]

    axis_labels = {"TypeLabel": "Тип жилья", area_col: "BuildingArea (м²)"}
    fig = px.box(
        frame,
        x="TypeLabel",
        y=area_col,
        color="TypeLabel",
        title="Площадь здания по типу жилья",
        labels=axis_labels,
    )
    fig.update_layout(height=600, showlegend=False, template="plotly_white")
    if y_max is not None:
        fig.update_yaxes(range=[0, y_max])
    return fig


def create_type_pie(df, type_col="Type"):
    """Pie chart with the share of listings per housing type.

    Args:
        df: Source DataFrame containing ``type_col``.
        type_col: Column with the housing type codes.

    Returns:
        A Plotly Express pie figure; labels are counted with
        ``dropna=False``, so a missing label would get its own slice.

    Raises:
        ValueError: If ``type_col`` is missing (from ``_require_columns``).
    """
    _require_columns(df, [type_col])

    labels = _with_type_labels(df, type_col=type_col)["TypeLabel"]
    counts = (
        labels.value_counts(dropna=False)
        .rename_axis("TypeLabel")
        .reset_index(name="count")
    )

    fig = px.pie(
        counts,
        names="TypeLabel",
        values="count",
        title="Доли типов жилья",
        labels={"TypeLabel": "Тип жилья", "count": "Количество"},
    )
    fig.update_layout(height=520, template="plotly_white")
    return fig


def create_price_distribution(df, price_col="Price"):
    """Histogram of prices with values outside the 1.5 * IQR fences removed.

    Args:
        df: Source DataFrame containing ``price_col``.
        price_col: Column with the price; coerced to numbers.

    Returns:
        A Plotly figure holding a single 50-bin histogram trace.

    Raises:
        ValueError: If ``price_col`` is missing, or if no numeric price is
            left after coercion.
    """
    _require_columns(df, [price_col])

    prices = pd.to_numeric(df[price_col], errors="coerce").dropna()
    if prices.empty:
        raise ValueError("No numeric price data after cleaning")

    lower_quartile = prices.quantile(0.25)
    upper_quartile = prices.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    # Series.between is inclusive on both ends by default.
    inliers = prices[prices.between(low_fence, high_fence)]

    histogram = go.Histogram(x=inliers, nbinsx=50, opacity=0.85)
    fig = go.Figure(data=[histogram])
    fig.update_layout(
        title="Распределение цен (без выбросов по IQR)",
        xaxis_title="Price ($)",
        yaxis_title="Count",
        height=520,
        template="plotly_white",
        showlegend=False,
    )
    return fig


def create_heatmap_prices(
    df,
    price_col="Price",
    district_col="Suburb",
    type_col="Type",
    top_districts=20,
    top_types=3,
    agg="median",
):
    """Heatmap of aggregated price by district (rows) and housing type (columns).

    Only the most frequent districts and type labels are kept; rows are
    ordered by their mean across the type columns, highest first.

    Args:
        df: Source DataFrame containing the three columns below.
        price_col: Column with the price; coerced to numbers.
        district_col: Column with the district name.
        type_col: Column with the housing type codes.
        top_districts: How many of the most frequent districts to keep.
        top_types: How many of the most frequent type labels to keep.
        agg: ``"mean"`` aggregates with the mean; any other value uses the
            median.

    Returns:
        A Plotly figure holding one heatmap trace.

    Raises:
        ValueError: If a required column is missing (from ``_require_columns``).
    """
    _require_columns(df, [price_col, district_col, type_col])

    wanted = [price_col, district_col, "TypeLabel"]
    frame = _with_type_labels(df, type_col=type_col)[wanted].copy()
    frame[price_col] = pd.to_numeric(frame[price_col], errors="coerce")
    frame = frame.dropna(subset=wanted)

    busiest_districts = frame[district_col].value_counts().head(top_districts).index
    commonest_types = frame["TypeLabel"].value_counts().head(top_types).index
    in_scope = frame[district_col].isin(busiest_districts) & frame["TypeLabel"].isin(commonest_types)
    frame = frame[in_scope]

    grouped = frame.groupby([district_col, "TypeLabel"])[price_col]
    use_mean = agg == "mean"
    if use_mean:
        pivot = grouped.mean().unstack()
        title = "Heatmap (top): район × тип жилья (средняя цена)"
    else:
        pivot = grouped.median().unstack()
        title = "Heatmap (top): район × тип жилья (медианная цена)"

    row_order = pivot.mean(axis=1).sort_values(ascending=False).index
    pivot = pivot.loc[row_order]

    heatmap = go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale="RdYlGn_r",
        hovertemplate="%{y} — %{x}<br>$%{z:,.0f}<extra></extra>",
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(
        title=title,
        xaxis_title="Тип жилья",
        yaxis_title="Район (топ по числу объявлений)",
        height=650,
        template="plotly_white",
    )
    return fig


def create_price_box_by_type(df, price_col="Price", type_col="Type"):
    """Box plot of prices for each housing type.

    Args:
        df: Source DataFrame containing ``price_col`` and ``type_col``.
        price_col: Column with the price; coerced to numbers.
        type_col: Column with the housing type codes.

    Returns:
        A Plotly Express box figure.

    Raises:
        ValueError: If a required column is missing (from ``_require_columns``).
    """
    _require_columns(df, [price_col, type_col])

    wanted = [price_col, "TypeLabel"]
    frame = _with_type_labels(df, type_col=type_col)[wanted].copy()
    frame[price_col] = pd.to_numeric(frame[price_col], errors="coerce")
    frame = frame.dropna(subset=wanted)

    axis_labels = {"TypeLabel": "Тип жилья", price_col: "Price ($)"}
    fig = px.box(
        frame,
        x="TypeLabel",
        y=price_col,
        color="TypeLabel",
        title="Boxplot: цены по типам жилья",
        labels=axis_labels,
    )
    fig.update_layout(height=520, showlegend=False, template="plotly_white")
    return fig


def create_top5_districts_by_price(df, price_col="Price", district_col="Suburb"):
    """Horizontal bar chart of the five districts with the highest median price.

    Args:
        df: Source DataFrame containing ``price_col`` and ``district_col``.
        price_col: Column with the price; coerced to numbers.
        district_col: Column with the district name.

    Returns:
        A Plotly figure with one bar per district; each bar is annotated
        with the number of priced listings in that district.

    Raises:
        ValueError: If a required column is missing (from ``_require_columns``).
    """
    _require_columns(df, [price_col, district_col])

    wanted = [price_col, district_col]
    frame = df[wanted].copy()
    frame[price_col] = pd.to_numeric(frame[price_col], errors="coerce")
    frame = frame.dropna(subset=wanted)

    per_district = frame.groupby(district_col)[price_col].agg(median="median", count="count")
    leaders = per_district.sort_values("median", ascending=False).head(5)

    bars = go.Bar(
        x=leaders["median"].values,
        y=leaders.index,
        orientation="h",
        text=leaders["count"].values,
        texttemplate="n=%{text}",
        hovertemplate="%{y}<br>Median: $%{x:,.0f}<br>Count: %{text}<extra></extra>",
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Топ-5 районов по медианной цене (и числу объявлений)",
        xaxis_title="Медианная цена ($)",
        yaxis_title="Район",
        height=520,
        template="plotly_white",
        showlegend=False,
    )
    return fig


# -------- NEW FUNCTIONALITY --------
def create_feature_correlation_bar(df: pd.DataFrame, target_col: str = "Price", top_k: int = 12):
    """Bar chart of the top-K numeric features by absolute correlation with the target.

    Args:
        df: Source DataFrame containing ``target_col``.
        target_col: Column the other numeric columns are correlated with; if
            it is not numeric it is coerced to numbers first.
        top_k: Maximum number of features shown.

    Returns:
        A Plotly figure with horizontal bars showing the signed correlation,
        ordered by decreasing absolute value.

    Raises:
        ValueError: If ``target_col`` is missing, or if no feature has a
            defined correlation with the target.
    """
    _require_columns(df, [target_col])

    # Only numeric columns take part in the correlation.
    numeric = df.select_dtypes(include=[np.number]).copy()
    if target_col not in numeric.columns:
        numeric[target_col] = pd.to_numeric(df[target_col], errors="coerce")

    with_target = numeric.corr(numeric_only=True)[target_col]
    corr = with_target.drop(labels=[target_col], errors="ignore").dropna()
    if corr.empty:
        raise ValueError("No numeric features to correlate with target")

    strongest = corr.abs().sort_values(ascending=False).head(top_k).index
    top = corr.reindex(strongest)

    bars = go.Bar(
        x=top.values,
        y=top.index,
        orientation="h",
        hovertemplate="%{y}<br>corr=%{x:.3f}<extra></extra>",
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title=f"Что влияет на {target_col}: корреляции (топ-{len(top)})",
        xaxis_title="Корреляция (Pearson)",
        yaxis_title="Признак",
        height=600,
        template="plotly_white",
        showlegend=False,
    )
    return fig


def create_model_feature_importance(df: pd.DataFrame, target_col: str = "PricePerM2", top_k: int = 12):
    """Bar chart of RandomForest feature importances for ``target_col``.

    Falls back to ``create_feature_correlation_bar`` when scikit-learn cannot
    be imported, when no usable numeric feature remains, or when fewer than
    50 rows have a finite target.

    NOTE(review): numeric columns the target may have been derived from
    (e.g. a price and an area for a price-per-area target) stay among the
    features and would presumably dominate the importances - confirm that
    this is intended.

    Args:
        df: Source DataFrame containing ``target_col``; it is not modified.
        target_col: Column to predict; coerced to numbers.
        top_k: Maximum number of features shown.

    Returns:
        A Plotly figure with horizontal bars, most important feature first
        (or the correlation chart in the fallback cases).

    Raises:
        ValueError: If ``target_col`` is missing, or from the fallback chart.
    """
    try:
        from sklearn.ensemble import RandomForestRegressor
    except Exception:
        # Deliberately broad: any failure to import scikit-learn selects the
        # correlation-based fallback instead of aborting.
        return create_feature_correlation_bar(df, target_col=target_col, top_k=top_k)

    _require_columns(df, [target_col])

    y = pd.to_numeric(df[target_col], errors="coerce").replace([np.inf, -np.inf], np.nan)

    # Numeric features only, without the target itself.
    X = df.select_dtypes(include=[np.number]).drop(columns=[target_col], errors="ignore")
    X = X.replace([np.inf, -np.inf], np.nan)

    has_target = y.notna()
    X = X.loc[has_target]
    y = y.loc[has_target]

    # A column with no values at all has a NaN median, so the imputation
    # below could not fill it and NaN would reach the model; such a column
    # carries no information, so it is dropped.
    X = X.dropna(axis=1, how="all")

    if X.shape[1] == 0 or len(y) < 50:
        return create_feature_correlation_bar(df, target_col=target_col, top_k=top_k)

    # Impute the remaining gaps with each column's median.
    X = X.fillna(X.median())

    model = RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
    )
    model.fit(X, y)

    imp = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False).head(top_k)

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=imp.values,
            y=imp.index,
            orientation="h",
            hovertemplate="%{y}<br>importance=%{x:.4f}<extra></extra>",
        )
    )
    fig.update_layout(
        title=f"Что влияет на {target_col}: важность признаков (RandomForest, топ-{len(imp)})",
        xaxis_title="Feature importance",
        yaxis_title="Признак",
        height=600,
        template="plotly_white",
        showlegend=False,
    )
    return fig


def create_segment_overview_plots(df: pd.DataFrame, ppm2_col: str = "PricePerM2", price_col: str = "Price"):
    """Build two segmentation charts comparing market segments by ``ppm2_col``.

    If ``df`` has no ``MarketSegment`` column, the segments are derived with
    ``_segment_market_by_quantiles`` instead of failing with a ``KeyError``.

    Args:
        df: Source DataFrame containing ``ppm2_col``; it is not modified.
        ppm2_col: Column with the price per square metre; coerced to numbers.
        price_col: Column with the price; coerced to numbers when present.

    Returns:
        A ``(bar_figure, box_figure)`` tuple: the median of ``ppm2_col`` per
        segment (ascending), and its distribution per segment.

    Raises:
        ValueError: If ``ppm2_col`` is missing (from ``_require_columns``).
    """
    _require_columns(df, [ppm2_col])

    d = df.copy()
    d[ppm2_col] = pd.to_numeric(d[ppm2_col], errors="coerce")
    if price_col in d.columns:
        d[price_col] = pd.to_numeric(d[price_col], errors="coerce")

    # Derive the segments when the caller has not prepared them.
    if "MarketSegment" not in d.columns:
        d = _segment_market_by_quantiles(d, ppm2_col=ppm2_col)

    d = d.dropna(subset=[ppm2_col, "MarketSegment"])

    # Bar: median price per square metre for each segment.
    seg_stats = d.groupby("MarketSegment")[ppm2_col].median().sort_values()
    fig1 = go.Figure()
    fig1.add_trace(
        go.Bar(
            x=seg_stats.index.astype(str),
            y=seg_stats.values,
            hovertemplate="%{x}<br>median=%{y:,.0f}<extra></extra>",
        )
    )
    fig1.update_layout(
        title="Сегментация рынка по PricePerM2: медиана по сегментам",
        xaxis_title="Сегмент",
        yaxis_title="Median PricePerM2",
        height=520,
        template="plotly_white",
        showlegend=False,
    )

    # Box: distribution of the price per square metre within each segment.
    fig2 = px.box(
        d,
        x="MarketSegment",
        y=ppm2_col,
        color="MarketSegment",
        title="Сегментация рынка: распределение PricePerM2 по сегментам",
        labels={"MarketSegment": "Сегмент", ppm2_col: "PricePerM2"},
    )
    fig2.update_layout(height=520, showlegend=False, template="plotly_white")

    return fig1, fig2


def summarize_segments(df: pd.DataFrame, price_col: str = "Price", ppm2_col: str = "PricePerM2") -> pd.DataFrame:
    """Summarise each market segment: row count plus medians of key metrics.

    The output always has the columns ``MarketSegment``, ``count``,
    ``median_price``, ``median_ppm2``, ``median_rooms``, ``median_bathroom``
    and ``median_area``. A metric whose source column is absent from ``df``
    is reported as NaN (it used to be silently filled with the group count).

    Args:
        df: Source DataFrame containing ``MarketSegment``; it is not modified.
        price_col: Column with the price.
        ppm2_col: Column with the price per square metre.

    Returns:
        One row per segment, ordered by segment label.

    Raises:
        ValueError: If ``df`` has no ``MarketSegment`` column.
    """
    if "MarketSegment" not in df.columns:
        raise ValueError("Missing columns: ['MarketSegment']")

    # Output column -> source column.
    metrics = {
        "median_price": price_col,
        "median_ppm2": ppm2_col,
        "median_rooms": "Rooms",
        "median_bathroom": "Bathroom",
        "median_area": "BuildingArea",
    }

    segments = df["MarketSegment"]
    # Rows with a missing segment are excluded by groupby, so the group size
    # equals the number of rows assigned to the segment.
    out = df.groupby("MarketSegment").size().rename("count").to_frame()

    for out_name, source_col in metrics.items():
        if source_col in df.columns and source_col != "MarketSegment":
            values = pd.to_numeric(df[source_col], errors="coerce")
            out[out_name] = values.groupby(segments).median()
        else:
            out[out_name] = np.nan

    return out.reset_index()


def export_analysis_report(
    df,
    output_dir="output",  # kept only for compatibility with main.py; not used
    price_column="Price",
    district_column="Suburb",
    category_column="Type",
    area_column="BuildingArea",
):
    """Export the report figures as .jpg files plus a segment summary CSV.

    Files are written next to this module, or into the current working
    directory when ``__file__`` is not defined (e.g. an interactive session).
    Saving images through ``fig.write_image`` requires kaleido.

    The CSV and the two segmentation charts are best-effort: a failure is
    logged and the export continues. Every figure is saved independently; if
    any save fails, ``JPG_EXPORT_ERROR.txt`` lists each failure.

    Args:
        df: Source DataFrame.
        output_dir: Ignored; present for backward compatibility.
        price_column: Column with the price.
        district_column: Column with the district name.
        category_column: Column with the housing type codes.
        area_column: Column with the building area.

    Returns:
        The directory the files were written to.

    Raises:
        ValueError: Propagated from the mandatory chart builders, e.g. when
            a required column is missing.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # No module file to anchor to; use the working directory instead.
        base_dir = os.getcwd()

    # Preparation: labels, price per square metre, market segments.
    d = _with_type_labels(df, type_col=category_column)
    d = _ensure_price_per_m2(d, price_col=price_column, area_col=area_column)
    d = _segment_market_by_quantiles(d, ppm2_col="PricePerM2")

    # --- per-segment summary (CSV), best-effort ---
    try:
        seg_tbl = summarize_segments(d, price_col=price_column, ppm2_col="PricePerM2")
        seg_tbl.to_csv(os.path.join(base_dir, "segment_summary.csv"), index=False)
    except Exception:
        logger.warning("segment_summary.csv was not written", exc_info=True)

    # --- figures ---
    figures = [
        ("01_dashboard", create_main_dashboard(d, price_col=price_column, district_col=district_column, type_col=category_column)),
        ("02_scatter_area_price", create_scatter_area_price(d, area_col=area_column, price_col=price_column)),
        ("03_price_distribution", create_price_distribution(d, price_col=price_column)),
        ("04_heatmap_top", create_heatmap_prices(d, price_col=price_column, district_col=district_column, type_col=category_column, top_districts=20, top_types=3, agg="median")),
        ("05_box_area_by_type", create_box_area_by_type(d, type_col=category_column, area_col=area_column)),
        ("06_price_box_by_type", create_price_box_by_type(d, price_col=price_column, type_col=category_column)),
        ("07_top5_districts", create_top5_districts_by_price(d, price_col=price_column, district_col=district_column)),

        # Price drivers: correlations and RandomForest importance.
        ("08_factors_corr_price", create_feature_correlation_bar(d, target_col=price_column, top_k=12)),
        ("09_factors_corr_ppm2", create_feature_correlation_bar(d, target_col="PricePerM2", top_k=12)),
        ("10_factors_rf_ppm2", create_model_feature_importance(d, target_col="PricePerM2", top_k=12)),
    ]

    # Segmentation (two charts), best-effort.
    try:
        seg_bar, seg_box = create_segment_overview_plots(d, ppm2_col="PricePerM2", price_col=price_column)
        figures.extend([
            ("11_segment_median_ppm2", seg_bar),
            ("12_segment_box_ppm2", seg_box),
        ])
    except Exception:
        logger.warning("segmentation charts were skipped", exc_info=True)

    # --- save the JPGs; one failing figure must not block the others ---
    failures = []
    for name, fig in figures:
        out_path = os.path.join(base_dir, f"{name}.jpg")
        try:
            fig.write_image(out_path, format="jpg", width=1400, height=900, scale=2)
        except Exception as e:
            failures.append((name, e))

    if failures:
        err_path = os.path.join(base_dir, "JPG_EXPORT_ERROR.txt")
        with open(err_path, "w", encoding="utf-8") as f:
            f.write("Не удалось сохранить JPG через Plotly.\n")
            f.write("Для fig.write_image поставь kaleido: pip install -U kaleido\n\n")
            f.writelines(f"Ошибка ({name}): {repr(e)}\n" for name, e in failures)

    return base_dir


# NOTE: create_main_dashboard is placed below (so that everything is in one file)
def create_main_dashboard(df, price_col="Price", district_col="Suburb", type_col="Type"):
    """Build the 2x2 overview dashboard.

    Panels: median price per housing type, the ten districts with the
    highest median price, listing counts per type (pie) and a table of price
    statistics. An existing ``TypeLabel`` column is used as is; otherwise it
    is derived from ``type_col``.

    Args:
        df: Source DataFrame; it is not modified.
        price_col: Column with the price; coerced to numbers.
        district_col: Column with the district name.
        type_col: Column with the housing type codes, needed only when
            ``TypeLabel`` is absent.

    Returns:
        A Plotly figure with four subplots.

    Raises:
        ValueError: If a required column is missing (from ``_require_columns``).
    """
    has_labels = "TypeLabel" in df.columns
    label_source = "TypeLabel" if has_labels else type_col
    _require_columns(df, [price_col, district_col, label_source])

    # After the check above, type_col is guaranteed to exist when labels are absent.
    source = df if has_labels else _with_type_labels(df, type_col=type_col)
    wanted = [price_col, district_col, "TypeLabel"]
    frame = source[wanted].copy()
    frame[price_col] = pd.to_numeric(frame[price_col], errors="coerce")
    frame = frame.dropna(subset=wanted)

    prices = frame[price_col]
    median_by_type = frame.groupby("TypeLabel")[price_col].median().sort_values(ascending=False)
    median_by_district = frame.groupby(district_col)[price_col].median().sort_values(ascending=False).head(10)
    listings_by_type = frame["TypeLabel"].value_counts()

    panel_titles = (
        "Медианная цена по типам жилья",
        "Топ-10 районов по медианной цене",
        "Количество объявлений по типам",
        "Статистика по ценам",
    )
    panel_specs = [
        [{"type": "bar"}, {"type": "bar"}],
        [{"type": "pie"}, {"type": "table"}],
    ]
    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

    type_bars = go.Bar(
        x=median_by_type.index,
        y=median_by_type.values,
        hovertemplate="%{x}<br>$%{y:,.0f}<extra></extra>",
    )
    fig.add_trace(type_bars, row=1, col=1)

    district_bars = go.Bar(
        x=median_by_district.values,
        y=median_by_district.index,
        orientation="h",
        hovertemplate="%{y}<br>$%{x:,.0f}<extra></extra>",
    )
    fig.add_trace(district_bars, row=1, col=2)

    type_pie = go.Pie(
        labels=listings_by_type.index,
        values=listings_by_type.values,
        hovertemplate="%{label}<br>%{value} (%{percent})<extra></extra>",
    )
    fig.add_trace(type_pie, row=2, col=1)

    stat_rows = [
        ("Средняя", prices.mean()),
        ("Медиана", prices.median()),
        ("Мин", prices.min()),
        ("Макс", prices.max()),
        ("Std", prices.std()),
    ]
    metric_names = [label for label, _ in stat_rows]
    metric_values = [f"${value:,.0f}" for _, value in stat_rows]

    stats_table = go.Table(
        header=dict(values=["Метрика", "Значение"], align="center"),
        cells=dict(values=[metric_names, metric_values], align="left"),
    )
    fig.add_trace(stats_table, row=2, col=2)

    fig.update_layout(
        title="Дашборд: анализ рынка недвижимости",
        height=950,
        showlegend=False,
        template="plotly_white",
    )
    return fig
