In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Dict

# ============================================================
# DATA PATH
# ============================================================

BASE_DIR = Path(r"D:\nto_ii_individ")   # folder containing train.csv, test.csv, books.csv, etc.

def smart_read_csv(filename: str) -> pd.DataFrame:
    """Read a CSV file from ``BASE_DIR``, guessing between ``;`` and ``,`` delimiters.

    Each delimiter is tried in turn and the first parse that yields more than
    one column is returned.  If neither delimiter produces a multi-column
    frame, the file is read once more with pandas' default settings and that
    result (or the error it raises) is handed to the caller.

    Args:
        filename: Name of the file, relative to ``BASE_DIR``.

    Returns:
        The parsed DataFrame.

    Raises:
        Whatever ``pd.read_csv`` raises for problems that are not specific to
        the delimiter (for example a missing file); those are no longer
        swallowed during the delimiter probing.
    """
    path = BASE_DIR / filename
    for sep in (";", ","):
        try:
            df = pd.read_csv(path, sep=sep)
        except pd.errors.ParserError:
            # This delimiter cannot tokenize the file; try the next candidate.
            continue
        if df.shape[1] > 1:
            return df
    # Neither delimiter gave more than one column: fall back to the default read.
    return pd.read_csv(path)


# ============================================================
# MODEL A: SUPER + MEGA + TEMPORAL
# ============================================================

def calculate_super_aggregates(train_df: pd.DataFrame) -> Dict:
    """Build global, per-user and per-book rating statistics.

    Only rows with ``has_read == 1`` are used, and ``rating`` is cast to float
    before aggregation.

    Args:
        train_df: Frame with at least the columns ``user_id``, ``book_id``,
            ``has_read`` and ``rating``.

    Returns:
        A dict with three entries:
        ``"global_stats"`` (dict with ``mean``, ``std``, ``count``),
        ``"user_super_agg"`` and ``"book_super_agg"`` (one row per id with
        mean/count/std/min/max/median columns plus a ``*_confidence`` column).
    """
    stat_names = ["mean", "count", "std", "min", "max", "median"]

    rated = train_df.loc[train_df["has_read"] == 1].copy()
    rated["rating"] = rated["rating"].astype(float)
    ratings = rated["rating"]

    def _summarise(key: str, prefix: str) -> pd.DataFrame:
        # One row per `key` value; statistic columns are named "<prefix>_<stat>".
        table = rated.groupby(key)["rating"].agg(stat_names).reset_index()
        table.columns = [key] + [f"{prefix}_{stat}" for stat in stat_names]
        return table

    user_table = _summarise("user_id", "user")
    book_table = _summarise("book_id", "book")

    # Confidence grows with the square root of the number of ratings.
    user_table["user_confidence"] = np.sqrt(user_table["user_count"]) / 10.0
    book_table["book_confidence"] = np.sqrt(book_table["book_count"]) / 12.0

    overall = {
        "mean": float(ratings.mean()),
        "std": float(ratings.std()),
        "count": int(len(rated)),
    }

    return {
        "global_stats": overall,
        "user_super_agg": user_table,
        "book_super_agg": book_table,
    }


def super_predict(user_id: int, book_id: int, aggregates: Dict) -> float:
    """Predict a rating as a weighted blend of user and book statistics.

    Candidate estimates (each with its own weight) are collected from the
    user's and the book's aggregate rows, when those rows exist:

    * the mean, weighted by a count-based base weight;
    * the median, if the entity has enough ratings (>= 5 for users,
      >= 8 for books);
    * the mean once more, if the rating std is known and low
      (< 2.0 for users, < 1.5 for books).

    Every estimate is stored together with its weight, so the weighted
    average always pairs a value with the weight that was meant for it.

    Args:
        user_id: User identifier to look up in ``aggregates["user_super_agg"]``.
        book_id: Book identifier to look up in ``aggregates["book_super_agg"]``.
        aggregates: Dict with the keys ``"user_super_agg"``,
            ``"book_super_agg"`` and ``"global_stats"``.

    Returns:
        The predicted rating.  With no candidates, the global mean plus a
        small draw from NumPy's global RNG is returned; with one candidate it
        is blended with the global mean; otherwise the weighted average is
        returned, pulled 30% towards the global mean when it lies more than
        two global standard deviations away from it.
    """
    user_table = aggregates["user_super_agg"]
    book_table = aggregates["book_super_agg"]
    user_data = user_table[user_table["user_id"] == user_id]
    book_data = book_table[book_table["book_id"] == book_id]
    global_stats = aggregates["global_stats"]

    # (estimate, weight) pairs; keeping them together prevents misalignment.
    candidates = []

    # USER
    if len(user_data) > 0:
        user_row = user_data.iloc[0]
        user_base_weight = min(user_row["user_count"] / 8.0, 1.2)

        # The user's plain mean carries the base weight.
        candidates.append((user_row["user_mean"], user_base_weight))

        # Median, when there are enough ratings.
        if user_row["user_count"] >= 5:
            candidates.append((user_row["user_median"], user_base_weight * 0.7))

        # "Stable" users: count the mean again with a boosted weight.
        if not np.isnan(user_row["user_std"]) and user_row["user_std"] < 2.0:
            candidates.append((user_row["user_mean"], user_base_weight * 1.1))

    # BOOK
    if len(book_data) > 0:
        book_row = book_data.iloc[0]
        book_base_weight = min(book_row["book_count"] / 10.0, 1.0)

        candidates.append((book_row["book_mean"], book_base_weight))

        if book_row["book_count"] >= 8:
            candidates.append((book_row["book_median"], book_base_weight * 0.6))

        if not np.isnan(book_row["book_std"]) and book_row["book_std"] < 1.5:
            candidates.append((book_row["book_mean"], book_base_weight * 0.9))

    # Fallbacks
    if not candidates:
        # A little noise around the global mean for completely unseen pairs.
        return float(global_stats["mean"] + np.random.normal(0, 0.1))

    if len(candidates) == 1:
        value, weight = candidates[0]
        alpha = 0.9 if weight > 0.7 else 0.7
        return float(alpha * value + (1 - alpha) * global_stats["mean"])

    values, weights = zip(*candidates)
    final_pred = float(np.average(values, weights=weights))
    if abs(final_pred - global_stats["mean"]) > 2 * global_stats["std"]:
        return float(0.7 * final_pred + 0.3 * global_stats["mean"])
    return final_pred


def mega_predict(user_id: int, book_id: int, aggregates: Dict) -> float:
    """Predict a rating as the median of several simple estimates.

    Estimates come from the user's row (mean, plus median when the user has
    more than 3 ratings) and the book's row (mean, plus median when the book
    has more than 5 ratings).  When the user is known and at least two
    estimates exist, their average, clipped to the user's observed min/max
    rating, is added as one more estimate.

    Args:
        user_id: User identifier to look up in ``aggregates["user_super_agg"]``.
        book_id: Book identifier to look up in ``aggregates["book_super_agg"]``.
        aggregates: Dict with the keys ``"user_super_agg"``,
            ``"book_super_agg"`` and ``"global_stats"``.

    Returns:
        The median of the collected estimates, or the global mean when neither
        the user nor the book is present in the aggregates.
    """
    users = aggregates["user_super_agg"]
    books = aggregates["book_super_agg"]
    user_hits = users.loc[users["user_id"] == user_id]
    book_hits = books.loc[books["book_id"] == book_id]

    user_row = user_hits.iloc[0] if len(user_hits) > 0 else None
    book_row = book_hits.iloc[0] if len(book_hits) > 0 else None

    estimates = []

    if user_row is not None:
        estimates.append(user_row["user_mean"])
        if user_row["user_count"] > 3:
            estimates.append(user_row["user_median"])

    if book_row is not None:
        estimates.append(book_row["book_mean"])
        if book_row["book_count"] > 5:
            estimates.append(book_row["book_median"])

    if user_row is not None and len(estimates) >= 2:
        # Average of the estimates, kept inside the user's own rating range.
        blended = float(np.mean(estimates))
        blended = float(np.clip(blended, user_row["user_min"], user_row["user_max"]))
        estimates.append(blended)

    if not estimates:
        return aggregates["global_stats"]["mean"]

    return float(np.median(estimates))


def fit_model_A(train_df: pd.DataFrame) -> Dict:
    """Fit model A: rating aggregates plus per-user and per-book mean tables.

    Only rows with ``has_read == 1`` contribute to the mean tables and the
    global mean; ``rating`` is cast to float first.

    Args:
        train_df: Frame with at least the columns ``user_id``, ``book_id``,
            ``has_read`` and ``rating``.

    Returns:
        A dict with the keys ``"aggregates"`` (output of
        ``calculate_super_aggregates``), ``"user_means_temp"``,
        ``"book_means_temp"`` (two-column mean tables) and ``"global_mean"``.
    """
    rated = train_df.loc[train_df["has_read"] == 1].copy()
    rated["rating"] = rated["rating"].astype(float)

    def _mean_rating_by(key: str, out_col: str) -> pd.DataFrame:
        # Two-column table: the grouping id and the mean rating for that id.
        table = rated.groupby(key)["rating"].mean().reset_index()
        table.columns = [key, out_col]
        return table

    model = {"aggregates": calculate_super_aggregates(train_df)}
    model["user_means_temp"] = _mean_rating_by("user_id", "user_mean_temp")
    model["book_means_temp"] = _mean_rating_by("book_id", "book_mean_temp")
    model["global_mean"] = float(rated["rating"].mean())
    return model


def predict_model_A(pairs: pd.DataFrame, modelA: Dict, calibrate: bool) -> pd.DataFrame:
    """Score (user_id, book_id) pairs with the blended model A.

    The final score is ``0.35 * super + 0.25 * mega + 0.40 * temporal``, where
    ``temporal`` is ``0.6 * user mean + 0.4 * book mean`` and missing means are
    replaced by the global mean.  The result is clipped to the range 0..10.

    Args:
        pairs: Frame with ``user_id`` and ``book_id`` columns.
        modelA: Model dict as produced by ``fit_model_A``.
        calibrate: When true, shift all scores so that their mean equals the
            global training mean (applied before clipping).

    Returns:
        A frame with the columns ``user_id``, ``book_id`` and
        ``rating_predict``.
    """
    aggregates = modelA["aggregates"]
    fallback_mean = modelA["global_mean"]

    def _row_super(row):
        return super_predict(row["user_id"], row["book_id"], aggregates)

    def _row_mega(row):
        return mega_predict(row["user_id"], row["book_id"], aggregates)

    scored = pairs.copy()
    scored["super_pred"] = scored.apply(_row_super, axis=1)
    scored["mega_pred"] = scored.apply(_row_mega, axis=1)

    # Attach the plain per-user and per-book means; unseen ids get the global mean.
    scored = scored.merge(modelA["user_means_temp"], on="user_id", how="left")
    scored = scored.merge(modelA["book_means_temp"], on="book_id", how="left")
    for mean_col in ("user_mean_temp", "book_mean_temp"):
        scored[mean_col] = scored[mean_col].fillna(fallback_mean)

    scored["temporal_pred"] = (
        0.6 * scored["user_mean_temp"] + 0.4 * scored["book_mean_temp"]
    )

    blended = (
        scored["super_pred"] * 0.35
        + scored["mega_pred"] * 0.25
        + scored["temporal_pred"] * 0.40
    )

    if calibrate:
        # Move the prediction mean onto the global training mean.
        blended = blended + (fallback_mean - float(blended.mean()))

    scored["rating_predict"] = blended.clip(0, 10)

    return scored[["user_id", "book_id", "rating_predict"]]


# ============================================================
# MAIN PROCESS
# ============================================================

def main():
    """Train model A on train.csv, score test.csv and write one submission file.

    Predictions are produced without calibration; every prediction at or above
    9.5 is then set to 10 before the file is saved to ``BASE_DIR``.
    """
    # Fix the seed: super_predict draws random noise in its fallback branch.
    np.random.seed(42)

    # 1. Load the data
    train, test = (smart_read_csv(name) for name in ("train.csv", "test.csv"))

    print(f"Train shape: {train.shape}, Test shape: {test.shape}")

    # 2. Fit the model
    print("\n=== Обучение модели ===")
    modelA_full = fit_model_A(train)

    # 3. Predict WITHOUT calibration
    print("\nПредсказание (без калибровки)...")
    sub_no_cal = predict_model_A(
        test[["user_id", "book_id"]].copy(), modelA_full, calibrate=False
    )

    print("Base submission shape:", sub_no_cal.shape)
    print(
        f"Base: range={sub_no_cal['rating_predict'].min():.3f}..{sub_no_cal['rating_predict'].max():.3f}, "
        f"mean={sub_no_cal['rating_predict'].mean():.3f}"
    )

    # 4. Snap high values (>= 9.5) to 10
    thr = 9.5
    preds = sub_no_cal["rating_predict"].astype(float).values
    mask = preds >= thr
    preds_clipped = np.where(mask, 10.0, preds)

    # 5. Build the final submission
    final_submission = sub_no_cal[["user_id", "book_id"]].assign(
        rating_predict=np.clip(preds_clipped, 0, 10)
    )

    # 6. Save a SINGLE file
    output_path = BASE_DIR / "submission_highclip_95.csv"
    final_submission.to_csv(output_path, index=False)

    print(f"\nФинальный submission сохранен в: {output_path}")
    print(f"Диапазон предсказаний: {final_submission['rating_predict'].min():.3f}..{final_submission['rating_predict'].max():.3f}")
    print(f"Среднее предсказание: {final_submission['rating_predict'].mean():.3f}")
    print(f"Строк с коррекцией (>=9.5 -> 10): {mask.sum()} (~{mask.mean()*100:.2f}% от теста)")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()