In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.linear_model import LinearRegression

# ============================================================
# 1) DATA LOAD + df_ts
# ============================================================
DATA_DIR = Path("../../01_data") / "predictive_model"
df = pd.read_parquet(DATA_DIR / "df_auto_corpus_area_tech.parquet")

# Keep only papers and patents, and only rows with both year and month set.
is_paper_or_patent = df["source_type"].isin(["paper", "patent"])
df = df.loc[is_paper_or_patent].copy()
df = df.dropna(subset=["year", "month"])
df = df.astype({"year": int, "month": int})

# First-of-month timestamp assembled from the year/month columns.
df["date"] = pd.to_datetime(
    {"year": df["year"], "month": df["month"], "day": 1}
)

# Row counts per (focus area, tech cluster, month, source type).
count_keys = ["auto_focus_area", "auto_tech_cluster", "date", "source_type"]
g = df.groupby(count_keys).size().reset_index(name="n")

# Wide layout: one row per (focus area, tech cluster, month) and one count
# column per source type; combinations with no rows are filled with 0.
pivot = g.pivot_table(
    index=count_keys[:-1],
    columns="source_type",
    values="n",
    fill_value=0,
).reset_index()

# .get(..., 0) falls back to a scalar 0 when a source type has no column.
paper_counts = pivot.get("paper", 0)
patent_counts = pivot.get("patent", 0)
pivot["n_total"] = paper_counts + patent_counts
pivot["patent"] = patent_counts
# A zero total is mapped to NaN first so the share becomes NaN, not inf.
pivot["share_patent"] = pivot["patent"] / pivot["n_total"].replace(0, np.nan)

# Monthly time-series table used by the rest of the script.
ts_columns = [
    "auto_focus_area",
    "auto_tech_cluster",
    "date",
    "n_total",
    "share_patent",
]
df_ts = pivot[ts_columns].copy()

# ============================================================
# 2) Single loop: plots + growth + macro abstraction + features
# ============================================================
FRAC = 0.3          # LOWESS bandwidth (value passed as lowess(frac=...))
MIN_POINTS = 10     # minimum number of months required to run LOWESS

# Folder that receives the saved figures; created if it does not exist yet.
OUT_DIR = Path("tech_lowess_hype")
OUT_DIR.mkdir(exist_ok=True)

# Row accumulators: one dict per (focus area, tech cluster) group.
rows_feat, rows_growth, rows_macro = [], [], []

# Path separators and characters reserved on common file systems. A label such
# as "AI/ML" would otherwise make savefig() target a non-existent
# sub-directory (or an invalid file name) and abort the whole loop.
_UNSAFE_FILENAME_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|', "_"))


def _safe_stem(area, tech):
    """Return the ``{area}_{tech}`` file-name stem with unsafe characters replaced by ``_``."""
    return f"{area}_{tech}".translate(_UNSAFE_FILENAME_CHARS)


def _save_line_plot(x, values, title, ylabel, path):
    """Draw one line plot with a "Date" x-axis, save it to *path*, and close the figure."""
    fig, ax = plt.subplots()
    ax.plot(x, values)
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel(ylabel)
    fig.savefig(path)
    plt.close(fig)


for (area, tech), g_ts in df_ts.groupby(["auto_focus_area", "auto_tech_cluster"]):
    g_ts = g_ts.sort_values("date")
    # NOTE(review): df_ts presumably only has rows for months with at least
    # one document, so n_months counts observed rows and the integer time
    # index used below treats them as evenly spaced -- confirm this is intended.
    n_months = len(g_ts)

    # ---- feature summaries (recorded for every group) ----
    rows_feat.append({
        "auto_focus_area": area,
        "auto_tech_cluster": tech,
        "n_months": n_months,
        "n_years": n_months / 12,
        "total_docs": g_ts["n_total"].sum(),
        "mean_docs": g_ts["n_total"].mean(),
        "max_docs": g_ts["n_total"].max(),
        "last_docs": g_ts["n_total"].iloc[-1],
        "mean_share_patent_ts": g_ts["share_patent"].mean(),
        "std_share_patent_ts": g_ts["share_patent"].std(ddof=0),
    })

    # ---- growth (linear trend of document count over the row index) ----
    if n_months >= 6:
        X = np.arange(n_months).reshape(-1, 1)
        y = g_ts["n_total"].values
        model = LinearRegression().fit(X, y)
        rows_growth.append({
            "auto_focus_area": area,
            "auto_tech_cluster": tech,
            "growth_slope_n_docs": float(model.coef_[0]),
            "n_docs_last": int(y[-1]),
        })

    # ---- LOWESS + 1st / 2nd difference + macro shock + plots ----
    if n_months < MIN_POINTS:
        continue

    t = np.arange(n_months, dtype=float)
    cum_docs = g_ts["n_total"].cumsum().values.astype(float)
    y = np.log1p(cum_docs)

    # Smooth log(1 + cumulative docs), then take first and second differences.
    y_smooth = lowess(y, t, frac=FRAC, return_sorted=False)
    d1 = np.diff(y_smooth)
    d2 = np.diff(d1)

    # Each np.diff shortens the series by one, so d1 is plotted against the
    # dates from the 2nd row on and d2 against the dates from the 3rd row on.
    t0 = g_ts["date"].values
    t1 = t0[1:]
    t2 = t0[2:]

    stem = _safe_stem(area, tech)

    # --- cumulative (expm1 undoes the log1p transform) ---
    _save_line_plot(
        t0,
        np.expm1(y_smooth),
        f"{tech} – Cumulative docs (LOWESS)",
        "Cumulative docs (smoothed)",
        OUT_DIR / f"{stem}_cum_lowess.png",
    )

    # --- 1st derivative ---
    _save_line_plot(
        t1,
        d1,
        f"{tech} – 1st Derivative (LOWESS)",
        "Δ log-cum docs",
        OUT_DIR / f"{stem}_d1_lowess.png",
    )

    # --- 2nd derivative ---
    _save_line_plot(
        t2,
        d2,
        f"{tech} – 2nd Derivative (LOWESS)",
        "Δ² log-cum docs",
        OUT_DIR / f"{stem}_d2_lowess.png",
    )

    # --- macro shock abstraction (largest positive / negative spike) ---
    if len(d2) > 0:
        idx_pos = int(np.argmax(d2))
        idx_neg = int(np.argmin(d2))
        rows_macro.append({
            "auto_focus_area": area,
            "auto_tech_cluster": tech,
            # +2 matches the date alignment used for the d2 plot above.
            "pos_shock_date": g_ts["date"].iloc[idx_pos + 2],
            "pos_shock_strength": float(d2[idx_pos]),
            "neg_shock_date": g_ts["date"].iloc[idx_neg + 2],
            "neg_shock_strength": float(d2[idx_neg]),
            "macro_shock_score": float(np.max(np.abs(d2))),
        })

# ============================================================
# 3) Build the DataFrames & merge them
# ============================================================
# Expected columns of each row list. They are only used when a list is empty:
# pd.DataFrame([]) has no columns at all, which would make the merges and the
# column selection below raise KeyError (e.g. when no group has enough months
# for the growth or LOWESS steps).
_KEY_COLS = ["auto_focus_area", "auto_tech_cluster"]
_FEAT_COLS = _KEY_COLS + [
    "n_months", "n_years",
    "total_docs", "mean_docs", "max_docs", "last_docs",
    "mean_share_patent_ts", "std_share_patent_ts",
]
_GROWTH_COLS = _KEY_COLS + ["growth_slope_n_docs", "n_docs_last"]
_MACRO_COLS = _KEY_COLS + [
    "pos_shock_date", "pos_shock_strength",
    "neg_shock_date", "neg_shock_strength",
    "macro_shock_score",
]


def _rows_to_frame(rows, columns):
    """Build a DataFrame from *rows*; an empty list yields an empty frame with *columns*."""
    return pd.DataFrame(rows) if rows else pd.DataFrame(columns=columns)


df_feat   = _rows_to_frame(rows_feat, _FEAT_COLS)
df_growth = _rows_to_frame(rows_growth, _GROWTH_COLS)
df_macro  = _rows_to_frame(rows_macro, _MACRO_COLS)

# Convert macro_shock_score to a z-score (population std, ddof=0).
# An empty or constant score column gets 0.0 instead of dividing by zero.
m = df_macro["macro_shock_score"].to_numpy(dtype=float)
std = m.std(ddof=0) if m.size else 0.0
if std > 0:
    df_macro["macro_shock_score_z"] = (m - m.mean()) / std
else:
    df_macro["macro_shock_score_z"] = np.zeros(len(df_macro))

# Left merges keep every feature row; groups without a growth or macro row
# get NaN in the corresponding columns.
df_feat = df_feat.merge(
    df_growth,
    on=_KEY_COLS,
    how="left",
)

df_feat = df_feat.merge(
    df_macro[_MACRO_COLS + ["macro_shock_score_z"]],
    on=_KEY_COLS,
    how="left",
)

print("✅ Grafikler kaydedildi klasöre:", OUT_DIR)
df_feat.head()
