In [None]:
# === Cell 1: imports & paths ===
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Data directory (relative to this notebook) and the input corpus file inside it.
DATA_DIR = Path("../../01_data", "predictive_model")
CORPUS_PATH = DATA_DIR.joinpath("df_auto_corpus_area_tech.parquet")


In [None]:
# === Cell 2: load + basic filters (paper/patent + date) ===

# Read the corpus, keep only papers and patents, drop rows without a
# publication year or month, and store year/month as integers.
df = pd.read_parquet(CORPUS_PATH)
df = (
    df.loc[df["source_type"].isin(["paper", "patent"])]
      .dropna(subset=["year", "month"])
      .astype({"year": int, "month": int})
)

# Month-level timestamp: the first day of the publication month.
df["date"] = pd.to_datetime(df[["year", "month"]].assign(day=1))

df.head()



In [None]:
# === Cell 3: area-tech-date level time series (df_ts) ===

# Document counts per (focus area, tech cluster, month, source type).
g = (
    df.groupby(["auto_focus_area", "auto_tech_cluster", "date", "source_type"])
      .size()
      .reset_index(name="n")
)

# One row per (area, tech, month) with one count column per source type.
# Every (row, source_type) cell is fed by exactly one count from `g`, so
# pivot_table's default aggregation leaves it unchanged; combinations that
# never occur are filled with 0.
pivot = g.pivot_table(
    index=["auto_focus_area", "auto_tech_cluster", "date"],
    columns="source_type",
    values="n",
    fill_value=0,
).reset_index()

pivot.columns.name = None

# Make sure both count columns exist even if one source type is absent.
for col in ("paper", "patent"):
    if col not in pivot.columns:
        pivot[col] = 0

# Monthly total and the share of each source type (0.0 where the total is 0).
# NOTE(review): only months that appear in `df` get a row here; months with
# no documents are presumably missing rather than zero - confirm that the
# downstream index-based time axis is meant to skip them.
pivot["n_total"] = pivot["paper"].add(pivot["patent"])
pivot["share_paper"] = pivot["paper"].div(pivot["n_total"]).where(pivot["n_total"] > 0, 0.0)
pivot["share_patent"] = pivot["patent"].div(pivot["n_total"]).where(pivot["n_total"] > 0, 0.0)

df_ts = pivot.copy()
df_ts.head()


In [None]:
# === Cell 4: helper functions – prepare_time_series & linear_forecast ===

def prepare_time_series(group_df: pd.DataFrame, value_col: str):
    """Turn one group's rows into regression inputs.

    Rows are ordered by ``date`` and rows with a missing date or value are
    dropped. The time axis is the 0-based position of each remaining row,
    not the calendar distance between observations.

    Returns:
        ``(t, y)`` where ``t`` has shape ``(n, 1)`` and ``y`` is a float
        array of length ``n``; ``(None, None)`` when fewer than 4 usable
        rows remain.
    """
    series = group_df.sort_values("date")[["date", value_col]].dropna()
    n_obs = len(series)

    if n_obs < 4:
        return None, None

    time_index = np.arange(n_obs).reshape(-1, 1)
    values = series[value_col].to_numpy(dtype=float)
    return time_index, values


def linear_forecast(group_df: pd.DataFrame, value_col: str, horizon: int = 12):
    """Extrapolate ``value_col`` with a straight line fitted on the series.

    Args:
        group_df: rows of a single series; must contain ``date`` and
            ``value_col``.
        value_col: column to forecast.
        horizon: number of future monthly steps to predict.

    Returns:
        ``(history, forecast_df)``. ``history`` is ``group_df`` sorted by
        date. ``forecast_df`` has a ``date`` column (month starts following
        the last observed date) and ``forecast_<value_col>``; it is ``None``
        when ``prepare_time_series`` finds too few points.
    """
    history = group_df.sort_values("date")
    time_index, values = prepare_time_series(history, value_col)
    if time_index is None:
        return history, None

    trend = LinearRegression().fit(time_index, values)

    # Continue the positional time axis right after the last fitted step.
    next_step = time_index[-1, 0] + 1
    future_steps = np.arange(next_step, next_step + horizon).reshape(-1, 1)
    predicted = trend.predict(future_steps)

    forecast_start = history["date"].max() + pd.offsets.MonthBegin(1)
    forecast_df = pd.DataFrame({
        "date": pd.date_range(start=forecast_start, periods=horizon, freq="MS"),
        f"forecast_{value_col}": predicted,
    })

    return history, forecast_df


In [None]:
# === Cell 5: logistic (S-curve) forecast ===
from scipy.optimize import curve_fit

def _logistic_fn(t, K, r, t0):
    """Logistic curve ``K / (1 + exp(-r * (t - t0)))`` evaluated at ``t``."""
    decay = np.exp(-r * (t - t0))
    return K / (1.0 + decay)


def logistic_forecast(group_df: pd.DataFrame,
                      value_col: str,
                      horizon: int = 12):
    """
    Logistic (S-curve) fit + forecast for columns such as 'n_total' or
    'share_patent'.

    The series is min-max scaled to [0, 1] before fitting and the scaled
    prediction is clipped to [0, 1], so the forecast stays inside the
    observed value range.

    Returns ``(history, forecast_df)``:
      - ``forecast_df`` is ``None`` when the series is too short;
      - on success it has ``date`` and ``logistic_<value_col>`` columns;
      - when the series is constant, or anything inside the fit raises, the
        result of ``linear_forecast`` is returned instead (its value column
        is named ``forecast_<value_col>``).
    """
    history = group_df.sort_values("date")
    steps, values = prepare_time_series(history, value_col)
    if steps is None:
        return history, None

    steps = steps.astype(float).ravel()

    lo, hi = float(values.min()), float(values.max())
    if hi == lo:
        # A constant series cannot be min-max scaled.
        return linear_forecast(history, value_col, horizon=horizon)

    scaled = (values - lo) / (hi - lo)

    # Initial guess: capacity 1 (scaled units), moderate rate, midpoint at
    # the median time step.
    initial_guess = [1.0, 0.3, np.median(steps)]

    try:
        (K, r, t0), _ = curve_fit(
            _logistic_fn,
            steps,
            scaled,
            p0=initial_guess,
            maxfev=10000,
        )

        next_step = steps[-1] + 1
        future_steps = np.arange(next_step, next_step + horizon)

        # Evaluate over history + future, then keep only the future tail.
        all_steps = np.concatenate([steps, future_steps])
        scaled_pred = np.clip(_logistic_fn(all_steps, K, r, t0), 0.0, 1.0)
        future_values = (lo + scaled_pred * (hi - lo))[len(steps):]

        forecast_start = history["date"].max() + pd.offsets.MonthBegin(1)
        forecast_df = pd.DataFrame({
            "date": pd.date_range(start=forecast_start, periods=horizon, freq="MS"),
            f"logistic_{value_col}": future_values,
        })

        return history, forecast_df

    except Exception:
        # Best-effort: any failure in the fit or forecast falls back to linear.
        return linear_forecast(history, value_col, horizon=horizon)



In [None]:
# === Cell 6: four curves for one example area+tech (raw, EWMA, linear, logistic) ===

# The example series is whichever (area, tech) pair sits in the first row of df_ts.
sample = df_ts.iloc[0]
tmp = df_ts[
    (df_ts["auto_focus_area"] == sample["auto_focus_area"]) &
    (df_ts["auto_tech_cluster"] == sample["auto_tech_cluster"])
].copy()

actual_lin, forecast_lin = linear_forecast(tmp, "n_total", horizon=12)
actual_log, forecast_log = logistic_forecast(tmp, "n_total", horizon=12)

span = 6  # EWMA span, in months
actual_lin = actual_lin.sort_values("date")
actual_lin["n_total_ewma"] = actual_lin["n_total"].ewm(span=span, adjust=False).mean()

plt.figure(figsize=(9, 4))

plt.plot(actual_lin["date"], actual_lin["n_total"],
         alpha=0.3, label="actual volume (raw)")

plt.plot(actual_lin["date"], actual_lin["n_total_ewma"],
         linewidth=2, label="actual volume (EWMA)")

# Either forecast is None when the series is too short to fit.
if forecast_lin is not None:
    plt.plot(forecast_lin["date"], forecast_lin["forecast_n_total"],
             linestyle="--", label="linear forecast")

# logistic_forecast may have fallen back to the linear model, in which case
# its frame has no "logistic_n_total" column and nothing extra is drawn.
if forecast_log is not None and "logistic_n_total" in forecast_log.columns:
    plt.plot(forecast_log["date"], forecast_log["logistic_n_total"],
             linestyle=":", label="logistic (S-curve) forecast")

# Say which series is shown: the sample is just the first row of df_ts.
plt.title(f"{sample['auto_focus_area']} / {sample['auto_tech_cluster']} – monthly documents")
plt.xlabel("Date")
plt.ylabel("Documents per month")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# === Cell 7: global volume + EWMA + quarter markers ===

# Total number of documents per month across all areas and tech clusters.
df_global = (
    df.groupby("date")
      .size()
      .reset_index(name="n_total")
      .sort_values("date")
)

df_global["n_total_ewma"] = df_global["n_total"].ewm(span=6, adjust=False).mean()

plt.figure(figsize=(8, 4))
plt.plot(df_global["date"], df_global["n_total"], alpha=0.3, label="global raw")
plt.plot(df_global["date"], df_global["n_total_ewma"], linewidth=2, label="global EWMA")

# Mark every quarter-start month present in the data with a red tick drawn
# at 95% of the EWMA minimum.
quarters = df_global[df_global["date"].dt.month.isin([1, 4, 7, 10])]
plt.scatter(
    quarters["date"],
    [df_global["n_total_ewma"].min() * 0.95] * len(quarters),
    color="red",
    s=30,
    marker="|",
    label="quarter start"
)

plt.title("Global monthly document volume")
plt.xlabel("Date")
plt.ylabel("Documents per month")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# === Cell 8: compute growth (volume slope) ===

def estimate_growth(df_ts: pd.DataFrame):
    """Fit a linear trend to ``n_total`` for every (area, tech) series.

    Series for which ``prepare_time_series`` returns no data (too few
    points) are left out of the result.

    Returns:
        DataFrame with one row per fitted series and the columns
        ``auto_focus_area``, ``auto_tech_cluster``, ``growth_slope_n_docs``
        (slope per positional time step) and ``n_docs_last`` (last value of
        the fitted series).
    """
    records = []
    grouped = df_ts.groupby(["auto_focus_area", "auto_tech_cluster"])

    for (area, tech), series_df in grouped:
        steps, counts = prepare_time_series(series_df, "n_total")
        if steps is None:
            continue

        fitted = LinearRegression().fit(steps, counts)

        records.append(dict(
            auto_focus_area=area,
            auto_tech_cluster=tech,
            growth_slope_n_docs=float(fitted.coef_[0]),
            n_docs_last=counts[-1],
        ))

    return pd.DataFrame(records)

# Growth slope for every (area, tech) series; preview the 20 steepest slopes.
df_growth = estimate_growth(df_ts)
df_growth.sort_values("growth_slope_n_docs", ascending=False).head(20)



In [None]:
# === New: stage classification based on 1st and 2nd derivatives ===

def _compute_trend_features(g: pd.DataFrame,
                            window: int = 6):
    """
    Summarise the recent trend of the patent share of one (area, tech) series.

    g: time series of a single (area, tech) pair
       columns: ["date", "share_patent"]
    window: number of most recent points (months) averaged for the slope

    Returns ``(last_share, d1_recent, d2_recent)``:
      - last_share: latest patent share
      - d1_recent : mean step-to-step change over the last ``window`` steps
      - d2_recent : d1_recent minus the mean change over the preceding
                    window (or over everything before the last window when
                    the series is shorter than two windows)
    Returns ``None`` when the series has fewer than ``window + 3`` points.
    """
    shares = pd.Series(
        g.sort_values("date")["share_patent"].astype(float).values
    )

    if len(shares) < window + 3:
        # Series too short for a reliable trend.
        return None

    # First derivative approximated by consecutive differences.
    slopes = shares.diff()
    recent_slope = slopes.iloc[-window:].mean()

    # Previous window if available, otherwise all differences before the
    # most recent window.
    prev_start = -2 * window if len(slopes) >= 2 * window else None
    previous_slope = slopes.iloc[prev_start:-window].mean()

    # Second derivative approximated by the change in the average slope.
    return shares.iloc[-1], recent_slope, recent_slope - previous_slope


def _classify_stage_d1d2(last_share: float,
                         d1_recent: float,
                         d2_recent: float,
                         min_app: float = 0.7,
                         max_research: float = 0.3,
                         d1_small: float = 0.01):
    """
    Map trend features of the patent share to a technology stage label.

    last_share  : latest patent share
    d1_recent   : mean 1st derivative over the most recent window
    d2_recent   : change in the 1st derivative (2nd derivative)
    min_app     : share at or above which the series counts as application-heavy
    max_research: share at or below which the series counts as research-heavy
    d1_small    : threshold for treating a slope as 'approximately zero'

    Rules are checked in order and the first match wins; anything that
    matches no rule is labelled "Transitioning".
    """
    high_share = last_share >= min_app

    rules = (
        # Sharp decline: high patent share, negative slope, non-positive acceleration.
        ("Over-Mature", high_share and d1_recent < -d1_small and d2_recent <= 0),
        # Strong application phase: high share with a flat or positive slope.
        ("Application Now", high_share and d1_recent >= -d1_small),
        # Still research: low share with a weak or negative slope.
        ("Still Research", last_share <= max_research and d1_recent <= d1_small),
    )

    # Everything else (including a clearly positive slope) is in transition.
    return next((label for label, matched in rules if matched), "Transitioning")


def forecast_paper_to_patent_shift_d1d2(df_ts: pd.DataFrame,
                                        window: int = 6,
                                        min_total_docs: int = 10) -> pd.DataFrame:
    """
    Classify every (area, tech) series into a technology stage from the
    recent 1st/2nd derivative of its patent share.

    df_ts is expected to have at least these columns:
      - auto_focus_area
      - auto_tech_cluster
      - date
      - paper
      - patent
      - n_total
      - share_patent

    Series with fewer than ``min_total_docs`` documents in total, or too
    short for ``_compute_trend_features``, are skipped.

    Returns one row per classified series with the columns
    auto_focus_area, auto_tech_cluster, last_share_patent, d1_recent,
    d2_recent and tech_stage.
    """
    records = []

    for (area, tech), series_df in df_ts.groupby(["auto_focus_area", "auto_tech_cluster"]):

        # Total number of documents (paper + patent) in this series.
        monthly_docs = (
            series_df["n_total"]
            if "n_total" in series_df.columns
            else series_df["paper"] + series_df["patent"]
        )
        if monthly_docs.sum() < min_total_docs:
            continue

        features = _compute_trend_features(series_df[["date", "share_patent"]], window=window)
        if features is None:
            continue

        last_share, d1_recent, d2_recent = features

        records.append({
            "auto_focus_area": area,
            "auto_tech_cluster": tech,
            "last_share_patent": last_share,
            "d1_recent": d1_recent,
            "d2_recent": d2_recent,
            "tech_stage": _classify_stage_d1d2(last_share, d1_recent, d2_recent),
        })

    return pd.DataFrame(records)



# Usage: classify every (area, tech) series and preview the result ordered by stage.
df_transition = forecast_paper_to_patent_shift_d1d2(df_ts, window=6, min_total_docs=10)
df_transition.sort_values("tech_stage").head(40)




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from statsmodels.nonparametric.smoothers_lowess import lowess

# === 0) Input table for the LOWESS plots ===
# Reuse the monthly (area, tech) table built in Cell 3 under a cell-private
# name. This cell used to re-read the parquet and rebind `df_ts` to a
# 4-column frame, which removed `share_patent` (and paper/patent) from the
# table that the feature cell and the save cell read afterwards.
df_ts_lowess = df_ts[
    ["auto_focus_area", "auto_tech_cluster", "date", "n_total"]
].copy()

# === 1) LOWESS + derivative plots ===
OUT_DIR = Path("tech_lowess_hype")
OUT_DIR.mkdir(exist_ok=True)

FRAC = 0.3          # LOWESS bandwidth (values around 0.2–0.4 are worth trying)
MIN_POINTS = 10     # minimum number of monthly observations per series


def _safe_filename_part(label) -> str:
    """Replace characters that cannot appear in a file name with '_'."""
    return "".join("_" if ch in '\\/:*?"<>|' else ch for ch in str(label))


def _save_line_plot(x, y, title, ylabel, path):
    """Draw a single line plot against dates and write it to ``path``."""
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel(ylabel)
    fig.savefig(path)
    plt.close(fig)  # release the figure; many are created in the loop


for (area, tech), grp in df_ts_lowess.groupby(["auto_focus_area", "auto_tech_cluster"]):
    grp = grp.sort_values("date")

    if len(grp) < MIN_POINTS:
        continue

    # Time axis: position of each observation (not calendar distance).
    t = np.arange(len(grp), dtype=float)

    # Log of the cumulative document count: log(1 + cum).
    cum_docs = grp["n_total"].cumsum().values.astype(float)
    y = np.log1p(cum_docs)

    # LOWESS smoothing; fitted values are returned in input order.
    y_smooth = lowess(y, t, frac=FRAC, return_sorted=False)

    # Discrete 1st and 2nd derivatives of the smoothed log-cumulative curve.
    d1 = np.diff(y_smooth)
    d2 = np.diff(d1)

    dates = grp["date"].values
    stem = f"{_safe_filename_part(area)}_{_safe_filename_part(tech)}"

    # Cumulative (S-curve), converted back from log scale.
    _save_line_plot(
        dates, np.expm1(y_smooth),
        f"{tech} – Cumulative docs (LOWESS)",
        "Cumulative docs (smoothed)",
        OUT_DIR / f"{stem}_cum_lowess.png",
    )

    # 1st derivative: one point fewer than the series.
    _save_line_plot(
        dates[1:], d1,
        f"{tech} – 1st Derivative (LOWESS)",
        "Δ log-cum docs",
        OUT_DIR / f"{stem}_d1_lowess.png",
    )

    # 2nd derivative: two points fewer than the series.
    _save_line_plot(
        dates[2:], d2,
        f"{tech} – 2nd Derivative (LOWESS)",
        "Δ² log-cum docs",
        OUT_DIR / f"{stem}_d2_lowess.png",
    )

print("✅ LOWESS bazlı cumulative + 1. / 2. türev hype grafikleri:", OUT_DIR)


In [None]:
# === Cell 10: feature table (df_feat) – time series summaries + growth + stage ===

key_cols = ["auto_focus_area", "auto_tech_cluster"]

# One summary record per (area, tech) series.
rows = []
for (area, tech), g in df_ts.groupby(key_cols):
    n_months = len(g)
    if n_months == 0:
        continue

    g = g.sort_values("date")
    docs, share = g["n_total"], g["share_patent"]
    share_std = share.std(ddof=0)

    rows.append({
        "auto_focus_area": area,
        "auto_tech_cluster": tech,
        "n_months": n_months,
        "n_years": n_months / 12.0,
        "total_docs": docs.sum(),
        "mean_docs": docs.mean(),
        "max_docs": docs.max(),
        "last_docs": docs.iloc[-1],
        "mean_share_patent_ts": share.mean(),
        "std_share_patent_ts": 0.0 if pd.isna(share_std) else share_std,
    })

# Attach the growth slope and the stage classification; series missing from
# either table keep NaN in the joined columns (left joins).
df_feat = (
    pd.DataFrame(rows)
    .merge(
        df_growth[key_cols + ["growth_slope_n_docs", "n_docs_last"]],
        on=key_cols,
        how="left",
    )
    .merge(
        df_transition[
            key_cols + ["last_share_patent", "d1_recent", "d2_recent", "tech_stage"]
        ],
        on=key_cols,
        how="left",
    )
)

df_feat.head()



In [None]:
# === Cell 11: "Application pattern" centroid + distance ===

# Numeric features used to describe each (area, tech) series.
# The list previously named "forecast_share_patent_mean" and
# "delta_share_patent", which no cell in this notebook produces, so selecting
# them raised a KeyError. The trend features that the d1/d2 stage classifier
# does write into df_feat (d1_recent, d2_recent) are used instead.
num_cols = [
    "n_months",
    "n_years",
    "total_docs",
    "mean_docs",
    "max_docs",
    "last_docs",
    "mean_share_patent_ts",
    "std_share_patent_ts",
    "growth_slope_n_docs",
    "n_docs_last",
    "last_share_patent",
    "d1_recent",
    "d2_recent",
]

# Missing values come from the left joins in the feature cell; treat them as 0.
df_num = df_feat[num_cols].fillna(0.0)

# Z-score every feature; constant columns get sigma 1 to avoid dividing by 0.
mu = df_num.mean()
sigma = df_num.std(ddof=0).replace(0, 1.0)
df_z = (df_num - mu) / sigma

# Series that the rule-based classifier labelled "Application Now".
mask_app_rule = df_feat["tech_stage"] == "Application Now"
df_app_z = df_z[mask_app_rule]

from numpy.linalg import norm

# Euclidean distance of every series to the centroid of the rule-based
# "Application Now" group, in z-score space.
centroid_app = df_app_z.mean()
distances = norm(df_z.values - centroid_app.values, axis=1)
df_feat["dist_to_app_pattern"] = distances

# Threshold: 75th percentile of the distances inside the reference group.
# With no "Application Now" series the centroid (and every distance) is NaN;
# keep the threshold NaN as well so that nothing is flagged, instead of
# asking numpy for a quantile of an empty array.
d_app = distances[mask_app_rule.to_numpy()]
q75 = np.quantile(d_app, 0.75) if d_app.size else np.nan

df_feat["application_like_by_pattern"] = df_feat["dist_to_app_pattern"] <= q75

df_feat.sort_values("dist_to_app_pattern").head(20)


In [None]:
# === Cell 12: adjusted tech_stage (pattern layer) ===

# Start from the rule-based stage and promote pattern-based candidates.
df_feat["tech_stage_adjusted"] = df_feat["tech_stage"]

# Candidate: close to the "Application Now" centroid, patent share above
# 0.4, and at least 3 years' worth of monthly observations.
mask_candidate = (
    df_feat["application_like_by_pattern"]
    & (df_feat["last_share_patent"] > 0.4)
    & (df_feat["n_years"] >= 3)
)

df_feat.loc[mask_candidate, "tech_stage_adjusted"] = "Application Now"

# Preview. "delta_share_patent" was listed here but is not a column of
# df_feat (selecting it raised a KeyError); the d1/d2 trend features that
# df_feat does contain are shown instead.
df_feat[
    [
        "auto_focus_area",
        "auto_tech_cluster",
        "tech_stage",
        "tech_stage_adjusted",
        "last_share_patent",
        "d1_recent",
        "d2_recent",
        "n_years",
        "dist_to_app_pattern",
    ]
].sort_values("dist_to_app_pattern").head(40)


In [None]:
# === Cell 13: save outputs ===

# Output file name -> table written to it (all under DATA_DIR, without index).
outputs = {
    "area_tech_timeseries.parquet": df_ts,
    "forecast_area_tech_growth.parquet": df_growth,
    "forecast_area_tech_transition.parquet": df_transition,
    "area_tech_features_with_stage.parquet": df_feat,
}

for filename, frame in outputs.items():
    frame.to_parquet(DATA_DIR / filename, index=False)

print("Saved:")
for filename in outputs:
    print(DATA_DIR / filename)


In [None]:
from predictive_components.analytics import (
    load_area_tech_ts,
    get_fastest_growing_topics,
    get_transitioning_technologies,
    get_likely_to_mature_next_year,
    plot_simple_timeseries,
)


In [None]:
# NOTE: rebinds `df_ts`, replacing the table built earlier in this notebook.
# Presumably this loads the saved area/tech time series - confirm against
# predictive_components.analytics.
df_ts = load_area_tech_ts()
df_ts.head()


In [None]:
# Top-20 result of the packaged helper (ranking criterion is defined in
# predictive_components.analytics, not in this notebook).
df_fast = get_fastest_growing_topics(df_ts, top_n=20)
df_fast


In [None]:
# NOTE: rebinds `df_transition`, replacing the table produced earlier by
# forecast_paper_to_patent_shift_d1d2. The returned schema is defined by the
# packaged helper - TODO confirm it matches what downstream code expects.
df_transition = get_transitioning_technologies(df_ts, horizon=12)
df_transition


In [None]:
from predictive_components.analytics import plot_simple_timeseries
import matplotlib.pyplot as plt

# Plot n_total for one hard-coded (area, tech) pair using the packaged helper.
# NOTE(review): assumes this pair exists in df_ts - verify for the loaded data.
fig = plot_simple_timeseries(
    df_ts=df_ts,
    area="Communication_Technologies",
    tech="5G",
    value_col="n_total"
)

plt.show()

In [None]:
# Result of the packaged helper with a 12-step horizon (presumably months,
# matching the monthly series - confirm in predictive_components.analytics).
df_mature_next = get_likely_to_mature_next_year(df_ts, horizon=12)
df_mature_next
