In [None]:
# === Cell 1: imports & paths ===
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Directory used by the predictive-model step; the path is relative, so it is
# resolved against the notebook's current working directory.
DATA_DIR = Path("../../01_data", "predictive_model")
# Parquet file containing the corpus that is loaded in the following cell.
CORPUS_PATH = DATA_DIR.joinpath("df_auto_corpus_area_tech_free_clean_baai.parquet")



In [None]:
# === Cell 2: load + basic filters (paper/patent + date) ===

# Single pipeline: read the corpus, keep only papers and patents, drop rows
# without a year or month, cast both to int, and derive a `date` column that
# points at the first day of the record's month.
df = (
    pd.read_parquet(CORPUS_PATH)
    # keep papers and patents only
    .loc[lambda frame: frame["source_type"].isin(["paper", "patent"])]
    # discard rows where year or month is missing
    .dropna(subset=["year", "month"])
    .astype({"year": int, "month": int})
    # date set to the first day of the month
    .assign(
        date=lambda frame: pd.to_datetime(
            {"year": frame["year"], "month": frame["month"], "day": 1}
        )
    )
)

df.head()



In [None]:
# === Cell 3: area-tech-date level time series (df_ts) ===

key_cols = ["auto_focus_area", "auto_tech_cluster", "date"]

# Number of records per (focus area, tech cluster, month, source type).
counts = df.groupby(key_cols + ["source_type"]).size()

# Long-format view of the same counts, kept under its original name `g`
# in case another cell still refers to it.
g = counts.reset_index(name="n")

# Wide format: one row per (focus area, tech cluster, month) and one count
# column per source type. The counts are unstacked directly rather than passed
# through pivot_table, whose default aggfunc is "mean": that would aggregate
# the counts a second time and can hand them back as floats. With
# fill_value=0 the missing combinations become 0 and the columns stay integer.
pivot = counts.unstack("source_type", fill_value=0).reset_index()
pivot.columns.name = None

# Make sure both count columns exist even if one source type never occurs.
for col in ["paper", "patent"]:
    if col not in pivot.columns:
        pivot[col] = 0

pivot["n_total"] = pivot["paper"] + pivot["patent"]

# Shares fall back to 0.0 where n_total is 0 so they never end up NaN/inf.
has_records = pivot["n_total"] > 0
pivot["share_paper"] = np.where(has_records, pivot["paper"] / pivot["n_total"], 0.0)
pivot["share_patent"] = np.where(has_records, pivot["patent"] / pivot["n_total"], 0.0)

# NOTE(review): rows presumably exist only for key combinations that occur in
# df, i.e. months without any record are absent rather than zero-filled —
# confirm that downstream trend analysis does not need a gap-free monthly index.
df_ts = pivot.copy()
df_ts.head()


In [None]:
# === Cell 13: save outputs ===

# Build the destination once so the file written and the path printed
# cannot drift apart.
ts_out_path = DATA_DIR / "area_tech_timeseries.parquet"

# Write the time-series frame without its row index.
df_ts.to_parquet(ts_out_path, index=False)

print(ts_out_path)



In [None]:
from predictive_components.analytics import (
    load_area_tech_ts,
    get_fastest_growing_topics,
    get_transitioning_technologies,
    get_likely_to_mature_next_year,
    plot_simple_timeseries,
)


In [None]:
# Rebind df_ts to whatever load_area_tech_ts() returns. This replaces the
# frame assembled earlier in this notebook under the same name.
# NOTE(review): presumably the helper reads the area_tech_timeseries.parquet
# file written by the save cell — confirm in predictive_components.analytics.
df_ts = load_area_tech_ts()
# Preview the first rows of the loaded object (assumed to be a DataFrame).
df_ts.head()
