In [None]:
# Google Trends (daily, 2020 -> today) for BTC & crypto — robust against 429

from pytrends.request import TrendReq
import pytrends.exceptions as pte
import pandas as pd
from datetime import datetime, timedelta, timezone
from pathlib import Path
import time, random

# -------- config --------
# Search terms; each one is fetched separately and becomes one column/CSV.
KEYWORDS    = ["Bitcoin", "BTC", "crypto"]
# Inclusive date range of the series, as tz-aware UTC midnights.
START_DATE  = pd.Timestamp("2020-01-01", tz="UTC")
# Today's UTC calendar date at 00:00 UTC (evaluated once, when this cell runs).
END_DATE    = pd.Timestamp(datetime.now(timezone.utc).date(), tz="UTC")
# Maximum span, in days, of a single request window.
CHUNK_DAYS  = 150          # keep ≤270 for daily; smaller chunk = fewer 429s
BASE_PAUSE  = 3.5          # base delay between requests (seconds)
# Attempts made inside the retry loop; one extra final attempt follows it.
MAX_RETRIES = 6            # per request/window
# Output folder, resolved relative to the current working directory.
OUT_DIR     = Path("../data/google-trends/")
# Created eagerly (including parents) so the later to_csv calls cannot fail on a missing folder.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# ------------------------

def new_client():
    """Create a fresh TrendReq client (hl="en-US", tz=0) with a browser-like User-Agent.

    pytrends' own urllib3 Retry is switched off (retries=0, backoff_factor=0.0)
    to avoid the method_whitelist arg; retries and backoff are handled by
    _iot_with_retry() instead.
    """
    options = dict(
        hl="en-US",
        tz=0,
        retries=0,            # pytrends-level retries disabled (was 2)
        backoff_factor=0.0,   # pytrends-level backoff disabled (was 0.4)
        timeout=(5, 60),      # presumably (connect, read) seconds — confirm against pytrends/requests
        requests_args={"headers": {"User-Agent": "Mozilla/5.0"}},
    )
    return TrendReq(**options)

# Shared module-level client. fetch_daily_keyword() hands it to _iot_with_retry(),
# which rebinds this global to a new client on every second 429 backoff.
pytrends = new_client()

def _pause(extra=0.0):
    """Sleep for BASE_PAUSE + `extra` seconds plus 0.3–1.0 s of random jitter.

    The jitter keeps successive requests from landing on a fixed cadence.
    """
    jitter = random.uniform(0.3, 1.0)
    time.sleep(BASE_PAUSE + extra + jitter)

def _iot_with_retry(client, timeframe, kw):
    """Fetch interest_over_time for one keyword/timeframe, retrying on pytrends errors.

    Up to MAX_RETRIES attempts are made in the retry loop. The backoff starts
    at 1.0 s and is multiplied by 1.8 after every handled failure:

    * pte.TooManyRequestsError: sleep backoff + 0.25–0.75 s jitter; on every
      even-numbered attempt both `client` and the module-level `pytrends` are
      replaced by a fresh new_client() to rotate cookies.
    * any other pte.ResponseError: sleep the plain backoff, then retry.

    Exceptions of other types raised inside the loop propagate to the caller.
    After the loop one last attempt is made; if that raises anything, the
    failure is printed and an empty DataFrame is returned for the caller to
    handle.
    """

    def _query(active):
        # One worldwide (geo="") request for the single keyword.
        active.build_payload([kw], timeframe=timeframe, geo="")
        return active.interest_over_time()

    backoff = 1.0
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return _query(client)
        except pte.TooManyRequestsError:
            # rate limited: jittered sleep, and periodically start a new session
            wait = backoff + random.uniform(0.25, 0.75)
            print(f"[429] backoff {attempt}/{MAX_RETRIES} — sleeping {wait:.1f}s")
            time.sleep(wait)
            if attempt % 2 == 0:
                client = new_client()
                globals()["pytrends"] = client
        except pte.ResponseError as e:
            # transient HTTP issue: short un-jittered sleep before retrying
            wait = backoff
            print(f"[WARN] ResponseError attempt {attempt}: {e}; sleep {wait:.1f}s")
            time.sleep(wait)
        # only reached after a handled failure (success returns above)
        backoff *= 1.8

    # last chance; swallow whatever goes wrong and signal failure with an empty frame
    try:
        return _query(client)
    except Exception as e:
        print(f"[FAIL] giving up on timeframe {timeframe} for {kw}: {e}")
        return pd.DataFrame()

def _as_utc(ts) -> pd.Timestamp:
    """Return `ts` as a tz-aware UTC Timestamp (naive input is taken to be UTC)."""
    ts = pd.Timestamp(ts)
    return ts.tz_localize("UTC") if ts.tzinfo is None else ts.tz_convert("UTC")


def _scale_factor(prev_value, next_value) -> float:
    """Return the multiplier that maps `next_value` onto `prev_value`.

    Falls back to 1.0 (no rescaling) unless both values are present and
    strictly positive: a zero reference would zero out the window being
    appended (and every window after it), and a NaN would turn it into NaN.
    """
    if pd.isna(prev_value) or pd.isna(next_value):
        return 1.0
    if prev_value <= 0 or next_value <= 0:
        return 1.0
    return float(prev_value / next_value)


def _stitch_windows(windows) -> pd.Series:
    """Chain non-empty per-window series, rescaling each onto the running result."""
    stitched = windows[0]
    for nxt in windows[1:]:
        overlap = stitched.index.intersection(nxt.index)
        if len(overlap) > 0:
            # anchor on the latest date present in both pieces
            anchor = overlap.max()
            scale = _scale_factor(stitched.loc[anchor], nxt.loc[anchor])
        elif len(stitched) and len(nxt):
            # no shared date (e.g. a window in between failed): fall back to
            # matching the adjacent end points
            scale = _scale_factor(stitched.iloc[-1], nxt.iloc[0])
        else:
            scale = 1.0

        rescaled = nxt * scale
        # on shared dates the newly rescaled window wins
        kept = stitched[~stitched.index.isin(rescaled.index)]
        stitched = pd.concat([kept, rescaled]).sort_index()
    return stitched


def fetch_daily_keyword(keyword: str, start: pd.Timestamp, end: pd.Timestamp) -> pd.Series:
    """
    Pull daily interest for `keyword` between `start` and `end` and stitch it into one series.

    The range is requested in windows of at most CHUNK_DAYS days. Each window
    starts on the date the previous one ended, so consecutive windows share
    that boundary date; the later window is rescaled so both agree on it.
    Windows that return no data are reported and skipped.

    Parameters
    ----------
    keyword : search term to request.
    start, end : inclusive bounds of the range. Naive timestamps are treated
        as UTC when the result is clipped.

    Returns
    -------
    float64 Series named `keyword`, indexed by UTC-normalized dates and
    clipped to [start, end]. Empty if no window returned data (including
    when `start >= end`, in which case no request is made).
    """
    series_list = []
    t0 = start

    while t0 < end:
        t1 = min(t0 + timedelta(days=CHUNK_DAYS), end)
        tf = f"{t0.date()} {t1.date()}"

        df = _iot_with_retry(pytrends, tf, keyword)
        if df.empty or keyword not in df.columns:
            print(f"[WARN] No data for window {tf} / {keyword}")
        else:
            s = df[keyword].copy()
            # ensure UTC-normalized daily index
            s.index = pd.to_datetime(s.index).tz_localize("UTC", nonexistent="shift_forward", ambiguous="NaT").normalize()
            series_list.append(s)

        # advance and throttle whether or not the window produced data
        t0 = t1
        _pause()

    if not series_list:
        return pd.Series(name=keyword, dtype="float64")

    stitched = _stitch_windows(series_list)

    # clip to the range the caller asked for (not the module-level defaults)
    lo, hi = _as_utc(start), _as_utc(end)
    stitched = stitched.loc[(stitched.index >= lo) & (stitched.index <= hi)]
    stitched.name = keyword
    return stitched.astype("float64")

# ---- fetch all keywords ----
# Each keyword that yields data is written to its own CSV and kept for the
# combined table; keywords that come back empty are reported and skipped.
all_series = []
for kw in KEYWORDS:
    print(f"Fetching daily trends for: {kw}")
    s = fetch_daily_keyword(kw, START_DATE, END_DATE)
    if not s.empty:
        s.to_frame().to_csv(OUT_DIR / f"google_trends_{kw.lower().replace(' ', '_')}_daily.csv")
        all_series.append(s)
    else:
        print(f"[WARN] Empty series for {kw}")

# ---- combined wide CSV ----
# One column per fetched keyword, aligned on the date index.
if not all_series:
    print("No series fetched.")
else:
    out_path = OUT_DIR / "google_trends_btc_crypto_daily_wide.csv"
    wide = pd.concat(all_series, axis=1).sort_index()
    wide.index.name = "date"
    wide.to_csv(out_path)
    print(f"Saved combined wide CSV -> {out_path} ({len(wide)} rows, {wide.shape[1]} cols)")
    # `display` is the notebook (IPython) rich-output helper
    display(wide.tail())