# Project 3 — API ETL (Raw Snapshot → Clean → Report)

**Audience:** Technical team

This notebook demonstrates a lightweight extraction pipeline:
- Extract from a public API
- Persist a raw snapshot for reproducibility (raw zone)
- Transform/clean into an analysis-ready DataFrame
- Produce a small report

The notebook includes a small hard-coded fallback sample so that it stays runnable even without internet access.

In [ ]:
import json
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import requests

In [ ]:
# Endpoint queried by the extract step.
API_URL = "https://jsonplaceholder.typicode.com/posts"

# Raw zone: directory that receives the unmodified API payloads.
RAW_DIR = Path("raw_zone")
RAW_DIR.mkdir(parents=False, exist_ok=True)

def fetch_posts():
    """Extract step: fetch posts from the API with a safe fallback.

    Returns:
        The decoded JSON payload from ``API_URL`` when the request succeeds.
        When the request fails (connection error, timeout, HTTP error status)
        or the response body is not valid JSON, a hard-coded two-record
        sample list is returned instead so the pipeline can still run.
    """
    print("[EXTRACT] Fetching posts...")
    try:
        resp = requests.get(API_URL, timeout=15)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # Only transport/HTTP failures and undecodable bodies trigger the
        # fallback; any other exception is a programming error and should
        # surface instead of being masked by sample data.
        print("[EXTRACT] API not reachable, using fallback sample. Error:", str(e))
        return [
            {"userId": 1, "id": 1, "title": "sample title", "body": "sample body"},
            {"userId": 1, "id": 2, "title": "another title", "body": "another body"},
        ]

    print("[EXTRACT] Records:", len(data))
    return data

def save_raw_snapshot(data):
    """Persist the raw payload as JSON for reproducibility and auditability.

    Args:
        data: JSON-serialisable payload to store.

    Returns:
        Path of the snapshot file written under ``RAW_DIR``. The file name
        carries a UTC timestamp with one-second resolution, so two snapshots
        taken within the same second share a name and the later one
        overwrites the earlier.
    """
    # datetime.utcnow() is deprecated since Python 3.12; a timezone-aware UTC
    # timestamp formats to exactly the same string.
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    path = RAW_DIR / f"posts_{ts}.json"
    path.write_text(json.dumps(data, indent=2), encoding="utf-8")
    print("[RAW] Snapshot saved:", path)
    return path


## Transform
Add derived features and apply basic quality checks.

In [ ]:
def transform_posts(data):
    """Transform step: build a cleaned DataFrame with derived length features.

    Args:
        data: Sequence of post records (dicts) carrying ``userId``, ``id``,
            ``title`` and ``body`` keys. May be empty.

    Returns:
        A DataFrame whose ``title`` and ``body`` columns are whitespace-
        stripped strings (missing values become empty strings), with
        ``title_len`` and ``body_len`` columns holding their character counts.
    """
    print("[TRANSFORM] Building dataframe...")
    df = pd.DataFrame(data)
    if df.columns.empty:
        # An empty payload produces a frame without columns; give it the
        # expected schema so the column lookups below do not raise KeyError.
        df = pd.DataFrame(columns=["userId", "id", "title", "body"])
    print("[TRANSFORM] Shape:", df.shape)

    text_cols = ["title", "body"]
    if df[text_cols].isna().any().any():
        print("[DQ] WARNING: missing title/body values replaced with empty strings")

    # Clean strings + derive simple features. Missing values are blanked
    # first, because astype(str) would otherwise turn them into the literal
    # text "None"/"nan" and report a bogus length.
    for col in text_cols:
        df[col] = df[col].fillna("").astype(str).str.strip()
        df[f"{col}_len"] = df[col].str.len()

    # Basic data quality checks
    if df["id"].isna().any():
        print("[DQ] WARNING: null ids detected")
    if not df["id"].is_unique:
        print("[DQ] WARNING: duplicate ids detected")
    else:
        print("[DQ] id uniqueness OK")

    return df


## Report
Aggregate by `userId` as a simple decision-support output.

In [ ]:
def build_report(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the cleaned posts into one row of metrics per user.

    Args:
        df: Frame with ``userId``, ``id``, ``title_len`` and ``body_len``
            columns.

    Returns:
        A DataFrame with columns ``userId``, ``posts`` (count of ``id``),
        ``avg_title_len`` and ``avg_body_len`` (means), ordered by ``posts``
        descending. Users with the same post count are ordered by ``userId``
        ascending.
    """
    print("[REPORT] Building aggregated metrics...")
    per_user = df.groupby("userId").agg(
        posts=("id", "count"),
        avg_title_len=("title_len", "mean"),
        avg_body_len=("body_len", "mean"),
    )
    # Sorting on "posts" alone uses a non-stable algorithm, so users with the
    # same post count would appear in arbitrary order; the secondary key
    # makes the report deterministic.
    return per_user.reset_index().sort_values(
        ["posts", "userId"], ascending=[False, True]
    )

def run_api_etl():
    """Run the pipeline end to end: extract, snapshot, transform, report.

    Returns:
        Tuple of the cleaned DataFrame and the per-user report DataFrame.
    """
    payload = fetch_posts()
    snapshot_path = save_raw_snapshot(payload)

    cleaned = transform_posts(payload)
    report = build_report(cleaned)

    # Show only the first ten rows of the report.
    print("[REPORT] Top users:")
    top_rows = report.head(10)
    display(top_rows)

    print("[DONE] Raw snapshot path:", snapshot_path)
    return cleaned, report

# Execute the full pipeline and keep the cleaned DataFrame and the report
# as top-level variables.
df, rep = run_api_etl()
