# 3. Economic Development and Health Expenditure

## 3.1. Data Labeling

In [26]:
# --- Load data ---
# `metadata` is read from the Excel workbook and `df` from the CSV export;
# both paths are absolute and machine-specific.
metadata = pd.read_excel(
    io="/Users/yujin.sophia.kim/Desktop/data/WDI/archive/WDI_Indicators_Metadata.xlsx",
)
df = pd.read_csv(
    filepath_or_buffer="/Users/yujin.sophia.kim/Desktop/data/WDI/archive/WDI_Indicators_MainData.csv",
)

In [27]:
# --- Select & rename ---
# One mapping drives both the column selection and the rename, so the raw
# indicator labels are spelled out only once.
_health_cols = {
    "Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS]": "GDP_Total_HealthExp",
    "Domestic general government health expenditure (% of GDP) [SH.XPD.GHED.GD.ZS]": "GDP_GovHealthExp",
}
df = df[["Country Name", "Time", *_health_cols]].copy().rename(columns=_health_cols)

# Convert the indicator columns to numeric; unparseable entries become NaN.
_value_cols = list(_health_cols.values())
df[_value_cols] = df[_value_cols].apply(pd.to_numeric, errors="coerce")

# Year as a nullable integer so rows whose year failed to parse stay missing.
df["Time"] = pd.to_numeric(df["Time"], errors="coerce").astype("Int64")

## 3.2. Regressions

In [28]:
# --- Line charts ---
# px.line connects points in the order the rows appear, so order the rows by
# country and year first (as the time-series section does); otherwise an
# unsorted export would draw lines that jump back and forth along the x-axis.
_line_df = df.sort_values(["Country Name", "Time"])

fig1 = px.line(
    _line_df, x="Time", y="GDP_Total_HealthExp", color="Country Name",
    title="Total Health Expenditure (% of GDP) by Country",
    labels={"GDP_Total_HealthExp": "% of GDP", "Time": "Year"}
)
fig1.show()

fig2 = px.line(
    _line_df, x="Time", y="GDP_GovHealthExp", color="Country Name",
    title="Government Health Expenditure (% of GDP) by Country",
    labels={"GDP_GovHealthExp": "% of GDP", "Time": "Year"}
)
fig2.show()


# --- Scatter: total vs. government health expenditure (country means) ---
# Average each indicator per country, then keep only countries that have a
# mean for both indicators.
_exp_cols = ["GDP_Total_HealthExp", "GDP_GovHealthExp"]
country_avg = df.groupby("Country Name")[_exp_cols].mean(numeric_only=True)
country_avg = country_avg.dropna().reset_index()

# Correlation between the two country-level means, shown in the chart title.
corr_val = country_avg["GDP_Total_HealthExp"].corr(country_avg["GDP_GovHealthExp"])

_axis_labels = {
    "GDP_Total_HealthExp": "Total Health Expenditure (% of GDP)",
    "GDP_GovHealthExp": "Government Health Expenditure (% of GDP)",
}
fig_scatter = px.scatter(
    country_avg,
    x="GDP_Total_HealthExp",
    y="GDP_GovHealthExp",
    # Label points via the column name rather than passing an index array.
    text="Country Name",
    # Remove this argument if statsmodels is not installed.
    trendline="ols",
    title=f"Total vs Government Health Expenditure (% of GDP) — r = {corr_val:.2f}",
    labels=_axis_labels,
)
fig_scatter.update_traces(textposition="top center")
fig_scatter.show()



## 3.3. Time Series Analysis

In [29]:
# --- 0) Safe rename: locate the raw column by indicator code and map it ---
def _ensure_col(df: "pd.DataFrame", code: str, target: str) -> "pd.DataFrame":
    """Return *df* with the column whose label contains *code* renamed to *target*.

    If *target* is already a column, or no column label contains *code*,
    *df* is returned unchanged (the same object). When several labels contain
    *code*, only the first one in column order is renamed.

    Args:
        df: Frame whose columns are searched.
        code: Substring to look for in the column labels.
        target: New name for the matching column.

    Returns:
        The original frame, or a renamed copy produced by ``DataFrame.rename``.
    """
    if target in df.columns:
        return df

    # Only string labels can contain the code; skipping other label types
    # (e.g. integer year columns) avoids a TypeError from the `in` test.
    matches = [c for c in df.columns if isinstance(c, str) and code in c]
    if matches:
        df = df.rename(columns={matches[0]: target})
    return df

# Apply the code-based rename for each indicator in turn.
for _code, _target in (
    ("SH.XPD.CHEX.GD.ZS", "GDP_Total_HealthExp"),
    ("SH.XPD.GHED.GD.ZS", "GDP_GovHealthExp"),
):
    df = _ensure_col(df, _code, _target)

# Fail early, naming every column the rest of this cell needs but cannot find.
_required = ("Country Name", "Time", "GDP_Total_HealthExp", "GDP_GovHealthExp")
missing = [name for name in _required if name not in df.columns]
if missing:
    raise KeyError(f"누락 컬럼: {missing}. 원본 컬럼명을 확인하세요.")

# --- 1) Period filter + dtype cleanup ---
# `assign` builds the cleaned year column on a new frame, leaving the frame
# previously bound to `df` untouched.
df = df.assign(Time=pd.to_numeric(df["Time"], errors="coerce").astype("Int64"))

# Copy *after* filtering: the column assignment below then targets an
# independent frame instead of a filtered slice, which would otherwise raise
# SettingWithCopyWarning.
df = df[df["Time"].between(2011, 2021)].copy()

# --- 2) Derived: government share (% of current health expenditure) ---
# Mask zero totals so the ratio is missing rather than +/-inf, which would
# distort the share chart's axis.
_total = df["GDP_Total_HealthExp"]
df["GovShare_CHE_pct"] = 100 * df["GDP_GovHealthExp"] / _total.where(_total != 0)

# Rows ordered by country then year, so each country's line is drawn
# chronologically.
base = df.sort_values(["Country Name", "Time"])

# --- 3) Time-series line charts ---
# Arguments shared by all three charts: year on the x-axis, one coloured line
# per country, with a marker at every observation.
_ts_kwargs = dict(x="Time", color="Country Name", markers=True)

# Total health expenditure as a share of GDP.
fig1 = px.line(
    base,
    y="GDP_Total_HealthExp",
    title="Total Health Expenditure (% of GDP), 2011–2021",
    labels={"GDP_Total_HealthExp": "Total Health Expenditure (% of GDP)"},
    **_ts_kwargs,
)
fig1.show()

# Government health expenditure as a share of GDP.
fig2 = px.line(
    base,
    y="GDP_GovHealthExp",
    title="Government Health Expenditure (% of GDP), 2011–2021",
    labels={"GDP_GovHealthExp": "Government Health Expenditure (% of GDP)"},
    **_ts_kwargs,
)
fig2.show()

# Government share of total health expenditure.
fig3 = px.line(
    base,
    y="GovShare_CHE_pct",
    title="Government Share of Health Expenditure (%), 2011–2021",
    labels={"GovShare_CHE_pct": "Government share of HE (%)"},
    **_ts_kwargs,
)
fig3.show()
