# 4. Employment and Health Expenditure

## 4.1. Data Labeling

In [30]:
# --- Load data ---
# Locations of the WDI indicator metadata workbook and the main indicator table.
metadata_path = "/Users/yujin.sophia.kim/Desktop/data/WDI/archive/WDI_Indicators_Metadata.xlsx"
main_data_path = "/Users/yujin.sophia.kim/Desktop/data/WDI/archive/WDI_Indicators_MainData.csv"

metadata = pd.read_excel(metadata_path)
df = pd.read_csv(main_data_path)

In [31]:
# --- Select & rename (current health expenditure, CHE, included) ---
# Raw WDI column header -> short analysis name. The insertion order of this
# mapping is also the column order of the trimmed frame.
indicator_names = {
    "Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS]": "GDP_Total_HealthExp",
    "Domestic general government health expenditure (% of GDP) [SH.XPD.GHED.GD.ZS]": "GDP_GovHealthExp",
    "Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]": "JOB_Unemp_Total",
    "Unemployment, youth total (% of total labor force ages 15-24) (modeled ILO estimate) [SL.UEM.1524.ZS]": "JOB_Unemp_Youth_Total",
    "Vulnerable employment, total (% of total employment) (modeled ILO estimate) [SL.EMP.VULN.ZS]": "JOB_VulnerableEmployment_Total",
    "Part time employment, total (% of total employment) [SL.TLF.PART.ZS]": "JOB_Part_Time_Total",
    "Self-employed, total (% of total employment) (modeled ILO estimate) [SL.EMP.SELF.ZS]": "JOB_Self_employed_Total",
    "Wage and salaried workers, total (% of total employment) (modeled ILO estimate) [SL.EMP.WORK.ZS]": "JOB_Contracters_Total",
}

# Keep the two key columns plus every mapped indicator, then shorten the names.
df = df[["Country Name", "Time", *indicator_names]].copy()
df = df.rename(columns=indicator_names)

# --- Numeric conversion ---
# errors="coerce" turns values that cannot be parsed into NaN.
for short_name in indicator_names.values():
    df[short_name] = pd.to_numeric(df[short_name], errors="coerce")
df["Time"] = pd.to_numeric(df["Time"], errors="coerce").astype("Int64")

# --- Government health expenditure as a share (%) of current health expenditure ---
df["GovShare_CHE_pct"] = df["GDP_GovHealthExp"].mul(100).div(df["GDP_Total_HealthExp"])

# --- Aggregation for scatter plots (per-country means) ---
scatter_columns = [
    "GDP_GovHealthExp", "JOB_Unemp_Total", "JOB_Unemp_Youth_Total", "JOB_Self_employed_Total",
    "JOB_VulnerableEmployment_Total", "GovShare_CHE_pct",
]
govshare_avg = df.groupby("Country Name", as_index=False)[scatter_columns].mean()

## 4.2. Regression

In [32]:
# --- Government health expenditure vs. employment indicators ---
# The six charts differ only in the y column and its wording, so they are
# described by one table and drawn by one helper.


def _gov_health_scatter(data, number, y_col, y_title, y_label,
                        x_col="GDP_GovHealthExp"):
    """Build one country-level scatter of ``x_col`` against ``y_col``.

    Args:
        data: Frame holding "Country Name", ``x_col`` and ``y_col`` columns.
        number: Ordinal printed at the start of the chart title.
        y_col: Column plotted on the y axis.
        y_title: Wording (with unit) used for ``y_col`` in the chart title.
        y_label: Axis label used for ``y_col``.
        x_col: Column plotted on the x axis.

    Returns:
        ``(country_avg, corr_val, fig)``: the per-country means with rows
        missing either value dropped, the result of ``Series.corr`` between
        the two columns (default method), and the plotly figure.
    """
    country_avg = (
        data.groupby("Country Name")[[x_col, y_col]]
            .mean(numeric_only=True)
            .dropna()
            .reset_index()
    )
    corr_val = country_avg[x_col].corr(country_avg[y_col])
    fig = px.scatter(
        country_avg,
        x=x_col,
        y=y_col,
        text="Country Name",         # pass the column name rather than an index array
        trendline="ols",             # remove this argument if statsmodels is not installed
        title=f"{number}. Government Health Expenditure (% of GDP) VS {y_title} — r = {corr_val:.2f}",
        labels={
            x_col: "Government Health Expenditure",
            y_col: y_label,
        },
    )
    fig.update_traces(textposition="top center")
    return country_avg, corr_val, fig


# (y column, wording in the title, y-axis label), in chart order 1..6.
scatter_specs = [
    ("JOB_Unemp_Total",
     "Total Unemployment (% of total labor force)",
     "Total Unemployment"),
    # Youth unemployment is measured against the labor force ages 15-24,
    # as stated in the source indicator name.
    ("JOB_Unemp_Youth_Total",
     "Youth Unemployment (% of total labor force ages 15-24)",
     "Youth Unemployment (Ages 15-24)"),
    ("JOB_VulnerableEmployment_Total",
     "Vulnerable Employment (% of total employment)",
     "Vulnerable employees"),
    ("JOB_Part_Time_Total",
     "Part Time Employment (% of total employment)",
     "Part time employees"),
    ("JOB_Self_employed_Total",
     "Self_Employed (% of total employment)",
     "Self_employed"),
    ("JOB_Contracters_Total",
     "Wage and Salaried Workers (% of total employment)",
     "Wage and salaried employees"),
]

# country_avg / corr_val / fig_scatter are rebound on every pass, so after the
# loop they hold the values for the last chart.
for number, (y_col, y_title, y_label) in enumerate(scatter_specs, start=1):
    country_avg, corr_val, fig_scatter = _gov_health_scatter(
        df, number, y_col, y_title, y_label
    )
    fig_scatter.show()


## 4.3. Time Series Analysis

In [33]:
# 1) Period filter: keep only rows inside the analysis window (inclusive).
ts_start, ts_end = 2011, 2021
df_ts = df[df["Time"].between(ts_start, ts_end)].copy()

# 2) One line chart per indicator: (column, chart title, y-axis label).
plots = [
    ("JOB_Part_Time_Total",            "Part-time Employment (%)",    "Part-time Employment (%)"),
    ("JOB_VulnerableEmployment_Total", "Vulnerable Employment (%)",   "Vulnerable Employment (%)"),
    ("JOB_Unemp_Total",                "Unemployment Rate (%)",       "Unemployment (%)"),
    ("JOB_Unemp_Youth_Total",          "Youth Unemployment Rate (%)", "Youth Unemployment (%)"),
    ("JOB_Self_employed_Total",        "Self-employed Rate (%)",      "Self-employed (%)"),
]

# The row ordering does not depend on the plotted column, so sort once
# instead of once per chart. df_ts itself is left unsorted.
df_ts_sorted = df_ts.sort_values(["Country Name", "Time"])

for col, title, ylab in plots:
    # Skip indicators that are not present in the frame.
    if col in df_ts_sorted.columns:
        fig = px.line(
            df_ts_sorted,
            x="Time", y=col, color="Country Name", markers=True,
            title=f"{title}, {ts_start}–{ts_end}",
            labels={col: ylab}
        )
        fig.show()

In [34]:
# Short aliases for the employment indicator columns.
Unemp = "JOB_Unemp_Total"
Unemp_Youth = "JOB_Unemp_Youth_Total"
Vulnerable = "JOB_VulnerableEmployment_Total"
Part = "JOB_Part_Time_Total"
Self_employed = "JOB_Self_employed_Total"
Contracters = "JOB_Contracters_Total"

# Every alias already equals its target column name, so copying df[alias] back
# onto that same column would leave df unchanged. The only useful effect of
# such a copy is failing when a column is absent; do that check explicitly and
# report every missing column at once.
_missing_job_cols = [
    c for c in (Contracters, Self_employed, Part, Vulnerable, Unemp_Youth, Unemp)
    if c not in df.columns
]
if _missing_job_cols:
    raise KeyError(f"Missing employment indicator columns: {_missing_job_cols}")

In [35]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ---- Settings ----
YR0, YR1 = 2011, 2021                      # analysis window; YR0 is also the index base year
UNEMP = "JOB_Unemp_Total"                  # % of labor force
PART  = "JOB_Part_Time_Total"              # % of employment
VULN  = "JOB_VulnerableEmployment_Total"   # % of employment
SELF  = "JOB_Self_employed_Total"          # % of employment
WAGE  = "JOB_Contracters_Total"            # % of employment (wage employment)

need = [UNEMP, PART, VULN, SELF, WAGE]
present = [c for c in need if c in df.columns]
# The employment rate below is derived from unemployment, so that column is
# mandatory. Raise explicitly: a bare assert is skipped under `python -O`.
if UNEMP not in present:
    raise KeyError("실업률(JOB_Unemp_Total)이 필요합니다.")

# ---- Period filter & dtype clean-up ----
df_ts = df[df["Time"].between(YR0, YR1)].copy()
df_ts["Time"] = pd.to_numeric(df_ts["Time"], errors="coerce")
for c in present:
    # Percentages: coerce to numeric and clamp into [0, 100].
    df_ts[c] = pd.to_numeric(df_ts[c], errors="coerce").clip(0, 100)

# ---- Employment rate (= 100 - Unemployment) [of labor force] ----
df_ts["EMP_Rate_LF"] = (100 - df_ts[UNEMP]).clip(0, 100)

# ---- Yearly means: unweighted mean over all rows of each year ----
use_means = ["EMP_Rate_LF", UNEMP] + [c for c in [PART, VULN, SELF, WAGE] if c in df_ts.columns]
yearly = df_ts.groupby("Time")[use_means].mean().reset_index()

# =========================
# A) Index to base year YR0 = 100 (relative change)
# =========================
pretty = {
    "EMP_Rate_LF": "Employment rate (LF)",
    UNEMP:         "Unemployment (LF)",
    PART:          "Part-time (of employment)",
    VULN:          "Vulnerable (of employment)",
    SELF:          "Self-employed (of employment)",
    WAGE:          "Wage & salaried (of employment)",
}

# Wide -> long with display names; rows without a value are dropped.
tmp = yearly.rename(columns={k: v for k, v in pretty.items() if k in yearly.columns})
long = tmp.melt(id_vars="Time", var_name="Indicator", value_name="Value").dropna()

# Base-year value per indicator. An indicator with no value in YR0 gets a NaN
# base after the left merge, and therefore a NaN index.
base = (long[long["Time"] == YR0]
        .drop_duplicates("Indicator")[["Indicator", "Value"]]
        .rename(columns={"Value": "Base"}))
long = long.merge(base, on="Indicator", how="left")

# Column name, axis label and title all follow YR0 so the chart cannot end up
# mislabeled when the base year is changed above.
index_col = f"Index{YR0}"
long[index_col] = 100 * long["Value"] / long["Base"]

fig_idx = px.line(
    long.sort_values(["Indicator", "Time"]),
    x="Time", y=index_col, color="Indicator", markers=True,
    title=f"Global Means — Employment & Composition (Indexed to {YR0}=100)",
    labels={index_col: f"Index ({YR0}=100)", "Indicator": ""}
)
# Legend to the right of the plot area, with extra right margin to fit it.
fig_idx.update_layout(legend=dict(orientation="v", x=1.02, xanchor="left", y=1, yanchor="top"),
                      margin=dict(r=200))
fig_idx.show()