# 00_extract_from_pdf.ipynb

## 目的
日本看護協会のPDF（病院看護実態調査）から、分析に使う **都道府県別の一次データ（raw CSV）** を抽出して保存する。

## 出力（data/raw）
- 日本看護協会_離職率_都道府県別_2023.csv
- 日本看護協会_夜勤72h超過率_都道府県別_2024.csv  
  - ※72h超過率（表50）＋ 三交代/二交代の月平均夜勤回数（表44/47）を **同一ファイルに統合**
- 日本看護協会_給与_都道府県別_2024.csv  
  - ※表34/37/40（平均税込給与総額）を統合


In [1]:
# ===============================
# Imports
# ===============================
from pathlib import Path

import pandas as pd
import pdfplumber


In [2]:
# ===============================
# Path Settings
# ===============================

# Infer the project root so the notebook works regardless of the directory
# it is launched from.
cwd = Path.cwd().resolve()

if cwd.name == "notebooks":
    # Launched from inside notebooks/: the project root is its parent.
    BASE_DIR = cwd.parent
else:
    # Walk from cwd up through its ancestors and take the first directory that
    # contains both notebooks/ and data/. If none is found, fall back to cwd
    # (the resolved paths printed below make a wrong guess visible).
    BASE_DIR = next(
        (
            candidate
            for candidate in (cwd, *cwd.parents)
            if (candidate / "notebooks").exists() and (candidate / "data").exists()
        ),
        cwd,
    )

RAW_DIR = BASE_DIR / "data" / "raw"
PROCESSED_DIR = BASE_DIR / "data" / "processed"

for _data_dir in (RAW_DIR, PROCESSED_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

# Source PDF (expected to be placed in the raw layer).
pdf_path = RAW_DIR / "日本看護協会_病院看護実態調査_離職率等_2025.pdf"
if not pdf_path.exists():
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

print("✅ Path resolved")
for _label, _resolved in (
    ("BASE_DIR:", BASE_DIR),
    ("RAW_DIR :", RAW_DIR),
    ("PDF     :", pdf_path),
):
    print(_label, _resolved)


✅ Path resolved
BASE_DIR: /Users/hideomi.h/nurse-turnover-analysis
RAW_DIR : /Users/hideomi.h/nurse-turnover-analysis/data/raw
PDF     : /Users/hideomi.h/nurse-turnover-analysis/data/raw/日本看護協会_病院看護実態調査_離職率等_2025.pdf


In [3]:
# ===============================
# Page Settings (0-based index = page number shown in the PDF - 1)
# ===============================
P_TURNOVER = 74 - 1    # PDF page 74 (Table 8): turnover rate
P_NIGHT72P = 109 - 1   # PDF page 109 (Table 50): rate of night shifts exceeding 72h

P_SAL_T34  = 97 - 1    # PDF page 97 (Table 34): new-graduate starting salary (high school + 3-year program)
P_SAL_T37  = 99 - 1    # PDF page 99 (Table 37): new-graduate starting salary (university graduate)
P_SAL_T40  = 101 - 1   # PDF page 101 (Table 40): monthly salary, 10 years of service, non-managerial

P_AVG_T44  = 104 - 1   # PDF page 104 (Table 44): three-shift system, average night shifts per month
P_AVG_T47  = 106 - 1   # PDF page 106 (Table 47): two-shift system, average night shifts per month


In [4]:
# ===============================
# Helper Functions
# ===============================

# Values of the prefecture column that are dropped by filter_pref
# (non-prefecture rows such as totals / no-answer rows, and blank cells).
EXCLUDE_PREF = {
    "",              # blank cell
    "計",            # total row
    "未回答",        # not answered
    "無回答・不明",  # no answer / unknown
}

def extract_table(page_idx: int, table_idx: int = 0) -> pd.DataFrame:
    """Extract one table from a page of the source PDF as a DataFrame of strings.

    Args:
        page_idx: 0-based index into ``pdf.pages`` of the PDF at ``pdf_path``.
        table_idx: Position of the wanted table in the list returned by
            ``extract_tables()`` for that page.

    Returns:
        DataFrame built from the extracted table, with ``None`` cells replaced
        by ``""`` and every cell converted to ``str`` and stripped.

    Raises:
        ValueError: If no table is found on the page, or ``table_idx`` is not
            smaller than the number of tables found.
    """
    def _as_clean_text(cell):
        return str(cell).strip()

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_idx]
        tables = page.extract_tables()

    if not tables:
        raise ValueError(f"No tables found on page_idx={page_idx}")
    if table_idx >= len(tables):
        raise ValueError(f"table_idx={table_idx} not found on page_idx={page_idx}. tables_found={len(tables)}")

    raw = pd.DataFrame(tables[table_idx])
    return raw.replace({None: ""}).map(_as_clean_text)


def filter_pref(df: pd.DataFrame, col: str = "prefecture") -> pd.DataFrame:
    """Return a copy of ``df`` without the rows whose ``col`` value is in EXCLUDE_PREF.

    The ``col`` values are converted to ``str`` and stripped before the
    comparison, and the returned frame carries the stripped values.
    The input frame is left untouched (the normalization is applied to a copy),
    so calling this function has no side effect on the caller's data.

    Args:
        df: Frame containing the column ``col``.
        col: Name of the column holding the prefecture labels.

    Returns:
        A new DataFrame with the excluded rows removed; the original index
        labels of the kept rows are preserved.
    """
    out = df.copy()
    out[col] = out[col].astype(str).str.strip()
    # .copy() keeps the result independent, so later column assignments by the
    # caller do not trigger chained-assignment warnings.
    return out[~out[col].isin(EXCLUDE_PREF)].copy()


def pct_to_float(x: str):
    """Convert a percentage string to a float, e.g. '11.3%' -> 11.3.

    Both the ASCII '%' and the full-width '％' sign are accepted.

    Args:
        x: Cell text (any object is accepted; it is passed through ``str``).

    Returns:
        The numeric value as ``float``, or ``pd.NA`` when ``x`` is ``None``,
        blank, or not parseable as a number.
    """
    if x is None:
        return pd.NA
    s = str(x).strip().replace("％", "%")
    if s == "":
        return pd.NA
    s = s.replace("%", "")
    try:
        return float(s)
    except ValueError:
        # Only a failed parse maps to NA; anything else (e.g. KeyboardInterrupt)
        # must propagate instead of being swallowed.
        return pd.NA


def yen_to_float(x: str):
    """Convert a yen amount with thousands separators to a float, e.g. '276,127' -> 276127.0.

    Args:
        x: Cell text (any object is accepted; it is passed through ``str``).

    Returns:
        The numeric value as ``float``, or ``pd.NA`` when ``x`` is ``None``,
        blank, or not parseable as a number.
    """
    if x is None:
        return pd.NA
    s = str(x).replace(",", "").strip()
    if s == "":
        return pd.NA
    try:
        return float(s)
    except ValueError:
        # Only a failed parse maps to NA; anything else (e.g. KeyboardInterrupt)
        # must propagate instead of being swallowed.
        return pd.NA


def num_to_float(x: str):
    """Convert a plain numeric string to a float, e.g. '7.4' -> 7.4.

    Commas are removed before parsing, so '1,234.5' is also accepted.

    Args:
        x: Cell text (any object is accepted; it is passed through ``str``).

    Returns:
        The numeric value as ``float``, or ``pd.NA`` when ``x`` is ``None``,
        blank, or not parseable as a number.
    """
    if x is None:
        return pd.NA
    s = str(x).replace(",", "").strip()
    if s == "":
        return pd.NA
    try:
        return float(s)
    except ValueError:
        # Only a failed parse maps to NA; anything else (e.g. KeyboardInterrupt)
        # must propagate instead of being swallowed.
        return pd.NA


def extract_pref_value_by_col(
    page_idx: int,
    table_idx: int,
    row_start: int,
    pref_col: int,
    value_col: int,
    value_name: str,
    converter=None,
) -> pd.DataFrame:
    """Extract a (prefecture, value) pair of columns from a PDF table by position.

    Args:
        page_idx: 0-based page index passed to ``extract_table``.
        table_idx: Index of the table on that page.
        row_start: First row (0-based) of the data area; rows above are skipped.
        pref_col: Column position holding the prefecture labels.
        value_col: Column position holding the value to extract.
        value_name: Name given to the value column in the result.
        converter: Optional callable applied to every value via ``Series.map``.

    Returns:
        DataFrame with columns ``prefecture`` and ``value_name``, after
        ``filter_pref`` has removed the excluded rows.

    Raises:
        ValueError: If the table has no row at ``row_start`` or lacks one of
            the requested columns.
    """
    df = extract_table(page_idx, table_idx=table_idx)

    # Minimal shape check: stop right away if the table is smaller than expected.
    n_rows, n_cols = df.shape
    too_short = n_rows <= row_start
    too_narrow = n_cols <= max(pref_col, value_col)
    if too_short or too_narrow:
        raise ValueError(f"Table shape looks wrong on page_idx={page_idx}: shape={df.shape}")

    body = df.iloc[row_start:].copy()
    selected = pd.DataFrame({"prefecture": body[pref_col], value_name: body[value_col]})
    out = filter_pref(selected, "prefecture")

    if converter is not None:
        out[value_name] = out[value_name].map(converter)

    # Print the share of missing values so failed conversions do not go unnoticed.
    na_rate = float(out[value_name].isna().mean())
    print(f"{value_name}: rows={len(out)}  NA_rate={na_rate:.3f}")
    return out


In [5]:
# ===============================
# Extract: Turnover (Table 8)
# ===============================
print("Extracting: turnover (table 8) ...")

df_t = extract_table(P_TURNOVER, table_idx=0)

# Established stable pattern: drop the two header rows and pick columns by position.
data_t = df_t.iloc[2:].copy()

# Output column name -> column position in the extracted table.
turnover_rate_cols = {
    "turnover_total": 2,
    "turnover_new_grad": 4,
    "turnover_experienced": 6,
}

turnover = filter_pref(
    pd.DataFrame({
        "prefecture": data_t[0],
        **{name: data_t[pos] for name, pos in turnover_rate_cols.items()},
    }),
    "prefecture",
)
# Convert the percentage strings of every rate column to floats.
turnover = turnover.assign(
    **{name: turnover[name].map(pct_to_float) for name in turnover_rate_cols}
)

print("turnover shape:", turnover.shape)
display(turnover.head(3))


Extracting: turnover (table 8) ...
turnover shape: (47, 4)


Unnamed: 0,prefecture,turnover_total,turnover_new_grad,turnover_experienced
3,北海道,11.5,5.9,16.6
4,青森県,8.6,10.7,16.7
5,岩手県,6.8,7.8,19.1


In [6]:
# ===============================
# Extract: Night (Table 50 / Table 44 / Table 47)
#  - Table 50: rate of night shifts exceeding 72h (%)
#  - Table 44: three-shift system, average night shifts per month
#  - Table 47: two-shift system, average night shifts per month
# ===============================

print("Extracting: night_72h_plus (table 50) ...")
# Table 50 goes through the same helper as tables 44/47 so it gets the same
# shape check and NA-rate log (failed conversions are no longer silent).
# Its data area starts at row 1 and the rate is in column 5.
night_72h = extract_pref_value_by_col(
    page_idx=P_NIGHT72P, table_idx=0, row_start=1, pref_col=0, value_col=5,
    value_name="night_shift_72h_plus",
    converter=pct_to_float,
)
print("night_72h shape:", night_72h.shape)


print("Extracting: night shift counts (table 44/47) ...")
# Row/column positions were confirmed from the displayed tables.
night_3shift = extract_pref_value_by_col(
    page_idx=P_AVG_T44, table_idx=0, row_start=2, pref_col=0, value_col=2,
    value_name="night_shifts_per_month_three_shift",
    converter=num_to_float,
)

night_2shift = extract_pref_value_by_col(
    page_idx=P_AVG_T47, table_idx=0, row_start=2, pref_col=0, value_col=2,
    value_name="night_shifts_per_month_two_shift",
    converter=num_to_float,
)

# Combine into a single file (requirement).
# validate="one_to_one" makes the merge fail loudly if a prefecture appears
# more than once on either side, instead of silently multiplying rows.
night_master = (
    night_72h
    .merge(night_3shift, on="prefecture", how="left", validate="one_to_one")
    .merge(night_2shift, on="prefecture", how="left", validate="one_to_one")
)

print("night_master shape:", night_master.shape)
display(night_master.head(3))


Extracting: night_72h_plus (table 50) ...
night_72h shape: (47, 2)
Extracting: night shift counts (table 44/47) ...
night_shifts_per_month_three_shift: rows=47  NA_rate=0.000
night_shifts_per_month_two_shift: rows=47  NA_rate=0.000
night_master shape: (47, 4)


Unnamed: 0,prefecture,night_shift_72h_plus,night_shifts_per_month_three_shift,night_shifts_per_month_two_shift
0,北海道,36.7,7.8,4.6
1,青森県,36.5,7.7,4.8
2,岩手県,11.8,7.5,4.1


In [7]:
# ===============================
# Extract: Salary (Table 34 / Table 37 / Table 40)
#  - every table uses the "average total gross pay (yen)" column = column 3
# ===============================

print("Extracting: salary tables (t34/t37/t40) ...")

salary_t34 = extract_pref_value_by_col(
    page_idx=P_SAL_T34, table_idx=0, row_start=2, pref_col=0, value_col=3,
    value_name="starting_salary_nurse_diploma_monthly_yen",
    converter=yen_to_float,
)

salary_t37 = extract_pref_value_by_col(
    page_idx=P_SAL_T37, table_idx=0, row_start=2, pref_col=0, value_col=3,
    value_name="starting_salary_nurse_bachelor_monthly_yen",
    converter=yen_to_float,
)

salary_t40 = extract_pref_value_by_col(
    page_idx=P_SAL_T40, table_idx=0, row_start=2, pref_col=0, value_col=3,
    value_name="salary_nurse_10yr_non_manager_monthly_yen",
    converter=yen_to_float,
)

# validate="one_to_one" makes the merge fail loudly if a prefecture appears
# more than once on either side, instead of silently multiplying rows.
salary_master = (
    salary_t34
    .merge(salary_t37, on="prefecture", how="left", validate="one_to_one")
    .merge(salary_t40, on="prefecture", how="left", validate="one_to_one")
)

print("salary_master shape:", salary_master.shape)
display(salary_master.head(3))


Extracting: salary tables (t34/t37/t40) ...
starting_salary_nurse_diploma_monthly_yen: rows=47  NA_rate=0.000
starting_salary_nurse_bachelor_monthly_yen: rows=47  NA_rate=0.000
salary_nurse_10yr_non_manager_monthly_yen: rows=47  NA_rate=0.000
salary_master shape: (47, 4)


Unnamed: 0,prefecture,starting_salary_nurse_diploma_monthly_yen,starting_salary_nurse_bachelor_monthly_yen,salary_nurse_10yr_non_manager_monthly_yen
0,北海道,270231.0,276652.0,326530.0
1,青森県,270494.0,275476.0,314135.0
2,岩手県,268264.0,276926.0,322466.0


In [8]:
# ===============================
# Save to CSV (data/raw)
# ===============================

# Requirement check: the night file must contain all three night-shift columns.
required_cols = {
    "night_shift_72h_plus",
    "night_shifts_per_month_three_shift",
    "night_shifts_per_month_two_shift",
}
# sorted() keeps the error message deterministic (set iteration order is not).
missing_cols = sorted(required_cols - set(night_master.columns))
if missing_cols:
    raise RuntimeError(f"night_master is missing columns: {missing_cols}")

out_turnover = RAW_DIR / "日本看護協会_離職率_都道府県別_2023.csv"
out_night    = RAW_DIR / "日本看護協会_夜勤72h超過率_都道府県別_2024.csv"
out_salary   = RAW_DIR / "日本看護協会_給与_都道府県別_2024.csv"

# utf-8-sig writes a BOM at the start of each file.
turnover.to_csv(out_turnover, index=False, encoding="utf-8-sig")
night_master.to_csv(out_night, index=False, encoding="utf-8-sig")
salary_master.to_csv(out_salary, index=False, encoding="utf-8-sig")

print("✅ Saved (raw)")
print(f"- {out_turnover}  shape={turnover.shape}")
print(f"- {out_night}     shape={night_master.shape}")
print(f"- {out_salary}    shape={salary_master.shape}")

# Final check that the right content went to the right file: read it back,
# using the same encoding it was written with so the BOM is handled explicitly.
night_check = pd.read_csv(out_night, encoding="utf-8-sig")
print("✅ Reload check (night file) columns:", list(night_check.columns))
display(night_check.head(3))


✅ Saved (raw)
- /Users/hideomi.h/nurse-turnover-analysis/data/raw/日本看護協会_離職率_都道府県別_2023.csv  shape=(47, 4)
- /Users/hideomi.h/nurse-turnover-analysis/data/raw/日本看護協会_夜勤72h超過率_都道府県別_2024.csv     shape=(47, 4)
- /Users/hideomi.h/nurse-turnover-analysis/data/raw/日本看護協会_給与_都道府県別_2024.csv    shape=(47, 4)
✅ Reload check (night file) columns: ['prefecture', 'night_shift_72h_plus', 'night_shifts_per_month_three_shift', 'night_shifts_per_month_two_shift']


Unnamed: 0,prefecture,night_shift_72h_plus,night_shifts_per_month_three_shift,night_shifts_per_month_two_shift
0,北海道,36.7,7.8,4.6
1,青森県,36.5,7.7,4.8
2,岩手県,11.8,7.5,4.1


In [9]:
# ===============================
# (Optional) Debug Helper
# ===============================
def show_tables_on_page(page_idx: int):
    """Debug helper: display the first rows of every table found on one PDF page.

    Prints how many tables ``extract_tables()`` returned for the page, then for
    each table its index and shape followed by its first 8 rows. Cells are
    shown as stripped strings, with ``None`` replaced by ``""``.

    Args:
        page_idx: 0-based index into ``pdf.pages`` of the PDF at ``pdf_path``.
    """
    def _as_clean_text(cell):
        return str(cell).strip()

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_idx]
        tables = page.extract_tables()

    print(f"page_idx={page_idx}  tables_found={len(tables)}")
    for i, t in enumerate(tables):
        df = pd.DataFrame(t)
        df = df.replace({None: ""})
        df = df.map(_as_clean_text)
        print(f"\n--- table_idx={i}  shape={df.shape} ---")
        display(df.head(8))
