In [None]:
import requests
from bs4 import BeautifulSoup
import os
import getpass
from tqdm import tqdm

# Helper to clean folder/file names
def clean_name(name):
    """Return *name* without its URL fragment, query string, or outer slashes.

    Everything from the first ``#`` onward is dropped, then everything from
    the first ``?`` onward, and finally any leading/trailing ``/`` characters
    are stripped. Slashes in the middle of the name are kept.
    """
    without_fragment, _, _ = name.partition('#')
    without_query, _, _ = without_fragment.partition('?')
    return without_query.strip('/')

# Login credentials (the password is read without echoing it to the screen)
physio_username = input("PhysioNet username: ")
physio_password = getpass.getpass("PhysioNet password: ")

login_url = "https://physionet.org/login/"
base_url = "https://physionet.org/content/mimic-cxr/2.1.0/files/p10/"
local_base = "/content/mimic-cxr-p10"
os.makedirs(local_base, exist_ok=True)

# Seconds to wait for the server before giving up; without a timeout,
# requests can block indefinitely on a stalled connection.
login_timeout = 30

# Start a session so cookies set during login are reused by later requests
session = requests.Session()

# Step 1: Get CSRF token from the hidden form field on the login page
login_page = session.get(login_url, timeout=login_timeout)
login_page.raise_for_status()
soup = BeautifulSoup(login_page.text, 'html.parser')
csrf_input = soup.find('input', {'name': 'csrfmiddlewaretoken'})
csrf_token = csrf_input.get('value') if csrf_input is not None else None
if not csrf_token:
    # Fail with a clear message instead of an AttributeError on None.
    print("❌ Could not find the CSRF token on the login page.")
    raise SystemExit(1)

# Step 2: Post login data
login_data = {
    'username': physio_username,
    'password': physio_password,
    'csrfmiddlewaretoken': csrf_token,
    'next': '/'
}
headers = {'Referer': login_url}

# Perform login; success is detected by the final URL being the site root,
# which is where the 'next' field asks to be sent after logging in.
login_response = session.post(
    login_url, data=login_data, headers=headers, timeout=login_timeout
)
if login_response.url == "https://physionet.org/":
    print("✅ Login successful!")
else:
    print("❌ Login failed. Check your credentials or DUA acceptance.")
    # Non-zero status so callers/shells see the failure (SystemExit() exits 0).
    raise SystemExit(1)

# Step 3: Fetch patient folders in p10
# A timeout keeps a stalled connection from hanging the run, and
# raise_for_status() stops an HTTP error page from being parsed as an
# (empty) directory listing.
response = session.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Links whose href starts with 'p10' are taken to be patient folders.
# dict.fromkeys() drops repeated links while keeping first-seen order.
patient_folders = list(dict.fromkeys(
    clean_name(a['href']) for a in soup.select("a[href^='p10']")
))

print(f"Found {len(patient_folders)} patient folders.")

# Step 4: Download only .txt files directly under patient folders
# (connect, read) timeouts in seconds so a stalled connection cannot hang the run.
download_timeout = (10, 60)

for patient in tqdm(patient_folders, desc="Patients"):
    patient_folder_url = f"{base_url}{patient}/"
    local_patient_path = os.path.join(local_base, patient)
    os.makedirs(local_patient_path, exist_ok=True)

    # Fetch the folder listing; on failure report it and move on to the
    # next patient instead of parsing an error page.
    try:
        r = session.get(patient_folder_url, timeout=download_timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"❌ Could not list {patient_folder_url}: {exc}")
        continue
    s = BeautifulSoup(r.content, 'html.parser')

    # Collect links ending in .txt from the patient folder page
    txt_files = [clean_name(a['href']) for a in s.select("a[href$='.txt']")]
    for txt_file in txt_files:
        file_url = f"{patient_folder_url}{txt_file}"
        file_path = os.path.join(local_patient_path, txt_file)
        if os.path.exists(file_path):
            print(f"⚠️ {txt_file} already exists, skipping.")
            continue

        print(f"⬇️ Downloading {file_path}...")
        # Write to a temporary name and rename only after a complete
        # download, so an interrupted run never leaves a truncated file
        # that the exists-check above would skip on the next run.
        partial_path = file_path + ".part"
        try:
            with session.get(file_url, stream=True, timeout=download_timeout) as resp:
                # Do not save an HTTP error page as if it were a report.
                resp.raise_for_status()
                content_length = resp.headers.get('content-length')
                # None tells tqdm the total is unknown (instead of 0 bytes).
                total_size = int(content_length) if content_length else None
                with open(partial_path, 'wb') as f, tqdm(
                    desc=txt_file,
                    total=total_size,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024
                ) as bar:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)
                        bar.update(len(chunk))
            os.replace(partial_path, file_path)
        except (requests.RequestException, OSError) as exc:
            print(f"❌ Failed to download {file_url}: {exc}")
            # Remove the incomplete temporary file, if one was created.
            if os.path.exists(partial_path):
                os.remove(partial_path)


In [None]:
import os
import pandas as pd

# Base directory: one sub-directory per patient, each holding <study_id>.txt files
base_dir = '/content/mimic-cxr-p10'
report_suffix = '.txt'

# Prepare list to hold rows
data = []

# Walk through the directory. os.listdir() returns entries in arbitrary
# order, so sort at both levels to make the CSV row order reproducible.
for patient_id in sorted(os.listdir(base_dir)):
    patient_path = os.path.join(base_dir, patient_id)
    if not os.path.isdir(patient_path):
        continue
    for file in sorted(os.listdir(patient_path)):
        if not file.endswith(report_suffix):
            continue
        # Strip only the trailing extension; str.replace() would also
        # remove any '.txt' occurring earlier in the file name.
        study_id = file[:-len(report_suffix)]
        txt_path = os.path.join(patient_path, file)
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
        data.append({
            'patient_id': patient_id,
            'study_id': study_id,
            'text': text
        })

# Create DataFrame; explicit columns keep the header even when no report is found
df = pd.DataFrame(data, columns=['patient_id', 'study_id', 'text'])

# Export to CSV
output_csv = 'mimic_reports.csv'
df.to_csv(output_csv, index=False)

print(f"Exported {len(df)} rows to {output_csv}")
