In [None]:
import pydicom
import numpy as np

def extract_patient_details(dicom_path):
    """Extracts patient sex, age, size, and weight from a dataset with DICOM files."""

    image_to_detail_list = np[[file:{}]]
    for file in dicom_path:
        dicom_data = pydicom.dcmread(file)

        patient_details = {
            "Patient Sex": dicom_data.PatientSex if 'PatientSex' in dicom_data else "Unknown",
            "Patient Age": dicom_data.PatientAge if 'PatientAge' in dicom_data else "Unknown",
            "Patient Size (m)": dicom_data.PatientSize if 'PatientSize' in dicom_data else "Unknown",
            "Patient Weight (kg)": dicom_data.PatientWeight if 'PatientWeight' in dicom_data else "Unknown",
        }

        np[[file:patient_details]]

    return patient_details

# Example usage
dicom_file = "../dataset" 
patient_info = extract_patient_details(dicom_file)


Patient Sex: 
Patient Age: Unknown
Patient Size (m): Unknown
Patient Weight (kg): Unknown


In [13]:
import pandas as pd
import numpy as np

max_files = 100
numpy_filename = "patient_details.npy"
columns = ["Filename", "Sex", "Age", "Size", "Weight"]

df_patient_info = pd.DataFrame(np.load("../patient_details.npy", allow_pickle=True) , columns= columns)
print(df_patient_info.head())

# ANSI colors per column
colors = [
    "\033[0m",     # Filename – default
    "\033[96m",    # Sex – light cyan
    "\033[95m",    # Age – light magenta
    "\033[92m",    # Size – light green
    "\033[93m",    # Weight – light yellow
]
reset = "\033[0m"

# Create a copy for display with colored values
df_colored = df_patient_info.copy()

# Apply color to each column
for i, col in enumerate(df_patient_info.columns):
    df_colored[col] = df_patient_info[col].apply(lambda x: f"{colors[i]}{x}{reset}")

# Print as a string (tabular format)
print(df_colored.head(10).to_string(index=False))
for i, (_, row) in enumerate(df_patient_info.iterrows()):
    if i >= 10:
        break
    row = list(row)
    colored_row = [f"{colors[i]}{str(item)}{reset}" for i, item in enumerate(row)]
    print("\t".join(colored_row))


                                 Filename Sex   Age Size Weight
0  07c12d0f562f17579aabc18c11e2ad54.dicom   M  051Y  NaN    NaN
1  d1605d4007fbbdbec96acce4a834d10b.dicom   F  049Y  NaN    NaN
2  0129a93b23aa71e7ea2b8b988d1c5287.dicom   O   NaN  NaN    NaN
3  da922b5ee573e770260d4f6c849a17a5.dicom   F  049Y  NaN    NaN
4  54d7d530808ec64ea19e9d4ab37cf579.dicom   M   NaN  0.0    0.0
                                      Filename        Sex           Age         Size       Weight
[0m07c12d0f562f17579aabc18c11e2ad54.dicom[0m [96mM[0m [95m051Y[0m [92mnan[0m [93mnan[0m
[0md1605d4007fbbdbec96acce4a834d10b.dicom[0m [96mF[0m [95m049Y[0m [92mnan[0m [93mnan[0m
[0m0129a93b23aa71e7ea2b8b988d1c5287.dicom[0m [96mO[0m  [95mnan[0m [92mnan[0m [93mnan[0m
[0mda922b5ee573e770260d4f6c849a17a5.dicom[0m [96mF[0m [95m049Y[0m [92mnan[0m [93mnan[0m
[0m54d7d530808ec64ea19e9d4ab37cf579.dicom[0m [96mM[0m  [95mnan[0m [92m0.0[0m [93m0.0[0m
[0m8843bf18d7fc51c9389e414c

 #### From Remote to Local (windows)
 scp root@194.164.196.246:/mnt/shared_dataset/  C:\Users\jsayed\Downloads\DHBW\lung-disease-detection
 #### From Local to Remote (linux)
 scp ..\..\full_urls_max_rev.txt root@194.164.196.246:/mnt/shared_dataset/

In [25]:
import os
import random
import shutil

# === CONFIG ===
num_samples = 500
source_folder = "../dataset/train"         # Replace with your source folder
destination_folder = f"../dataset/train-{num_samples}"   # Destination subfolder

# Create destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Get list of image files (you can filter by extension if needed)
all_files = [f for f in os.listdir(source_folder) if os.path.isfile(os.path.join(source_folder, f))]
print(f"Found {len(all_files)} files in source folder.")

# Randomly sample files
sampled_files = random.sample(all_files, min(num_samples, len(all_files)))

# Copy files
for filename in sampled_files:
    src = os.path.join(source_folder, filename)
    dst = os.path.join(destination_folder, filename)
    shutil.copy2(src, dst)

print(f"Copied {len(sampled_files)} files to '{destination_folder}'")


Found 15000 files in source folder.
Copied 500 files to '../dataset/train-500'


In [26]:
import pandas as pd
import os
import json


# Paths
images_dir = f"../dataset/train-{num_samples}"
# === Step 1: Load CSVs ===
annotations = pd.read_csv("../dataset/annotations_train.csv")
image_labels = pd.read_csv("../dataset/image_labels_train.csv")
subset_filenames = {os.path.splitext(f)[0] for f in os.listdir(images_dir)}  # Folder with your images

# === Build class mapping (28 classes: 0–27) ===
disease_cols = image_labels.columns[2:]  # Skip image_id and rad_id
class2idx = {name: i for i, name in enumerate(disease_cols)}
idx2class = {i: name for name, i in class2idx.items()}

# === Filter relevant annotations and assign class index ===
annotations = annotations[annotations['image_id'].isin(subset_filenames)]
annotations['category_id'] = annotations['class_name'].map(class2idx)

# === Remove rows with NaN bbox ===
annotations = annotations.dropna(subset=['x_min', 'y_min', 'x_max', 'y_max'])

# === Compute vote weights ===
vote_count = annotations.groupby(['image_id', 'class_name']).size().reset_index(name='votes')
vote_count['weight'] = vote_count['votes'].apply(lambda v: 1 if v == 1 else 2 if v == 2 else 4)
annotations = annotations.merge(vote_count[['image_id', 'class_name', 'weight']], on=['image_id', 'class_name'])

# === Find true "No finding" images (2+ votes and no annotations) ===
no_finding_votes = image_labels[image_labels["No finding"] == 1]
nf_votes = no_finding_votes.groupby("image_id").size().reset_index(name="votes")
nf_ids = set(nf_votes[nf_votes["votes"] >= 2]["image_id"])
annotated_ids = set(annotations['image_id'])
pure_no_finding = nf_ids - annotated_ids

# === Build COCO JSON ===
images = []
annotations_out = []
categories = []
image_id_map = {}
image_id_counter = 0
annotation_id = 0

# Categories
for cid, cname in idx2class.items():
    categories.append({
        "id": cid,
        "name": cname,
        "supercategory": "disease"
    })

# Images with disease annotations
for image_id in sorted(subset_filenames):
    filename = image_id + ".png"  # adjust if needed
    image_id_map[image_id] = image_id_counter
    images.append({
        "id": image_id_counter,
        "file_name": filename,
        "width": 1024,
        "height": 1024
    })
    image_id_counter += 1

# Add annotations
for _, row in annotations.iterrows():
    image_id_num = image_id_map[row['image_id']]
    x, y = float(row['x_min']), float(row['y_min'])
    w = float(row['x_max'] - row['x_min'])
    h = float(row['y_max'] - row['y_min'])

    annotations_out.append({
        "id": annotation_id,
        "image_id": image_id_num,
        "category_id": int(row['category_id']),
        "bbox": [x, y, w, h],
        "area": w * h,
        "iscrowd": 0,
        "confidence": int(row['weight'])
    })
    annotation_id += 1

# No-finding images (add them without boxes)
for nf_img in pure_no_finding:
    if nf_img not in image_id_map:  # ensure it's not already added
        filename = nf_img + ".png"
        images.append({
            "id": image_id_counter,
            "file_name": filename,
            "width": 1024,
            "height": 1024
        })
        image_id_counter += 1

# Save JSON
coco = {
    "images": images,
    "annotations": annotations_out,
    "categories": categories
}

with open("annotations.json", "w") as f:
    json.dump(coco, f)

print("✅ Saved final COCO JSON with:")
print(" -", len(images), "images")
print(" -", len(annotations_out), "annotations")

✅ Saved final COCO JSON with:
 - 10755 images
 - 1398 annotations
