## Check total frames in each patient

In [None]:
import os
import nibabel as nib

# Root folder that holds the NIfTI volumes (files named ct_<id>_image.nii.gz)
folder_path = r"D:\coding\SKRIPSI\ImageCHD_dataset"

# Patient IDs, one list per category label
sianotik_ids = [
    1008, 1010, 1012, 1015, 1028, 1037, 1046, 1050, 1064, 1074,
    1085, 1092, 1099, 1105, 1111, 1113, 1120, 1125, 1129, 1141,
    1145, 1146, 1147, 1150, 1158, 1178,
]
non_sianotik_ids = [
    1001, 1002, 1007, 1011, 1014, 1018, 1019, 1020, 1025, 1029,
    1033, 1035, 1036, 1041, 1047, 1061, 1070, 1079, 1103, 1109,
    1132, 1133, 1135, 1139, 1140, 1148,
]
normal_ids = [
    1003, 1005, 1032, 1051, 1062, 1063, 1066, 1067, 1072, 1078,
    1080, 1083, 1101, 1116, 1117, 1119, 1127, 1128, 1143, 1144,
]

def count_frames_per_category(category_ids, label):
    """Print the frame count of each patient volume in a category and the category total.

    Args:
        category_ids: Iterable of patient IDs. Each ID maps to the file
            ``ct_<id>_image.nii.gz`` inside the module-level ``folder_path``.
        label: Category name shown in the printed header and total line.

    Files that do not exist are reported and add nothing to the total.
    """
    print(f"\n=== {label} ===")
    running_total = 0
    for patient_id in category_ids:
        volume_path = os.path.join(folder_path, f"ct_{patient_id}_image.nii.gz")

        if not os.path.exists(volume_path):
            print(f"{patient_id}: File not found!")
            continue

        dims = nib.load(volume_path).shape
        # 4-D volumes are counted along their last axis, anything else along the third axis
        if len(dims) == 4:
            frame_count = dims[-1]
        else:
            frame_count = dims[2]

        running_total += frame_count
        print(f"{patient_id}: {frame_count} frames")
    print(f"Total frames {label}: {running_total}")

# Report frame counts for each category in turn
for category_ids, category_label in (
    (sianotik_ids, "Sianotik"),
    (non_sianotik_ids, "Non-Sianotik"),
    (normal_ids, "Normal"),
):
    count_frames_per_category(category_ids, category_label)


=== Sianotik ===
1008: 206 frames
1010: 275 frames
1012: 206 frames
1015: 400 frames
1028: 199 frames
1037: 383 frames
1046: 206 frames
1050: 275 frames
1064: 206 frames
1074: 206 frames
1085: 206 frames
1092: 275 frames
1099: 275 frames
1105: 334 frames
1111: 344 frames
1113: 275 frames
1120: 275 frames
1125: 137 frames
1129: 400 frames
1141: 369 frames
1145: 206 frames
1146: 275 frames
1147: 206 frames
1150: 324 frames
1158: 340 frames
1178: 315 frames
Total frames Sianotik: 7118

=== Non-Sianotik ===
1001: 221 frames
1002: 137 frames
1007: 275 frames
1011: 206 frames
1014: 206 frames
1018: 275 frames
1019: 275 frames
1020: 206 frames
1025: 275 frames
1029: 206 frames
1033: 275 frames
1035: 275 frames
1036: 175 frames
1041: 275 frames
1047: 206 frames
1061: 206 frames
1070: 275 frames
1079: 275 frames
1103: 206 frames
1109: 275 frames
1132: 344 frames
1133: 275 frames
1135: 306 frames
1139: 275 frames
1140: 206 frames
1148: 206 frames
Total frames Non-Sianotik: 6337

=== Normal ===


## Convert NIfTI format to PNG

In [None]:
import numpy as np
from PIL import Image

# Source folder of the NIfTI volumes and destination root for the exported PNG frames
nifti_dir = r'D:\coding\SKRIPSI\ImageCHD_dataset'
output_dir = r'D:\coding\SKRIPSI\CNN+LSTM\dataset'

# Patient IDs, one list per category label
sianotik_ids = [
    1008, 1010, 1012, 1015, 1028, 1037, 1046, 1050, 1064, 1074,
    1085, 1092, 1099, 1105, 1111, 1113, 1120, 1125, 1129, 1141,
    1145, 1146, 1147, 1150, 1158, 1178,
]
non_sianotik_ids = [
    1001, 1002, 1007, 1011, 1014, 1018, 1019, 1020, 1025, 1029,
    1033, 1035, 1036, 1041, 1047, 1061, 1070, 1079, 1103, 1109,
    1132, 1133, 1135, 1139, 1140, 1148,
]
normal_ids = [
    1003, 1005, 1032, 1051, 1062, 1063, 1066, 1067, 1072, 1078,
    1080, 1083, 1101, 1116, 1117, 1119, 1127, 1128, 1143, 1144,
]

# Every patient ID, category after category
all_ids = [*sianotik_ids, *non_sianotik_ids, *normal_ids]

# Create the destination root up front; no error if it is already there
os.makedirs(output_dir, exist_ok=True)

def nifti_to_png(nifti_path, output_path, patient_id):
    """Export every slice along the third axis of a NIfTI volume as an 8-bit PNG.

    Args:
        nifti_path: Path of the NIfTI file to read.
        output_path: Directory that receives the PNG files. This function
            does not create it.
        patient_id: Identifier used as the file-name prefix.

    Files are named ``<patient_id>_<NNN>.png`` where ``NNN`` is the 1-based
    slice number, zero-padded to three digits. Each slice is rescaled on its
    own, so its minimum maps to 0 and its maximum to 255.
    """
    # Load the whole volume as a floating-point array
    volume = nib.load(nifti_path).get_fdata()

    for slice_idx in range(volume.shape[2]):
        slice_data = volume[:, :, slice_idx]
        lo, hi = slice_data.min(), slice_data.max()

        if hi > lo:
            # Linear rescale of [lo, hi] onto [0, 255]
            scaled = np.interp(slice_data, (lo, hi), (0, 255)).astype(np.uint8)
        else:
            # Constant slice (or min/max not comparable, e.g. NaN): np.interp
            # needs increasing x-coordinates, so (lo, lo) is outside its
            # contract. A slice with no contrast is written as black.
            scaled = np.zeros(slice_data.shape, dtype=np.uint8)

        # Save as PNG named <patient_id>_<slice number>
        frame_filename = f"{patient_id}_{str(slice_idx + 1).zfill(3)}.png"
        Image.fromarray(scaled).save(os.path.join(output_path, frame_filename))

# Convert each listed patient's volume into its own ct_<id> folder of PNG slices
for patient_id in all_ids:
    nifti_file = os.path.join(nifti_dir, f"ct_{patient_id}_image.nii.gz")

    # Nothing to convert when the source volume is missing
    if not os.path.exists(nifti_file):
        print(f"File {nifti_file} not found.")
        continue

    # One output folder per patient
    patient_output_dir = os.path.join(output_dir, f"ct_{patient_id}")
    os.makedirs(patient_output_dir, exist_ok=True)

    nifti_to_png(nifti_file, patient_output_dir, patient_id)

print("Done")

Done


## Generalize the frame count to 275 per patient

- Datasets with more than 275 frames are trimmed. For example, patient 1158 has 340 frames, an excess of 65 frames (340 - 275 = 65). To reach 275 frames, 32 frames are removed from the beginning and 33 frames from the end (half of the excess, rounded down, comes off the start and the remainder off the end), so the removal is split as evenly as possible.

- Datasets with fewer than 275 frames are padded by duplication. For example, patient 1008 has 206 frames and is short by 69 frames (275 - 206 = 69). To reach 275 frames, 69 frames are duplicated from the middle section of the sequence, specifically 0-based indices 69 to 137 (frames 70 to 138), and the copies are appended after the last original frame.

### Duplicating (below 275)

In [None]:
import shutil

def pad_number(n, length=3):
    """Return ``n`` as text, zero-filled on the left to at least ``length`` characters."""
    text = str(n)
    return text.zfill(length)

def insert_duplicates_midway(input_folder, output_folder, target_frames=275):
    """Copy a patient's PNG frames and pad the sequence up to ``target_frames``.

    Every ``.png`` file in ``input_folder`` is copied to ``output_folder`` and
    renumbered ``<prefix>_001.png``, ``<prefix>_002.png``, ... in sorted
    file-name order. ``<prefix>`` is the part of the first file name before
    its first underscore. When there are fewer than ``target_frames`` files,
    a window of frames centred on the middle of the sequence is copied again
    and appended after the last original frame.

    Args:
        input_folder: Folder containing the source PNG frames.
        output_folder: Folder that receives the copies; created if needed.
        target_frames: Number of frames the output should contain.

    The duplication window wraps around the sequence when the shortfall is
    larger than the number of available frames, so short sequences no longer
    fail with an out-of-range index. A folder without PNG files is reported
    and skipped without creating ``output_folder``.
    """
    # Get all sorted PNG filenames
    frames = sorted(f for f in os.listdir(input_folder) if f.endswith('.png'))
    original_count = len(frames)

    # Without frames there is no prefix to derive and nothing to duplicate
    if original_count == 0:
        print(f"{input_folder}: no PNG frames found, nothing to copy.")
        return

    os.makedirs(output_folder, exist_ok=True)

    # Parse prefix (e.g., 1001) and start copying originals
    prefix = frames[0].split('_')[0]

    for position, fname in enumerate(frames, 1):
        new_name = f"{prefix}_{pad_number(position)}.png"
        shutil.copy(os.path.join(input_folder, fname), os.path.join(output_folder, new_name))

    if original_count >= target_frames:
        print(f"{prefix}: already has {original_count} frames.")
        return

    # How many to duplicate, and which window around the middle to take them from
    to_duplicate = target_frames - original_count
    start_dup = original_count // 2 - to_duplicate // 2
    end_dup = start_dup + to_duplicate
    print(f"{prefix}: duplicating {to_duplicate} frames from index {start_dup} to {end_dup - 1}")

    # Continue naming from the last index
    current_index = original_count + 1
    for i in range(start_dup, end_dup):
        # The window may extend before index 0 or past the last frame when the
        # shortfall exceeds the frame count; wrap instead of failing.
        src_frame = frames[i % original_count]
        new_name = f"{prefix}_{pad_number(current_index)}.png"
        shutil.copy(os.path.join(input_folder, src_frame), os.path.join(output_folder, new_name))
        current_index += 1

    print(f"{prefix}: total frames after duplication = {current_index - 1}")

In [None]:
# Patients whose exported sequence has fewer than 275 frames
underfilled_patients = [
    1001, 1002, 1003, 1008, 1011, 1012, 1014, 1020, 1028, 1029,
    1032, 1036, 1046, 1047, 1051, 1061, 1062, 1064, 1066, 1074,
    1080, 1085, 1103, 1140, 1143, 1144, 1147, 1125, 1145, 1148,
]

# Source (raw PNG export) and destination (fixed-length sequences) roots
base_input = r"D:\coding\SKRIPSI\CNN+LSTM\dataset"
base_output = r"D:\coding\SKRIPSI\CNN+LSTM\generalized_dataset"

# Pad every underfilled patient; folders are named ct_<id> on both sides
for patient_id in underfilled_patients:
    input_folder, output_folder = (
        os.path.join(root, f"ct_{patient_id}") for root in (base_input, base_output)
    )

    print(f"patient {patient_id}")
    insert_duplicates_midway(input_folder, output_folder)

patient 1001
1001: duplicating 54 frames from index 83 to 136
1001: total frames after duplication = 275
patient 1002
1002: duplicating 138 frames from index -1 to 136
1002: total frames after duplication = 275
patient 1003
1003: duplicating 69 frames from index 69 to 137
1003: total frames after duplication = 275
patient 1008
1008: duplicating 69 frames from index 69 to 137
1008: total frames after duplication = 275
patient 1011
1011: duplicating 69 frames from index 69 to 137
1011: total frames after duplication = 275
patient 1012
1012: duplicating 69 frames from index 69 to 137
1012: total frames after duplication = 275
patient 1014
1014: duplicating 69 frames from index 69 to 137
1014: total frames after duplication = 275
patient 1020
1020: duplicating 69 frames from index 69 to 137
1020: total frames after duplication = 275
patient 1028
1028: duplicating 76 frames from index 61 to 136
1028: total frames after duplication = 275
patient 1029
1029: duplicating 69 frames from index 69

### Reducing (above 275)

In [None]:
def reduce_frames(input_folder, output_folder, target_frames=275):
    """Trim a patient's PNG sequence to ``target_frames`` by dropping frames at both ends.

    The excess is split between the two ends: ``excess // 2`` frames are
    dropped from the start and the remainder from the end. The kept frames
    are copied to ``output_folder`` and renumbered ``<id>_001.png`` onward,
    where ``<id>`` is the input folder's name without its first three
    characters (the ``ct_`` prefix used by the calling cells).

    Args:
        input_folder: Folder containing the source PNG frames.
        output_folder: Folder that receives the kept frames. It is created
            only when frames are actually written.
        target_frames: Number of frames to keep.

    Folders with ``target_frames`` frames or fewer are reported and skipped.
    No output folder is created for them, so they are not mistaken for
    processed patients by later folder comparisons.
    """
    # Get all PNG files sorted by file name
    frames = sorted(f for f in os.listdir(input_folder) if f.endswith('.png'))
    original_count = len(frames)

    if original_count <= target_frames:
        print(f"Skipping {input_folder}, only {original_count} frames.")
        return

    os.makedirs(output_folder, exist_ok=True)

    excess = original_count - target_frames
    remove_start = excess // 2
    remove_end = excess - remove_start

    # Select the kept frames
    kept_frames = frames[remove_start:original_count - remove_end]

    # normpath drops a trailing separator, which would otherwise make basename empty
    patient_prefix = os.path.basename(os.path.normpath(input_folder))[3:]

    # Copy to output with updated indexing
    for i, frame in enumerate(kept_frames, 1):
        src_path = os.path.join(input_folder, frame)
        dst_path = os.path.join(output_folder, f"{patient_prefix}_{i:03}.png")
        shutil.copy(src_path, dst_path)

    print(f"{input_folder}: Reduced from {original_count} to {target_frames} frames.")

In [21]:
# Patients whose exported sequence has more than 275 frames
overfilled_indexes = [
    1015, 1037, 1105, 1111, 1129, 1141, 1150, 1158, 1178, 1132,
    1135, 1063, 1072, 1078, 1083, 1101, 1116, 1117, 1128,
]

# Source (raw PNG export) and destination (fixed-length sequences) roots
input_base = r"D:\coding\SKRIPSI\CNN+LSTM\dataset"
output_base = r"D:\coding\SKRIPSI\CNN+LSTM\generalized_dataset"

# Trim every overfilled patient; folders are named ct_<id> on both sides
for idx in overfilled_indexes:
    input_folder, output_folder = (
        os.path.join(root, f"ct_{idx}") for root in (input_base, output_base)
    )
    reduce_frames(input_folder, output_folder)

D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1015: Reduced from 400 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1037: Reduced from 383 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1105: Reduced from 334 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1111: Reduced from 344 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1129: Reduced from 400 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1141: Reduced from 369 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1150: Reduced from 324 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1158: Reduced from 340 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1178: Reduced from 315 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1132: Reduced from 344 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1135: Reduced from 306 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1063: Reduced from 310 to 275 frames.
D:\coding\SKRIPSI\CNN+LSTM\dataset\ct_1072: Reduced from 382 to 275 frames.
D:\coding\SK

### Remaining Dataset

In [None]:
# Raw PNG export and the folder holding the already length-adjusted patients
dataset_path = r"D:\coding\SKRIPSI\CNN+LSTM\dataset"
generalized_path = r"D:\coding\SKRIPSI\CNN+LSTM\generalized_dataset"

# Get subfolder names from both paths. The two variables defined above are
# the ones to scan; using them keeps the cell runnable on a fresh kernel.
# The `with` blocks close the directory iterators once the names are collected.
with os.scandir(dataset_path) as entries:
    original_folders = {entry.name for entry in entries if entry.is_dir()}
with os.scandir(generalized_path) as entries:
    processed_folders = {entry.name for entry in entries if entry.is_dir()}

# Find unprocessed folders: present in the raw export but not yet in the generalized set
unprocessed_folders = sorted(original_folders - processed_folders)

# Print result
print(f"Total unprocessed folders: {len(unprocessed_folders)}")
for folder in unprocessed_folders:
    print(folder)

Total unprocessed folders: 23
ct_1005
ct_1007
ct_1010
ct_1018
ct_1019
ct_1025
ct_1033
ct_1035
ct_1041
ct_1050
ct_1067
ct_1070
ct_1079
ct_1092
ct_1099
ct_1109
ct_1113
ct_1119
ct_1120
ct_1127
ct_1133
ct_1139
ct_1146


In [11]:
# Patient folders that were neither padded nor trimmed
remaining_patient = [
    "ct_1005", "ct_1007", "ct_1010", "ct_1018", "ct_1019", "ct_1025",
    "ct_1033", "ct_1035", "ct_1041", "ct_1050", "ct_1067", "ct_1070",
    "ct_1079", "ct_1092", "ct_1099", "ct_1109", "ct_1113", "ct_1119",
    "ct_1120", "ct_1127", "ct_1133", "ct_1139", "ct_1146",
]

# Report how many PNG frames each remaining patient folder holds
for folder in remaining_patient:
    # A dedicated name keeps the module-level `folder_path` (the NIfTI root
    # read by count_frames_per_category) from being overwritten here.
    patient_folder_path = os.path.join(dataset_path, folder)
    num_files = len([f for f in os.listdir(patient_folder_path) if f.endswith('.png')])
    print(f"{folder}: {num_files} frames")

ct_1005: 275 frames
ct_1007: 275 frames
ct_1010: 275 frames
ct_1018: 275 frames
ct_1019: 275 frames
ct_1025: 275 frames
ct_1033: 275 frames
ct_1035: 275 frames
ct_1041: 275 frames
ct_1050: 275 frames
ct_1067: 275 frames
ct_1070: 275 frames
ct_1079: 275 frames
ct_1092: 275 frames
ct_1099: 275 frames
ct_1109: 275 frames
ct_1113: 275 frames
ct_1119: 275 frames
ct_1120: 275 frames
ct_1127: 275 frames
ct_1133: 275 frames
ct_1139: 275 frames
ct_1146: 275 frames


In [None]:
# Copy the untouched patient folders into the generalized dataset as they are
for folder in remaining_patient:
    src = os.path.join(dataset_path, folder)
    dst = os.path.join(generalized_path, folder)

    # Never overwrite a destination that is already present
    if os.path.exists(dst):
        print(f"Already exists: {folder}")
        continue

    shutil.copytree(src, dst)
    print(f"Copied: {folder}")

Copied: ct_1005
Copied: ct_1007
Copied: ct_1010
Copied: ct_1018
Copied: ct_1019
Copied: ct_1025
Copied: ct_1033
Copied: ct_1035
Copied: ct_1041
Copied: ct_1050
Copied: ct_1067
Copied: ct_1070
Copied: ct_1079
Copied: ct_1092
Copied: ct_1099
Copied: ct_1109
Copied: ct_1113
Copied: ct_1119
Copied: ct_1120
Copied: ct_1127
Copied: ct_1133
Copied: ct_1139
Copied: ct_1146


## DATASET CONCLUSION

In [13]:
# Count patient folders and PNG files in the generalized dataset
total_folders = 0
total_images = 0

for folder in os.listdir(generalized_path):
    # A dedicated name keeps the module-level `folder_path` (the NIfTI root
    # read by count_frames_per_category) from being overwritten here.
    patient_folder_path = os.path.join(generalized_path, folder)
    if not os.path.isdir(patient_folder_path):
        continue  # only directories count as patients

    total_folders += 1
    image_count = len([f for f in os.listdir(patient_folder_path) if f.endswith(".png")])
    total_images += image_count

print(f"Total folders (patients): {total_folders}")
print(f"Total image files (data): {total_images}")

Total folders (patients): 72
Total image files (data): 19800


## Resize Image (for memory-efficiency)

In [None]:
from tqdm import tqdm

# Input (fixed-length sequences), output (downsized copies) and the output image size
original_dataset_path = r"C:\Users\risuser\Documents\RISET_FATHAN\generalized_dataset"
resized_dataset_path = r"C:\Users\risuser\Documents\RISET_FATHAN\resized_dataset"
target_size = (256, 256)

for patient_folder in tqdm(os.listdir(original_dataset_path)):
    # Only entries named ct_<id> are treated as patient folders
    if patient_folder.startswith("ct_"):
        patient_dir = os.path.join(original_dataset_path, patient_folder)
        patient_id = patient_folder.split("_")[1]

        # Mirror the patient folder under the resized dataset root
        resized_patient_dir = os.path.join(resized_dataset_path, f"ct_{patient_id}")
        os.makedirs(resized_patient_dir, exist_ok=True)

        for img_file in os.listdir(patient_dir):
            img_path = os.path.join(patient_dir, img_file)

            try:
                # Convert to single-channel grayscale, resize, and save under the same file name
                with Image.open(img_path).convert("L") as img:
                    img_resized = img.resize(target_size, Image.LANCZOS)
                    save_path = os.path.join(resized_patient_dir, img_file)
                    img_resized.save(save_path)
            except Exception as e:
                # Report the failing file and carry on with the next one
                print(f"Gagal memproses {img_path}: {e}")

100%|██████████| 72/72 [01:57<00:00,  1.63s/it]
