In [510]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

def collect_npy_data(base_directory):
    """
    Traverse the directory structure and collect paths to .npy files.
    
    Args:
    - base_directory (str): Root directory containing .npy files.
    
    Returns:
    - DataFrame: Contains .npy file path, series_id, and numpy data for each .npy file.
    """
    # List to collect data
    npy_data = []

    # Iterate through all files in the directory
    for file_name in tqdm(os.listdir(base_directory), desc="Processing .npy files"):
        # Check if the current file is a .npy file
        if file_name.endswith('.npy'):
            series_id = int(file_name.split("_")[0])  # Extracting series_id from the filename
            file_path = os.path.join(base_directory, file_name)
            
            # Load numpy data
            npy_array = np.load(file_path)
            
            # Append details to the list
            npy_data.append({
                'npy_path': file_path,
                'series_id': series_id,
                'npy_data': npy_array
            })

    # Convert the list into a DataFrame
    npy_df = pd.DataFrame(npy_data)
    
    return npy_df

# Example usage
base_directory = 'volume_info'  # Replace with your directory path
npy_df = collect_npy_data(base_directory)

npy_df=npy_df.sort_values(by='series_id',axis=0).reset_index(drop=True)

Processing .npy files: 100%|█████████████████████████████████████████████████████████████████████████| 4579/4579 [00:00<00:00, 6333.23it/s]


In [511]:
def get_first_last_indices(array, value):
    """
    Get the first and last index of a value in a numpy array.

    Args:
    - array (numpy.ndarray): The input numpy array.
    - value (int/float): The value to search for.

    Returns:
    - tuple: (first index, last index) normalized by dividing by the total length of the array.
    """
    indices = np.where(array == value)[0]
    if indices.size == 0:
        return [0, 0]
    first_index = indices[0] / len(array)
    last_index = indices[-1] / len(array)
    return [first_index, last_index]

def trim_zero_rows(matrix):
    """
    Remove rows from the top and bottom of the matrix that contain only zeros.
    Stop when a row with a non-zero entry is encountered from both directions.

    Args:
    - matrix (numpy.ndarray): 2D numpy array

    Returns:
    - numpy.ndarray: Trimmed matrix
    """
    # Find the index of the first row from the top that contains a non-zero entry
    first_non_zero_row = np.argmax(np.any(matrix != 0, axis=1))
    
    # Find the index of the first row from the bottom that contains a non-zero entry
    last_non_zero_row = matrix.shape[0] - 1 - np.argmax(np.any(matrix[::-1] != 0, axis=1))
    
    # Slice the matrix between these two rows
    trimmed_matrix = matrix[first_non_zero_row:last_non_zero_row+1]
    
    return trimmed_matrix


# ['liver', 'kidneys', 'spleen', 'bowel']

liver_range_list = []
kidneys_range_list = []
spleen_range_list = []
bowel_range_list = []

bad_liver_indices = []
bad_kidneys_indices = []
bad_spleen_indices = []
bad_bowel_indices = []

series_depth_lengths = []

for i in range(len(npy_df.npy_data)):
    temp = npy_df.npy_data.iloc[i]
    temp = trim_zero_rows(temp)

    series_depth_lengths.append(len(temp[:,0]))
    
    liver_range = get_first_last_indices(temp[:,0], 1)
    kidneys_range = get_first_last_indices(temp[:,1], 1)
    spleen_range = get_first_last_indices(temp[:,2], 1)
    bowel_range = get_first_last_indices(temp[:,3], 1)
    
    # Check if the range of each organ is (0,0) and append the index `i` to the corresponding list
    if liver_range == [0,0]:
        bad_liver_indices.append(i)
    if kidneys_range == [0,0]:
        bad_kidneys_indices.append(i)
    if spleen_range == [0,0]:
        bad_spleen_indices.append(i)
    if bowel_range == [0,0]:
        bad_bowel_indices.append(i)
    
    # Append the range values to the respective lists
    liver_range_list.append(liver_range)
    kidneys_range_list.append(kidneys_range)
    spleen_range_list.append(spleen_range)
    bowel_range_list.append(bowel_range)

bad_cases = np.unique(bad_liver_indices + bad_kidneys_indices + bad_spleen_indices + bad_bowel_indices)

liver_range_list = np.array(liver_range_list)
kidneys_range_list = np.array(kidneys_range_list)
spleen_range_list = np.array(spleen_range_list)
bowel_range_list = np.array(bowel_range_list)

mask = np.ones(liver_range_list.shape[0], dtype=bool)
mask[bad_cases] = False

original_liver_range_list = liver_range_list#[bad_cases, :]
original_kidneys_range_list = kidneys_range_list#[bad_cases, :]
original_spleen_range_list = spleen_range_list#[bad_cases, :]
original_bowel_range_list = bowel_range_list#[bad_cases,:]

liver_range_list = liver_range_list[mask, :]
kidneys_range_list = kidneys_range_list[mask, :]
spleen_range_list = spleen_range_list[mask, :]
bowel_range_list = bowel_range_list[mask,:]


In [512]:
# Initialize a dictionary to store the bad organs for each unique index
bad_organs_dict = {}

for idx in bad_cases:
    is_bad_liver = int(idx in bad_liver_indices)
    is_bad_kidneys = int(idx in bad_kidneys_indices)
    is_bad_spleen = int(idx in bad_spleen_indices)
    is_bad_bowel = int(idx in bad_bowel_indices)
    
    # Store the tuple (or list) for this index
    bad_organs_dict[idx] = (is_bad_liver, is_bad_kidneys, is_bad_spleen, is_bad_bowel)


In [513]:
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def generate_datasets(input_organs, target_organ):
    # Convert the organ names to their respective lists
    organ_dict = {
        "liver": liver_range_list,
        "kidneys": kidneys_range_list,
        "spleen": spleen_range_list,
        "bowel": bowel_range_list
    }
    
    # Create input and target datasets
    X = np.hstack([organ_dict[organ] for organ in input_organs])
    y = organ_dict[target_organ]
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

# Define the combinations
combinations = [
    (["liver", "bowel"], "kidneys"),
    (["liver", "spleen", "bowel"], "kidneys"),
    (["liver", "bowel"], "spleen"),
    (["liver", "kidneys", "bowel"], "spleen"),
    (["liver", "kidneys", "spleen"], "bowel"),  
    (["liver", "kidneys"], "bowel"),  
    (["liver"], "bowel"),  
    (["liver"], "kidneys"),
    (["liver"], "spleen")
]

# For each combination
for input_organs, target_organ in combinations:
    X_train, X_test, y_train, y_test = generate_datasets(input_organs, target_organ)
    
    # Initialize the model
    model = RandomForestRegressor(n_estimators=200, random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Save the model
    model_filename = f"model_predicting_{target_organ}_using_{'_'.join(input_organs)}.joblib"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    print(f"Using {input_organs} to predict {target_organ} - Mean Squared Error: {mse}")


Model saved as model_predicting_kidneys_using_liver_bowel.joblib
Using ['liver', 'bowel'] to predict kidneys - Mean Squared Error: 0.008121130851737997
Model saved as model_predicting_kidneys_using_liver_spleen_bowel.joblib
Using ['liver', 'spleen', 'bowel'] to predict kidneys - Mean Squared Error: 0.00597411826090619
Model saved as model_predicting_spleen_using_liver_bowel.joblib
Using ['liver', 'bowel'] to predict spleen - Mean Squared Error: 0.008771595875657612
Model saved as model_predicting_spleen_using_liver_kidneys_bowel.joblib
Using ['liver', 'kidneys', 'bowel'] to predict spleen - Mean Squared Error: 0.006961597896065943
Model saved as model_predicting_bowel_using_liver_kidneys_spleen.joblib
Using ['liver', 'kidneys', 'spleen'] to predict bowel - Mean Squared Error: 0.003018561741056083
Model saved as model_predicting_bowel_using_liver_kidneys.joblib
Using ['liver', 'kidneys'] to predict bowel - Mean Squared Error: 0.0034926491443415964
Model saved as model_predicting_bowel_u

In [514]:
import joblib

models_dict = {}

# Directory where your models are saved
models_directory = os.getcwd()

for input_organs, target_organ in combinations:
    model_filename = f"model_predicting_{target_organ}_using_{'_'.join(input_organs)}.joblib"
    model_path = os.path.join(models_directory, model_filename)
    
    # Load the model and save to the dictionary
    model = joblib.load(model_path)
    key = (tuple(input_organs), target_organ)
    models_dict[key] = model

In [515]:
def get_data_for_index(index, available_organs):
    """
    Extracts and returns the data corresponding to the given index for the specified organs.
    """
    data_list = []
    
    if "liver" in available_organs:
        data_list.append(original_liver_range_list[index])
    if "kidneys" in available_organs:
        data_list.append(original_kidneys_range_list[index])
    if "spleen" in available_organs:
        data_list.append(original_spleen_range_list[index])
    if "bowel" in available_organs:
        data_list.append(original_bowel_range_list[index])
    
    # Combine the data as required
    combined_data = np.hstack(data_list)
    
    return combined_data

In [516]:
# Dictionary to store predictions
predictions = {}

# Iterate through the bad_organs_dict
for index, bad_organs in bad_organs_dict.items():
    # Convert bad_organs to a list of organ names
    available_organs = []
    for i, organ in enumerate(['liver', 'kidneys', 'spleen', 'bowel']):
        if bad_organs[i] == 0:  # If the organ is available
            available_organs.append(organ)
    
    # For each organ, if it's missing, try to predict it using the available organs
    for i, organ in enumerate(['liver', 'kidneys', 'spleen', 'bowel']):
        if bad_organs[i] == 1:  # If the organ is missing
            key = (tuple(available_organs), organ)
            model = models_dict.get(key)
            if model:
                data = get_data_for_index(index, available_organs)
                prediction = model.predict([data])
                predictions[(index, organ)] = prediction[0]

In [517]:
# Assuming predictions_dict is the dictionary where you've saved the predictions
# For example: predictions_dict = {(11, 'kidneys'): array([0.71556257, 0.94957948]), ...}
for (index, organ), prediction in predictions.items():
    if organ == 'kidneys':
        original_kidneys_range_list[index] = prediction
    elif organ == 'liver':
        original_liver_range_list[index] = prediction
    elif organ == 'spleen':
        original_spleen_range_list[index] = prediction
    elif organ == 'bowel':
        original_bowel_range_list[index] = prediction


In [518]:
definite_kidneys_range_list = original_kidneys_range_list.T*series_depth_lengths
definite_kidneys_range_list = definite_kidneys_range_list.T

definite_liver_range_list = original_liver_range_list.T*series_depth_lengths
definite_liver_range_list = definite_liver_range_list.T

definite_spleen_range_list = original_spleen_range_list.T*series_depth_lengths
definite_spleen_range_list = definite_spleen_range_list.T

definite_bowel_range_list = original_bowel_range_list.T*series_depth_lengths
definite_bowel_range_list = definite_bowel_range_list.T


In [398]:
import numpy as np
from PIL import Image
import cv2

In [451]:
def process_channel(channel):
    # 1. Thresholding
    min_HU = 55  # You'll need to adjust these values based on your dataset
    max_HU = 250
    _, thresh = cv2.threshold(channel, min_HU, max_HU, cv2.THRESH_BINARY)

    thresh = cv2.medianBlur(thresh, 5)
    # 2. Morphological Operations
    kernel = np.ones((3,3), np.uint8)
    dilated = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)

    num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(dilated, 8, cv2.CV_32S)
    areas = stats[:,-1]
    
    # Exclude the background label and get the two largest areas
    largest_indices = np.argsort(areas[1:])[-2:] + 1
    
    connected = np.zeros_like(dilated, np.uint8)
    for idx in largest_indices:
        connected[labels == idx] = 255
    # 5. Masking
    result_channel = cv2.bitwise_and(channel, connected)
    result_channel[result_channel > 210] = 0

    return result_channel

def process_image(image_path):
    # Read image
    image = cv2.imread(image_path)

    # Split into RGB channels
    r, g, b = cv2.split(image)
    
    # Process each channel
    r_processed = process_channel(r)
    g_processed = process_channel(g)
    b_processed = process_channel(b)
    
    # Merge processed channels
    processed_image = cv2.merge([r_processed, g_processed, b_processed])

    return processed_image

def batch_process_image(filepaths, output_dir):
    """
    Process and save images to the specified output directory.
    
    Args:
    - filepaths (list): List of file paths to process.
    - output_dir (str): Directory where the processed images will be saved.
    """
    for filepath in filepaths:
        processed_image = process_image(filepath)
        
        # Construct output path
        output_path = os.path.join(output_dir, os.path.basename(filepath))
        
        # Save the processed image
        cv2.imwrite(output_path, processed_image)
        print(f"Processed image saved to {output_path}")

In [519]:
import os
import cv2
import pandas as pd
from tqdm import tqdm

def collect_jpg_data(base_directory):
    """
    Traverse the directory structure and collect paths to .jpg files.
    
    Args:
    - base_directory (str): Root directory containing subdirectories of .jpg files.
    
    Returns:
    - DataFrame: Contains .jpg file path, series_id, and image data for each .jpg file.
    """
    # List to collect data
    jpg_data = []

    # Iterate through all subdirectories in the base directory
    for sub_dir in tqdm(os.listdir(base_directory)):
        sub_dir_path = os.path.join(base_directory, sub_dir)
        
        if os.path.isdir(sub_dir_path):
            series_id = int(sub_dir)  # Assuming the subdirectory name is the series_id
            
            # Iterate through all files in the subdirectory
            for file_name in os.listdir(sub_dir_path):
                # Check if the current file is a .jpg file
                if file_name.endswith('.jpg'):
                    file_path = os.path.join(sub_dir_path, file_name)

                    name = file_name.split('.')[0]
                    # Append details to the list
                    jpg_data.append({
                        'jpg_path': file_path,
                        'series_id': series_id,
                        'file_name': name
                    })

    # Convert the list into a DataFrame
    jpg_df = pd.DataFrame(jpg_data)
    
    return jpg_df

# Example usage
base_directory = 'volume_images'  # Replace with your directory path
jpg_df = collect_jpg_data(base_directory)
jpg_df=jpg_df.sort_values(by='series_id',axis=0).reset_index(drop=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 4579/4579 [00:00<00:00, 10324.89it/s]


In [525]:
base_path = 'series_image_split' 

if not os.path.exists(base_path):
    os.makedirs(base_path)

In [546]:
def extract_and_save_images_from_paths(series_id, depth_range, organ, filepaths, filenames, base_path):
    """
    Extracts and saves images based on depth ranges for each organ.
    
    Args:
    - series_id (str): The ID of the series.
    - depth_ranges (list of tuples): The depth ranges of the organ in the format [(start1, end1), (start2, end2), ...].
    - organ (str): The name of the organ (e.g., "liver", "kidneys").
    - filepaths (list of str): List of paths to the image files.
    - base_path (str): Base directory to save the extracted images.
    """
    series_path = os.path.join(base_path, series_id)
    organ_path = os.path.join(series_path, organ)
    
    if not os.path.exists(organ_path):
        os.makedirs(organ_path)

    start, end = depth_range
    if organ == 'bowel' and end > 250:
        start = start + 30
        
    for i in range(len(filepaths)):
        filepath = filepaths[i]
        filename = filenames[i]
        # Extracting the filename without extension            
        # Check if the filename is a number and within the specified depth range
        if filename.isdigit() and start <= int(filename) <= end:

            img = Image.open(filepath)
            save_path = os.path.join(organ_path, f"{filename}.jpg")
            img.save(save_path)

In [548]:
for i in tqdm(range(len(series_list))):
    series_list = np.unique(jpg_df.series_id)
    curr_id = series_list[i]
    jpg_path_list = np.array(jpg_df[jpg_df.series_id==curr_id].jpg_path)
    jpg_name_list = np.array(jpg_df[jpg_df.series_id==curr_id].file_name)
    
    extract_and_save_images_from_paths(str(curr_id), definite_kidneys_range_list[i], 'kidneys', jpg_path_list, jpg_name_list, base_path)
    extract_and_save_images_from_paths(str(curr_id), definite_bowel_range_list[i], 'bowel', jpg_path_list, jpg_name_list, base_path)
    extract_and_save_images_from_paths(str(curr_id), definite_liver_range_list[i], 'liver', jpg_path_list, jpg_name_list, base_path)
    extract_and_save_images_from_paths(str(curr_id), definite_spleen_range_list[i], 'spleen', jpg_path_list, jpg_name_list, base_path)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4579/4579 [09:07<00:00,  8.37it/s]


In [550]:
import os

base_folder = "series_image_split"  # Replace with the actual path to your base folder

# List of organs
organs = ["kidneys", "liver", "spleen", "bowel"]

# Dictionary to store the counts
distribution = {}

# Iterate through each series_id folder
for series_id in os.listdir(base_folder):
    series_path = os.path.join(base_folder, series_id)
    
    # Ensure it's a directory and not a file
    if os.path.isdir(series_path):
        distribution[series_id] = {}
        
        # Iterate through each organ folder under the current series_id folder
        for organ in organs:
            organ_path = os.path.join(series_path, organ)
            
            # Check if the organ directory exists
            if os.path.exists(organ_path):
                # Count the number of image files in the organ directory
                count = sum([1 for file in os.listdir(organ_path) if file.endswith('.jpg') or file.endswith('.png')])  # Assuming images are in .jpg or .png format
                distribution[series_id][organ] = count
            else:
                distribution[series_id][organ] = 0

In [556]:
distribution

{'17473': {'kidneys': 36, 'liver': 41, 'spleen': 21, 'bowel': 52},
 '64331': {'kidneys': 6, 'liver': 15, 'spleen': 9, 'bowel': 22},
 '7246': {'kidneys': 23, 'liver': 30, 'spleen': 25, 'bowel': 66},
 '2797': {'kidneys': 11, 'liver': 28, 'spleen': 16, 'bowel': 44},
 '4964': {'kidneys': 28, 'liver': 27, 'spleen': 17, 'bowel': 78},
 '12474': {'kidneys': 43, 'liver': 60, 'spleen': 23, 'bowel': 88},
 '13781': {'kidneys': 0, 'liver': 15, 'spleen': 8, 'bowel': 13},
 '39625': {'kidneys': 71, 'liver': 51, 'spleen': 57, 'bowel': 85},
 '10981': {'kidneys': 41, 'liver': 50, 'spleen': 31, 'bowel': 80},
 '41330': {'kidneys': 13, 'liver': 27, 'spleen': 8, 'bowel': 19},
 '17227': {'kidneys': 39, 'liver': 72, 'spleen': 36, 'bowel': 87},
 '36813': {'kidneys': 18, 'liver': 22, 'spleen': 8, 'bowel': 51},
 '40255': {'kidneys': 36, 'liver': 36, 'spleen': 23, 'bowel': 73},
 '63418': {'kidneys': 25, 'liver': 24, 'spleen': 19, 'bowel': 70},
 '33395': {'kidneys': 9, 'liver': 20, 'spleen': 7, 'bowel': 8},
 '4127'

In [552]:
import matplotlib.pyplot as plt

# Assuming you already have the 'distribution' dictionary from the previous code snippet

# Extract data for plotting
series_ids = list(distribution.keys())
kidneys_counts = [distribution[sid]['kidneys'] for sid in series_ids]
liver_counts = [distribution[sid]['liver'] for sid in series_ids]
spleen_counts = [distribution[sid]['spleen'] for sid in series_ids]
bowel_counts = [distribution[sid]['bowel'] for sid in series_ids]


In [None]:
20*3

In [567]:
4 inputs *15 images

60

In [568]:
np.mean(kidneys_counts)

21.992138021402052

In [561]:
np.mean(liver_counts)

31.748635073160077

In [563]:
np.mean(spleen_counts)

17.555143044332823

In [565]:
np.mean(bowel_counts)

51.67613015942346