In [3]:
import json
import os,re

import config
import cv2
import numpy as np
import SimpleITK as sitk
import torch
import utils as ut
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset



In [25]:
" Count number of Patient available"
elements = os.listdir(config.annotation_folder)
len(elements)

68

In [4]:

raw_imgs_path=config.raw_imgs_path,
box_data_path=config.box_data_path,
annotation_folder=config.annotation_folder

def get_bounding_box_for_patient(data, target_patient_name):
    for entry in data:
        if entry["patient_name"] == target_patient_name:
            bounding_box = entry.get("bounding_box", None)
            return bounding_box
    return None

# # Load data from JSON file
# json_file_path = "/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/annotations_old.json"  # Replace with the actual path to your JSON file
# with open(json_file_path, "r") as json_file:
#     data = json.load(json_file)

# # Example usage:
# target_patient_name = "MAD_102_0"  # Replace with the actual patient name
# bounding_box = get_bounding_box_for_patient(data, target_patient_name)

# if bounding_box is not None:
#     print(f"Bounding box for {target_patient_name}: {bounding_box}")
# else:
#     print(f"No data found for {target_patient_name}")


In [47]:
def parse_annotation( file_path):
    """
    Parse the annotation file and extract relevant information.

    Args:
        file_path (str): Path to the annotation file.

    Returns:
        dict: Parsed annotation data.
    """
    with open(file_path) as file:
        lines = file.read().split("\n")

    # Initialize variables and dictionaries
    annotation_data = {}
    key_frames = set()
    patient_name = lines[1]
    patient_name = (
        patient_name.split("/")[-3]
        + "_"
        + patient_name.split("/")[-2].split("_")[-1]
    )  # to get patientname_0/1/2 if multiple images present for the same patient
    error_list = ["no_error"]

    # Extract error information
    for line in lines:
        error_line = lines[2]
        if error_line.strip() == "":
            error_list = ["no_error"]
        # Check for lines with an asterisk (*) indicating errors
        else:
            error_list = error_line.split("<br>") if "<br>" in error_line else [error_line]
            # Remove leading and trailing whitespace from each error
            error_list = [error.strip() for error in error_list if error.strip() != ""]
       
        if "mv_insert" in line:
            key_frame = line.split(" ")[0]
            key_frames.add(key_frame)


    annotation_data["patient_name"] = patient_name
    annotation_data["flags"] = error_list
    annotation_data["key_frames"] = list(key_frames)
    annotation_data["annotations"] = {}
    json_file_path = "/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/annotations_old.json"  
    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)
    annotation_data["bounding_box"] = list(get_bounding_box_for_patient(data, patient_name))

    for line in lines:
        line = line.strip().split()

        # Check if the line starts with a key frame
        if line and line[0] in key_frames:
            frame = line[0]
            key = line[1]
            values = list(tuple(map(int, line[2:])))

            # Initialize nested dictionaries if necessary
            if frame not in annotation_data["annotations"]:
                annotation_data["annotations"][frame] = {}
            if key not in annotation_data["annotations"][frame]:
                annotation_data["annotations"][frame][key] = []

            # Append values to the appropriate key frame and key
            annotation_data["annotations"][frame][key].append(values)

    # Return the populated annotation data dictionary
    return annotation_data



# parse all the txt files for all patients in a json and write them to a file
def get_annotation_json(folder_path="export"):
    """
    Parse all the text files in the specified folder and return the annotation data.

    Args:
        folder_path (str): Path to the folder containing annotation files.

    Returns:
        list: List of annotation data dictionaries.
    """

    annotation_data_list = []

    for root, dirs, files in os.walk(folder_path):
        # print(files)
        for file in files:
            if file.endswith(".txt"):
                file_path = os.path.join(root, file)
                try:
                    annotation_data = parse_annotation(file_path)
                    print(annotation_data)
                    
                    annotation_data_list.append(annotation_data)
                except Exception as e:
                    Exception("Error parsing annotation file: ", file_path, e)
    
    with open(config.annotation_json, "w") as json_file:
        json.dump(annotation_data_list, json_file, indent=4)


annotation_folder_path = "/Users/giuliamonopoli/Desktop/PhD /deepvalve/AW_MAD-redo_NF_20231121"

get_annotation_json(annotation_folder_path)

In [27]:
def get_kframes_and_annot_from_mhd( list_p,annotation_data_list, dataset="MAD"):
        """
        Retrieves key frames and corresponding annotations from MHD files.

        Args:
            annotation_data_list (list): List of annotation data dictionaries.
            dataset (str, optional): Dataset identifier. Defaults to "MAD".

        Returns:
            tuple: Tuple containing two lists - `lst_of_matrix_imgs` and `lst_of_annotation_imgs`.
                `lst_of_matrix_imgs` (list): List of matrix images.
                `lst_of_annotation_imgs` (list): List of corresponding annotations.
        """
        lst_of_matrix_imgs = []
        lst_of_annotation_imgs = []
        lst_of_error_codes = []
        lst_of_patient_names = []
        raw_imgs_path = config.raw_imgs_path

      
        for patient in list_p:
            print(patient)
        
                
            patient_annotation_data = [
                annotation_data
                for annotation_data in annotation_data_list
                if annotation_data["patient_name"] == patient 
            ][
                0
            ] 
            # print(patient_annotation_data)
            key_frames = patient_annotation_data["key_frames"]
            print(key_frames)
            center_x, center_y, width, height = patient_annotation_data[
                "bounding_box"
            ]

            max_width = max([x["bounding_box"][2] for x in annotation_data_list])
            max_height = max([x["bounding_box"][3] for x in annotation_data_list])

            for structure_folder in os.listdir(
                os.path.join(raw_imgs_path, patient[:-2])
            ):
                if structure_folder.startswith("LA"):  
                    
                    for file in os.listdir(
                        os.path.join(
                            raw_imgs_path, patient[:-2], structure_folder
                        )
                    ):
                        file_frame = file.split("_")[1].split(".")[0]
                        
                        if file.endswith(".mhd") and file_frame in key_frames:
                            print(file,file_frame)
                        
                            itkimage = sitk.ReadImage(
                                os.path.join(
                                    raw_imgs_path,
                                    patient[:-2],
                                    structure_folder,
                                    file,
                                )
                            )
                            array_img = sitk.GetArrayFromImage(itkimage)

                            # crop array
                            x_min = center_x - width // 2
                            x_max = center_x + width // 2
                            y_min = center_y - height // 2
                            y_max = center_y + height // 2

                            array_img = array_img[(y_min):(y_max), (x_min):(x_max)]

                            # # rescale image
                            array_img = cv2.resize(
                                array_img,
                                (max_width, max_height),
                                interpolation=cv2.INTER_CUBIC,
                            )

                            lst_of_matrix_imgs.append(array_img)
                            lst_of_annotation_imgs.append(
                                patient_annotation_data["annotations"][file_frame]
                            )
                            
                            
                            lst_of_patient_names.append(
                                patient_annotation_data["patient_name"]
                            )
                            
                            

        # assert (
        #     len(lst_of_matrix_imgs)
        #     == len(lst_of_annotation_imgs)
            
        #     == len(lst_of_patient_names)
        # ), "Data length mismatch"
        # return (
        #     lst_of_matrix_imgs,
        #     lst_of_annotation_imgs,
            
        #     lst_of_patient_names,
        # )

In [28]:
with open(config.annotation_json, "r") as json_file:
            annotation_data_list = json.load(json_file)
patients = ['MAD_178_0', 'MAD_149_0', 'MAD_31_0', 'MAD_176_0', 'MAD_182_0', 'MAD_171_0', 'MAD_62_0', 'MAD_96_0', 'MAD_91_0']
get_kframes_and_annot_from_mhd(patients,annotation_data_list)

MAD_178_0
['12', '6']
frame_6.mhd 6
frame_12.mhd 12
MAD_149_0
['5']
frame_5.mhd 5
MAD_31_0
['13', '7']
frame_13.mhd 13
frame_7.mhd 7
MAD_176_0
['12', '19']
frame_19.mhd 19
frame_12.mhd 12
MAD_182_0
['4']
frame_4.mhd 4
MAD_171_0
['5']
frame_5.mhd 5
MAD_62_0
['0', '7']
frame_0.mhd 0
frame_7.mhd 7
MAD_96_0
['17', '7']
frame_17.mhd 17
frame_7.mhd 7
MAD_91_0
['6']
frame_6.mhd 6


In [3]:
import pandas as pd


df = pd.read_csv('/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/data_new.csv')  

# Extract patient names and flags for each patient
patient_names = df['patient_name'].tolist()
flags_list = df[['fp_1', 'fp_2', 'fp_3', 'ff_1', 'ff_2', 'ff_3']].values.tolist()

# Print the results
print("Patient Names:")
print(patient_names)

print("\nFlags List:")
print(flags_list)


Patient Names:
['MAD_178_0', 'MAD_149_0', 'MAD_31_0', 'MAD_176_0', 'MAD_182_0', 'MAD_171_0', 'MAD_62_0', 'MAD_96_0', 'MAD_91_0', 'MAD_65_0', 'MAD_53_0', 'MAD_54_0', 'MAD_39_0', 'MAD_4_0', 'MAD_146_0', 'MAD_3_0', 'MAD_112_0', 'MAD_123_0', 'MAD_63_0', 'MAD_139_0', 'MAD_106_0', 'MAD_108_0', 'MAD_84_0', 'MAD_137_0', 'MAD_130_0', 'MAD_12_0', 'MAD_152_0', 'MAD_109_0', 'MAD_136_0', 'MAD_78_0', 'MAD_107_0', 'MAD_162_0', 'MAD_121_0', 'MAD_92_0', 'MAD_66_0', 'MAD_126_0', 'MAD_95_0', 'MAD_50_0', 'MAD_188_0', 'MAD_1_0', 'MAD_144_0', 'MAD_35_0', 'MAD_175_0', 'MAD_181_0', 'MAD_129_0', 'MAD_127_0', 'MAD_118_0', 'MAD_58_0', 'MAD_120_0', 'MAD_180_0', 'MAD_145_0', 'MAD_142_0', 'MAD_160_0', 'MAD_27_0', 'MAD_167_0', 'MAD_158_0', 'MAD_151_0', 'MAD_169_0', 'MAD_102_0', 'MAD_105_0', 'MAD_73_0', 'MAD_17_0', 'MAD_21_0', 'MAD_161_0', 'MAD_19_0', 'MAD_75_0', 'MAD_132_0', 'MAD_103_0']

Flags List:
[[1, 0, 1, 0, 0, 1], [0, 1, 0, 1, 1, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0], [0, 1, 0, 1, 0, 

In [None]:
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split

from collections import Counter

def split_data():

    " Split data with a new multiclass stratification method " 

    df = pd.read_csv('/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/data_new.csv')  

    # Extract patient names and flags for each patient
    patient_names = df['patient_name'].tolist()
    flags_list = df[['fp_1', 'fp_2', 'fp_3', 'ff_1', 'ff_2', 'ff_3']].values.tolist()
    
    # print(flags_list)
    patient_names = np.array(patient_names)
    patient_names = patient_names.reshape(-1, 1)
    np.random.seed(42)

    flags_list = np.array(flags_list)
    patient_names_train, flags_train, patient_names_test,flags_test = iterative_train_test_split(patient_names, flags_list, test_size = 0.2)
    patient_names_train, flags_train, patient_names_val, flags_val = iterative_train_test_split(patient_names_train, flags_train, test_size = 0.2)

    # return patient_names_train, patient_names_test,flags_train,flags_test
    
    return patient_names_train,patient_names_val, patient_names_test,flags_train,flags_val,flags_test


patient_names_train,patient_names_val, patient_names_test,flags_train,flags_val,flags_test = split_data()

# patient_names_train,patient_names_val, patient_names_test,flags_train,flags_val,flags_test = split_data()
# len(patient_names_test),len(patient_names_train), 14/54


In [70]:
from sklearn.model_selection import train_test_split
from collections import Counter

" Method 1"
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df['patient_name'].tolist(), df[['fp_1', 'fp_3', 'ff_1', 'ff_2', 'ff_3']].values.tolist()

def get_combinations_counts(flags_list):
    class_counts = Counter(map(tuple, flags_list))
    filtered_classes = [list(key) for key, value in class_counts.items() if value > 1]
    one_class = [list(key) for key, value in class_counts.items() if value == 1]
    return filtered_classes, one_class

def split_data(patient_names, flags_list,filtered_classes, one_class, test_size=0.3,step="one", random_state=47):
    flags_list = np.array(flags_list)
    patient_names = np.array(patient_names)

    indices = [index for value in filtered_classes for index, name in enumerate(flags_list) if np.array_equal(name, np.array(value))]
    indices_one = [index for value in one_class for index, name in enumerate(flags_list) if np.array_equal(name, np.array(value))]

    selected_flags = flags_list[indices]
    selected_patients = patient_names[indices]

    patients_one = patient_names[indices_one]
    flags_one = flags_list[indices_one]

    X_train, X_test, y_train, y_test = train_test_split(selected_patients, selected_flags, test_size=test_size, random_state=random_state, stratify=selected_flags)
    
    if step == 'one':
        X_test = np.append(X_test, patients_one)
        y_test = np.append(y_test, flags_one, axis=0)
    elif step == "second":
        np.random.shuffle(patients_one)
        split_index = len(patients_one) // 2
        X_train = np.append(X_train,patients_one[:split_index])
        y_train = np.append(y_train, flags_one[:split_index], axis=0)
        X_test = np.append(X_test,patients_one[:split_index])
        y_test = np.append(y_test, flags_one[split_index:], axis=0)
        

    return X_train, X_test, y_train, y_test

def process_and_split_data(file_path, test_size=0.3, random_state=47):
    patient_names, flags_list = load_data(file_path)
    filtered_classes, one_class = get_combinations_counts(flags_list)
    return split_data(patient_names, flags_list,filtered_classes, one_class, test_size=test_size, random_state=random_state)


X_train, X_test, y_train, y_test = process_and_split_data('/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/data_new.csv')
# filtered_classes_t, one_class_t = get_combinations_counts(y_train)
filtered_classes_t, one_class_t = get_combinations_counts(y_test)

# X_train, X_val, y_train, y_val = split_data(X_train, y_train,filtered_classes_t, one_class_t, test_size=0.3,step="second", random_state=47)
X_test, X_val, y_test, y_val = split_data(X_test, y_test,filtered_classes_t, one_class_t, test_size=0.3,step="second", random_state=47)

print(f"The data has been splitted into {len(X_train)/68} for training, {len(X_val)/68} for validation, and {len(X_test)/68} for test.")


print(f"The data has been splitted into {len(X_train)} for training, {len(X_val)} for validation, and {len(X_test)} for test.")



The data has been splitted into 0.6323529411764706 for training, 0.16176470588235295 for validation, and 0.19117647058823528 for test.
The data has been splitted into 43 for training, 11 for validation, and 13 for test.


In [1]:
" Modification of first method"
from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split
from collections import Counter

" Method 1"
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df['patient_name'].tolist(), df[['fp_1',  'fp_3', 'ff_1', 'ff_2', 'ff_3']].values.tolist()

def get_combinations_counts(flags_list):
    class_counts = Counter(map(tuple, flags_list))
    filtered_classes = [list(key) for key, value in class_counts.items() if value > 1]
    one_class = [list(key) for key, value in class_counts.items() if value == 1]
    return filtered_classes, one_class

def split_data(patient_names, flags_list,filtered_classes, one_class, test_size=0.3,step="one", random_state=47):
    flags_list = np.array(flags_list)
    patient_names = np.array(patient_names)

    indices = [index for value in filtered_classes for index, name in enumerate(flags_list) if np.array_equal(name, np.array(value))]
    indices_one = [index for value in one_class for index, name in enumerate(flags_list) if np.array_equal(name, np.array(value))]

    selected_flags = flags_list[indices]
    selected_patients = patient_names[indices]

    patients_one = patient_names[indices_one]
    flags_one = flags_list[indices_one]

    X_train, X_test, y_train, y_test = train_test_split(selected_patients, selected_flags, test_size=test_size, random_state=random_state, stratify=selected_flags)
    
    if step == 'one':
        X_test = np.append(X_test, patients_one)
        y_test = np.append(y_test, flags_one, axis=0)
    elif step == "second":
        np.random.shuffle(patients_one)
        split_index = len(patients_one) // 2
        X_train = np.append(X_train,patients_one[:split_index])
        y_train = np.append(y_train, flags_one[:split_index], axis=0)
        X_test = np.append(X_test,patients_one[:split_index])
        y_test = np.append(y_test, flags_one[split_index:], axis=0)
        

    return X_train, X_test, y_train, y_test

def process_and_split_data(file_path, test_size=0.25, random_state=47):
    patient_names, flags_list = load_data(file_path)
    filtered_classes, one_class = get_combinations_counts(flags_list)
    return split_data(patient_names, flags_list,filtered_classes, one_class, test_size=test_size, random_state=random_state)


X_train, X_test, y_train, y_test = process_and_split_data('/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/data_new.csv')
# filtered_classes_t, one_class_t = get_combinations_counts(y_train)
filtered_classes_t, one_class_t = get_combinations_counts(y_test)
k_folds = 5 

kfold = KFold(n_splits=k_folds, shuffle=True)
print(len(X_train))
# Start print
print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_ids, test_ids) in enumerate(kfold.split(X_train)):

    # Print
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
   
    print(train_subsampler)
# print(f"The data has been split into {len(X_train)/68} for training, {len(X_val)/68} for validation, and {len(X_test)/68} for test.")

# print(f"The data has been splitted into {len(X_train)/68} for training, {len(X_val)/68} for validation, and {len(X_test)/68} for test.")


# print(f"The data has been splitted into {len(X_train)} for training, {len(X_val)} for validation, and {len(X_test)} for test.")



NameError: name 'pd' is not defined

In [8]:
from sklearn.model_selection import train_test_split
from collections import Counter

" Method 2"
def load_data(file_path):
    df = pd.read_csv(file_path)
    return df['patient_name'].tolist(), df[['fp_1', 'fp_2', 'fp_3', 'ff_1', 'ff_2', 'ff_3']].values.tolist()

def get_combinations_counts(flags_list):
    class_counts = Counter(map(tuple, flags_list))
    filtered_classes = [list(key) for key, value in class_counts.items() if value > 1]
    one_class = [list(key) for key, value in class_counts.items() if value == 1]
    return filtered_classes, one_class

def split_data(patient_names, flags_list,filtered_classes, one_class, test_size=0.3, random_state=47):
    flags_list = np.array(flags_list)
    patient_names = np.array(patient_names)

    indices = [index for value in filtered_classes for index, name in enumerate(flags_list) if np.array_equal(name, np.array(value))]
    indices_one = [index for value in one_class for index, name in enumerate(flags_list) if np.array_equal(name, np.array(value))]

    selected_flags = flags_list[indices]
    selected_patients = patient_names[indices]

    patients_one = patient_names[indices_one]
    flags_one = flags_list[indices_one]

    X_train, X_test, y_train, y_test = train_test_split(selected_patients, selected_flags, test_size=test_size, random_state=random_state, stratify=selected_flags)

    X_train = np.append(X_train, patients_one)
    y_train = np.append(y_train, flags_one, axis=0)

    return X_train, X_test, y_train, y_test

def process_and_split_data(file_path, test_size=0.3, random_state=47):
    patient_names, flags_list = load_data(file_path)
    filtered_classes, one_class = get_combinations_counts(flags_list)
    return split_data(patient_names, flags_list,filtered_classes, one_class, test_size=test_size, random_state=random_state)


X_train, X_test, y_train, y_test = process_and_split_data('/Users/giuliamonopoli/Desktop/PhD /deepvalve/data/data_new.csv')

X_train, X_val, y_train, y_val = KFold(n_splits= 2,  random_state=47)




NameError: name 'KFold' is not defined

In [261]:
flags_train_tuples = [tuple(flag) for flag in y_train]
flags_val_tuples = [tuple(flag) for flag in y_val]
flags_test_tuples = [tuple(flag) for flag in y_test]

print("Train set distribution:", Counter(flags_train_tuples))

# Check class distribution in validation set
print("Validation set distribution:", Counter(flags_val_tuples))

# Check class distribution in test set
print( "Test set distribution:",Counter(flags_test_tuples))

Train set distribution: Counter({(0, 0, 1, 1, 0, 1): 4, (0, 0, 1, 0, 1, 1): 3, (0, 1, 0, 1, 0, 0): 3, (0, 0, 0, 0, 0, 0): 3, (1, 0, 0, 0, 0, 0): 2, (0, 0, 1, 1, 1, 1): 2, (0, 0, 0, 1, 0, 0): 2, (0, 1, 1, 1, 0, 1): 1, (0, 0, 1, 0, 0, 1): 1, (1, 0, 0, 1, 0, 0): 1, (1, 0, 1, 0, 0, 1): 1, (1, 0, 1, 1, 1, 1): 1, (0, 0, 0, 1, 1, 0): 1, (1, 0, 0, 1, 1, 1): 1, (0, 0, 0, 0, 0, 1): 1, (0, 1, 0, 1, 1, 0): 1, (0, 1, 0, 1, 0, 1): 1, (1, 0, 1, 0, 1, 1): 1, (1, 1, 1, 0, 0, 1): 1, (1, 0, 0, 0, 1, 1): 1, (0, 0, 0, 1, 0, 1): 1, (1, 1, 0, 0, 0, 1): 1, (0, 1, 0, 1, 1, 1): 1, (1, 0, 0, 0, 1, 0): 1, (1, 0, 0, 1, 0, 1): 1, (0, 0, 0, 1, 1, 1): 1, (1, 1, 0, 1, 0, 1): 1, (0, 0, 0, 0, 1, 0): 1, (0, 1, 1, 0, 0, 1): 1, (1, 0, 1, 1, 0, 1): 1})
Validation set distribution: Counter({(0, 0, 1, 1, 0, 1): 2, (0, 0, 0, 0, 0, 0): 1, (0, 0, 0, 1, 0, 0): 1, (0, 0, 1, 0, 0, 1): 1, (0, 1, 0, 1, 0, 0): 1, (0, 0, 1, 1, 1, 1): 1, (1, 0, 0, 1, 0, 0): 1, (0, 0, 1, 0, 1, 1): 1, (0, 1, 1, 1, 0, 1): 1})
Test set distribution: Counter

In [337]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

" Combinations distribution within the different sets"
# Count occurrences for each dataset
train_counts = Counter(map(tuple, y_train))
val_counts = Counter(map(tuple, y_val))
test_counts = Counter(map(tuple, y_test))

# Create DataFrames
train_df = pd.DataFrame(list(train_counts.items()), columns=['Combination', 'Count_training'])
val_df = pd.DataFrame(list(val_counts.items()), columns=['Combination', 'Count_val'])
test_df = pd.DataFrame(list(test_counts.items()), columns=['Combination', 'Count_test'])

# Merge DataFrames
merged_df = pd.merge(train_df, val_df, on='Combination', how='outer').merge(test_df, on='Combination', how='outer')

# Fill NaN values with 0
merged_df = merged_df.fillna(0)

# Display the merged DataFrame
print("Merged set distribution:")
display(merged_df)


Merged set distribution:


Unnamed: 0,Combination,Count_training,Count_val,Count_test
0,"(0, 1, 1, 0, 1)",8.0,1.0,2.0
1,"(0, 0, 1, 1, 0)",2.0,0.0,1.0
2,"(1, 1, 1, 1, 1)",1.0,0.0,1.0
3,"(1, 0, 0, 0, 0)",2.0,1.0,0.0
4,"(0, 0, 1, 0, 0)",6.0,1.0,2.0
5,"(1, 0, 1, 1, 1)",1.0,0.0,1.0
6,"(0, 1, 0, 0, 1)",2.0,0.0,1.0
7,"(1, 0, 1, 0, 0)",2.0,0.0,1.0
8,"(0, 0, 0, 0, 0)",4.0,1.0,1.0
9,"(1, 0, 1, 0, 1)",1.0,0.0,1.0


In [26]:

from collections import Counter
from pprint import pprint

# def proportions(flags_test):
#     fp1 = np.sum(flags_test[:,0])
#     fp2 = np.sum(flags_test[:,1])
#     fp3 = np.sum(flags_test[:,2])
#     ff1 = np.sum(flags_test[:,3])
#     ff2 = np.sum(flags_test[:,4])
#     ff3 = np.sum(flags_test[:,5])

#     return (fp1,fp2,fp3,ff1,ff2,ff3)
# fp1,fp2,fp3,ff1,ff2,ff3 = proportions(flags_test)
# fp1t,fp2t,fp3t,ff1t,ff2t,ff3t = proportions(flags_train)
# # fp1v,fp2v,fp3v,ff1v,ff2v,ff3v = proportions(flags_val)
# # trainp =fp1+fp1t+fp1v + fp2 +fp2v+fp2t + fp3+fp3v+fp3t
# # print( " proportions flags in test",proportions(flags_test))
# # print( " proportions flags in train",proportions(flags_train))
# # print( " proportions flags in val",proportions(flags_val))
# (fp1+fp2+fp3)/ len(flags_test)



Counter({(0, 0, 1, 0, 1, 1): 5, (0, 0, 1, 1, 0, 1): 5, (0, 1, 0, 1, 0, 0): 4, (0, 0, 0, 1, 0, 0): 4, (0, 0, 0, 0, 0, 0): 3, (1, 0, 0, 1, 0, 0): 3, (0, 1, 1, 1, 0, 1): 3, (0, 0, 1, 1, 1, 1): 3, (1, 0, 1, 0, 0, 1): 2, (1, 0, 0, 0, 0, 0): 2, (0, 0, 1, 0, 0, 1): 2, (0, 0, 0, 0, 0, 1): 2, (0, 0, 0, 1, 1, 0): 2, (1, 0, 1, 1, 1, 1): 2, (0, 1, 0, 1, 1, 0): 1, (1, 1, 1, 0, 0, 1): 1, (1, 0, 0, 0, 1, 1): 1, (0, 0, 0, 1, 0, 1): 1, (1, 1, 0, 0, 0, 1): 1, (0, 1, 0, 1, 1, 1): 1, (1, 0, 0, 0, 1, 0): 1, (1, 0, 0, 1, 0, 1): 1, (0, 0, 0, 1, 1, 1): 1, (1, 1, 0, 1, 0, 1): 1, (0, 0, 0, 0, 1, 0): 1, (1, 0, 1, 1, 0, 1): 1})
Counter({(0, 0, 1, 1, 0, 1): 3, (0, 0, 0, 0, 0, 0): 3, (1, 0, 0, 1, 1, 1): 2, (0, 1, 0, 1, 0, 0): 1, (0, 1, 0, 1, 0, 1): 1, (1, 0, 1, 0, 1, 1): 1, (0, 0, 1, 1, 1, 1): 1, (1, 0, 0, 0, 0, 0): 1, (0, 1, 1, 0, 0, 1): 1})


In [19]:
# from collections import Counter
# import itertools

# all_combinations = list(itertools.product([0, 1], repeat=6))
# all_combinations = np.array(all_combinations)
# # Initialize counters for each set
# train_counter = Counter()
# val_counter = Counter()
# test_counter = Counter()
# # Count occurrences in each set
# for combination in all_combinations:
#     train_counter[tuple(combination)] = np.sum(np.all(flags_train == combination, axis=1))
#     val_counter[tuple(combination)] = np.sum(np.all(flags_val == combination, axis=1))
#     test_counter[tuple(combination)] = np.sum(np.all(flags_val == combination, axis=1))

# # Calculate proportions
# total_train_samples = len(flags_train)
# total_val_samples = len(flags_val)
# total_test_samples = len(flags_test)

# train_proportions = {k: v / total_train_samples for k, v in train_counter.items()}
# val_proportions = {k: v / total_val_samples for k, v in val_counter.items()}
# test_proportions = {k: v / total_test_samples for k, v in test_counter.items()}

# print("Train Proportions:", train_proportions)
# print("Validation Proportions:", val_proportions)
# print("Test Proportions:", test_proportions)

In [None]:
import matplotlib.pyplot as plt

labels = list(train_proportions.keys())
train_values = list(train_proportions.values())
val_values = list(val_proportions.values())
test_values = list(test_proportions.values())

# Set up the figure
fig, ax = plt.subplots(figsize=(10, 6))

# Plot a pie chart for the train set
ax.pie(train_values, labels=labels, autopct='%1.1f%%', startangle=90, colors=['blue', 'green', 'red', 'purple', 'orange', 'pink'])
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

# Add a title
plt.title('Train Set Class Combinations Proportions')

# Show the plot
plt.show()


In [None]:
import numpy as np


train_val_diff = np.abs(np.array(train_values) - np.array(val_values)) / np.array(train_values) * 100

# Calcola la differenza percentuale tra le proporzioni di train e test
train_test_diff = np.abs(np.array(train_values) - np.array(test_values)) / np.array(train_values) * 100

# Stampa le differenze percentuali
print("Differenza percentuale tra train e validation:", train_val_diff)
print("Differenza percentuale tra train e test:", train_test_diff)


In [22]:
len(patient_names),len(patient_names_train), len(patient_names_val), len(patient_names_test)

patient_names = ['MAD_21_0', 'MAD_31_0', 'MAD_62_0', 'MAD_62_0', 'MAD_91_0', 'MAD_65_0', 'MAD_53_0', 'MAD_39_0', 'MAD_146_0', 'MAD_3_0', 'MAD_112_0', 'MAD_106_0', 'MAD_108_0', 'MAD_84_0', 'MAD_21_0']

target_values = ['MAD_21_0', 'MAD_62_0']

# Find all indices for each target value and flatten the list
indices = [index for value in target_values for index, name in enumerate(patient_names) if name == value]
indices

[0, 14, 2, 3]