In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [73]:
def find_files(folder_path):
    """
    Recursively find all files in a folder and its subfolders.
    
    Args:
    - folder_path (str): Path to the folder to search.
    
    Returns:
    - file_paths (list): List of paths to all files found.
    """
    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_paths.append(os.path.join(root, file))
    return file_paths

In [74]:
#find all files in RIPS/Original/ folder
files = find_files("databases/RIPS/Original")
print("Found", len(files), "files.")

#create a dataframe which contains the file paths
df = pd.DataFrame(files, columns=["file_path"])
#replace all backslashes with forward slashes
df['file_path'] = df['file_path'].apply(lambda x: x.replace("\\", "/"))
#add a column for the patient ID
df['patient_id'] = df['file_path'].apply(lambda x: x.split("/")[-4].split(".")[0])


Found 120 files.


In [75]:
#groub by patient ID and count the number of files for each patient, add the count as a new column
df_count = df.groupby("patient_id").count().reset_index()
df_count_n_rows = len(df_count.index)
print("Found", df_count_n_rows, "unique patients.")

Found 4 unique patients.


In [76]:
df_count.head()

Unnamed: 0,patient_id,file_path
0,subject1,30
1,subject2,30
2,subject3,30
3,subject4,30


In [77]:
train_portion = 0.7
test_portion = 0.1
validation_portion = 0.2
#find the closest number of files to the train portion
train_count = int(train_portion * len(files))
test_count = int(test_portion * len(files))
validation_count = int(validation_portion * len(files))

all_counts = np.array([train_count, test_count, validation_count])
#create a dataframe with all possible patient ID combinations
df_combinations = pd.DataFrame(columns=["patient_id", "train", "test", "validation"])


In [78]:
#normalize the counts in df_count
df_count["weight"] = df_count["file_path"] / df_count["file_path"].sum()
#sort the dataframe by the weight
df_count = df_count.sort_values("weight", ascending=False)

df_count.head()

def assign_patients(file_counts, ideal_partition_counts):
    #sort the ideal partition counts in descending order
    ideal_partition_counts = sorted(ideal_partition_counts, reverse=True)
    dynamic_counts = [0 for _ in range(len(ideal_partition_counts))]
    #sort the file counts in descending order
    file_counts = sorted(file_counts, reverse=True)
    partitions = [f'partition_{i}' for i in range(len(ideal_partition_counts))]
    #check if there is an empty partition
    df_assigned = pd.DataFrame(columns=["count", *partitions])
    while len(file_counts) > 0:
        #calculate the difference between the ideal partition counts and the current partition counts
        diff = np.array(ideal_partition_counts) - np.array(dynamic_counts)
        
        #find the index of the partition with the largest difference
        partition_index = np.argmax(diff)
        #check if there are empty partitions
        if min(dynamic_counts) == 0:
            partition_index = np.argmin(dynamic_counts)
        
        #find the index of the patient with the largest count
        count_index = np.argmax(file_counts)
        #assign the patient to the partition with the largest difference
        dynamic_counts[partition_index] += file_counts[count_index]
        print(dynamic_counts)
        #add the patient to the partition as a new row
        zero_list = [0 for _ in range(len(partitions))]
        df_assigned.loc[len(df_assigned.index)] = [file_counts[count_index], *zero_list ]
        df_assigned.loc[len(df_assigned.index)-1, partitions[partition_index]] = 1
        #remove the count from the list
        file_counts.pop(count_index)
    #print(dynamic_counts)
    print(diff)
    return df_assigned



In [79]:
#pass the data to the partitioning function
print(all_counts)
df_assigned = assign_patients(list(df_count["file_path"]), all_counts)
df_assigned.head()

[84 12 24]
[30, 0, 0]
[30, 30, 0]
[30, 30, 30]
[60, 30, 30]
[ 54  -6 -18]


Unnamed: 0,count,partition_0,partition_1,partition_2
0,30,1,0,0
1,30,0,1,0
2,30,0,0,1
3,30,1,0,0


Todo:
1. loa