Load data

In [53]:
import pandas as pd
import os

# Define the base directory where the files are located
base_dir = '/home/chen/out'

# Initialize an empty DataFrame with the specified columns
columns = ['pretraining_method', 'arch', 'target_task', 'fold', 'score_type', 'score']
df = pd.DataFrame(columns=columns)

# Function to extract information from the file path and name
def extract_info_from_path(file_path):
    parts = file_path.split(os.sep)
    folder_name = parts[-2]  # Get the folder name which contains the csv file
    pretraining_method, arch, target_task,fold = folder_name.split('.')
    fold=int(fold[4:])
    score_type = os.path.basename(file_path).split('_')[-3]  # Extract 'dice' or 'nsd' from the filename
    return pretraining_method, arch, target_task, fold, score_type

# Function to process each CSV file and append its data to the DataFrame
def process_csv_file(file_path):
    global df  # Use the DataFrame defined outside the function
    pretraining_method, arch, target_task, fold, score_type = extract_info_from_path(file_path)
    temp_df = pd.read_csv(file_path)
    new_rows = [] 
    for index, row in temp_df.iterrows():
        for column in temp_df.columns[2:]:
            score = row[column]  # Get the score from the current column
            patient_name = row['name']  # Assuming 'name' column exists and contains patient names
            part_name = column  # The current column name is the part name
            # Create a new dictionary for the row to be added
            new_row = {
                'pretraining_method': pretraining_method,
                'arch': arch,
                'target_task': target_task,
                'fold': fold,
                'score_type': score_type,
                'score': score,
                'patient_name': patient_name,  # Added patient_name
                'part_name': part_name  # Added part_name
            }
            # Append the new row dictionary to the list of new rows
            new_rows.append(new_row)
        # Convert the list of new rows into a DataFrame and concatenate it with the existing DataFrame
    new_rows_df = pd.DataFrame(new_rows)
    df = pd.concat([df, new_rows_df], ignore_index=True)

# Walk through the directory and process each CSV file
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.csv'):
            file_path = os.path.join(root, file)
            process_csv_file(file_path)

# Now df contains all the data from the CSV files
print(df.head())  # Display the first few rows of the DataFrame

  df = pd.concat([df, new_rows_df], ignore_index=True)


  pretraining_method       arch target_task fold score_type     score  \
0             suprem  segresnet   vertebrae    1       dice       NaN   
1             suprem  segresnet   vertebrae    1       dice       NaN   
2             suprem  segresnet   vertebrae    1       dice       NaN   
3             suprem  segresnet   vertebrae    1       dice       NaN   
4             suprem  segresnet   vertebrae    1       dice  0.160671   

  patient_name     part_name  
0        s0322  vertebrae_L5  
1        s0322  vertebrae_L4  
2        s0322  vertebrae_L3  
3        s0322  vertebrae_L2  
4        s0322  vertebrae_L1  


In [54]:
print(df.shape)
df.dropna(inplace=True)
print(df.shape)

(375680, 8)
(226220, 8)


Variance of score

In [64]:
fold_scores = df.groupby(['pretraining_method', 'arch', 'target_task', 'score_type',"fold"])['score'].mean().reset_index()
fold_scores

Unnamed: 0,pretraining_method,arch,target_task,score_type,fold,score
0,scratch,segresnet,cardiac,dice,1,0.890586
1,scratch,segresnet,cardiac,dice,2,0.877713
2,scratch,segresnet,cardiac,dice,3,0.884836
3,scratch,segresnet,cardiac,dice,4,0.890934
4,scratch,segresnet,cardiac,dice,5,0.892186
...,...,...,...,...,...,...
75,suprem,segresnet,vertebrae,nsd,1,0.718181
76,suprem,segresnet,vertebrae,nsd,2,0.705251
77,suprem,segresnet,vertebrae,nsd,3,0.719495
78,suprem,segresnet,vertebrae,nsd,4,0.730923


In [65]:
std_scores=fold_scores.groupby(['pretraining_method', 'arch', 'target_task', 'score_type'])['score'].std().reset_index()
std_scores['score'] = std_scores['score'].mul(100).round(1)
std_scores

Unnamed: 0,pretraining_method,arch,target_task,score_type,score
0,scratch,segresnet,cardiac,dice,0.6
1,scratch,segresnet,cardiac,nsd,1.0
2,scratch,segresnet,muscles,dice,0.4
3,scratch,segresnet,muscles,nsd,2.4
4,scratch,segresnet,organs,dice,0.6
5,scratch,segresnet,organs,nsd,0.8
6,scratch,segresnet,vertebrae,dice,1.1
7,scratch,segresnet,vertebrae,nsd,0.9
8,suprem,segresnet,cardiac,dice,1.0
9,suprem,segresnet,cardiac,nsd,1.6


In [66]:
dice_std=std_scores[std_scores['score_type']=='dice']
dice_std

Unnamed: 0,pretraining_method,arch,target_task,score_type,score
0,scratch,segresnet,cardiac,dice,0.6
2,scratch,segresnet,muscles,dice,0.4
4,scratch,segresnet,organs,dice,0.6
6,scratch,segresnet,vertebrae,dice,1.1
8,suprem,segresnet,cardiac,dice,1.0
10,suprem,segresnet,muscles,dice,0.3
12,suprem,segresnet,organs,dice,0.9
14,suprem,segresnet,vertebrae,dice,1.1


Average of score

In [58]:
average_scores = df.groupby(['pretraining_method', 'arch', 'target_task', 'score_type'])['score'].mean().reset_index()
# average_scores

In [59]:
average_scores['score'] = average_scores['score'].mul(100).round(1)


In [60]:
dice_scores = average_scores[average_scores['score_type'] == "dice"]
dice_scores

Unnamed: 0,pretraining_method,arch,target_task,score_type,score
0,scratch,segresnet,cardiac,dice,88.7
2,scratch,segresnet,muscles,dice,93.5
4,scratch,segresnet,organs,dice,89.1
6,scratch,segresnet,vertebrae,dice,85.0
8,suprem,segresnet,cardiac,dice,89.0
10,suprem,segresnet,muscles,dice,94.0
12,suprem,segresnet,organs,dice,89.5
14,suprem,segresnet,vertebrae,dice,86.5
