In [25]:
import os
import pandas as pd

# List of folder names
folders = ['20230809_151846', '20230809_175330', '20230809_200646', '20230810_021129', '20230810_102445']  # Replace with your list

# Base directory (adjust as needed)
base_dir = '/home/fabspace/Documents/EzerM/Malaria_FYP/malaria-detection-in-blood-samples-main/Experiments_log'

# List to collect all dataframes
dfs = []

# Traverse each folder
for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    
    # Check if folder_path is indeed a directory (additional safety step)
    if os.path.isdir(folder_path):
        # Get all CSVs that match the criterion
        for file in os.listdir(folder_path):
            if file.startswith('predictions_record') and file.endswith('.csv'):
                # Construct full path
                file_path = os.path.join(folder_path, file)
                
                # Read CSV into dataframe
                df = pd.read_csv(file_path)
                
                # Add a new column with folder's name
                df['folder_name'] = folder
                
                # Add to our list of dataframes
                dfs.append(df)

# Concatenate all dataframes in the list into one master dataframe
master_df = pd.concat(dfs, ignore_index=True)

# Now master_df contains all the data along with folder names
print(master_df)


     seed FASt-Mal-Code                    Diagnosis  true label  \
0       4    211217-4r1       Severe Malaria Anaemia           1   
1       4     151218-03       Severe Malaria Anaemia           1   
2       4     170418-37       No Malaria, No Anaemia           0   
3       4    300519-9r1          No Malaria, Anaemia           0   
4       4     130219-05       Severe Malaria Anaemia           1   
..    ...           ...                          ...         ...   
395    42     060619-08          No Malaria, Anaemia           0   
396    42     260618-44       No Malaria, No Anaemia           0   
397    42     270219-12  Malaria, Anaemia, No Severe           0   
398    42    090618-1r1       Severe Malaria Anaemia           1   
399    42     140818-10       Severe Malaria Anaemia           1   

     predicted_label  predicted probability      folder_name  
0                  0           3.522053e-01  20230809_151846  
1                  0           3.847700e-01  20230809_151

In [26]:
master_df['is_correct'] = (master_df['true label'] == master_df['predicted_label']).astype(int)


In [27]:
master_df

Unnamed: 0,seed,FASt-Mal-Code,Diagnosis,true label,predicted_label,predicted probability,folder_name,is_correct
0,4,211217-4r1,Severe Malaria Anaemia,1,0,3.522053e-01,20230809_151846,0
1,4,151218-03,Severe Malaria Anaemia,1,0,3.847700e-01,20230809_151846,0
2,4,170418-37,"No Malaria, No Anaemia",0,0,3.411042e-01,20230809_151846,1
3,4,300519-9r1,"No Malaria, Anaemia",0,0,3.902018e-01,20230809_151846,1
4,4,130219-05,Severe Malaria Anaemia,1,0,4.413101e-01,20230809_151846,0
...,...,...,...,...,...,...,...,...
395,42,060619-08,"No Malaria, Anaemia",0,1,9.888277e-01,20230810_102445,0
396,42,260618-44,"No Malaria, No Anaemia",0,0,3.268120e-09,20230810_102445,1
397,42,270219-12,"Malaria, Anaemia, No Severe",0,0,2.950221e-02,20230810_102445,1
398,42,090618-1r1,Severe Malaria Anaemia,1,1,9.783053e-01,20230810_102445,1


In [28]:
# Save to a CSV file
master_df.to_csv('master_df_b.csv', index=False)


In [10]:
master_df[master_df['correct']==False].groupby(['FASt-Mal-Code']).count()

Unnamed: 0_level_0,seed,Diagnosis,true label,predicted_label,predicted probability,folder_name,correct
FASt-Mal-Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
030519-12,4,4,4,4,4,4,4
040818-7r1,1,1,1,1,1,1,1
040918-56,14,14,14,14,14,14,14
051118-16,1,1,1,1,1,1,1
060319-02,4,4,4,4,4,4,4
060619-08,4,4,4,4,4,4,4
061118-01,18,18,18,18,18,18,18
080419-4r1,23,23,23,23,23,23,23
080619-07,4,4,4,4,4,4,4
080818-9r1,10,10,10,10,10,10,10


In [15]:
# Assuming you have already added the 'is_correct' column as described before
grouped = master_df.groupby(['FASt-Mal-Code', 'seed'])

# Calculate the mean for the 'is_correct' column for each group
ratios = grouped['is_correct'].mean().reset_index()

# Rename the 'is_correct' column to 'accuracy_ratio' for clarity
ratios = ratios.rename(columns={'is_correct': 'accuracy_ratio'})

print(ratios)


    FASt-Mal-Code  seed  accuracy_ratio
0      020418-2r3     4        1.000000
1       021018-12    21        1.000000
2       021018-12    42        1.000000
3       030519-12    42        0.555556
4      040818-7r1    42        0.888889
..            ...   ...             ...
135     290919-04     4        1.000000
136     290919-10   105        1.000000
137    300118-1r1   105        1.000000
138     300519-03    21        0.111111
139     300519-16   105        0.000000

[140 rows x 3 columns]


In [16]:
ratios.sort_values(['accuracy_ratio'])

Unnamed: 0,FASt-Mal-Code,seed,accuracy_ratio
139,300519-16,105,0.0
26,080419-4r1,42,0.0
19,061118-01,105,0.0
18,061118-01,42,0.0
85,21118-17,42,0.0
...,...,...,...
84,210519-11,42,1.0
35,090818-06,21,1.0
82,200818-2r1,4,1.0
22,080119-47,42,1.0


In [17]:
# Save to a CSV file
ratios.to_csv('accuracy_ratios.csv', index=False)
