In [7]:
import os
import pandas as pd
import ast
import re
import plotly.express as px
import plotly.graph_objects as go

In [8]:
def load_excel_to_dataframe(excel_dir, dataset_name):
    """Load the Excel workbook in ``excel_dir`` whose file name contains ``dataset_name``.

    Args:
        excel_dir: Directory that is searched (non-recursively) for ``.xlsx`` files.
        dataset_name: Substring that the workbook's file name must contain.

    Returns:
        The DataFrame produced by ``pd.read_excel`` for the selected workbook,
        or ``None`` (after printing a message) when no file matches.
    """
    # Sort the matches: os.listdir returns entries in arbitrary order, so
    # "take the first match" would otherwise not be reproducible.
    candidates = sorted(
        name
        for name in os.listdir(excel_dir)
        if name.endswith('.xlsx')
        and dataset_name in name
        # "~$<name>.xlsx" entries are lock files Excel leaves next to an open
        # workbook; they match the filter but are not readable workbooks.
        and not name.startswith('~$')
    )

    if not candidates:
        print(f'No Excel file found containing the dataset name "{dataset_name}" in directory "{excel_dir}".')
        return None

    if len(candidates) > 1:
        # Make the choice visible instead of silently picking one file.
        print(f'Multiple Excel files match "{dataset_name}": {candidates}. Using "{candidates[0]}".')

    file_path = os.path.join(excel_dir, candidates[0])
    return pd.read_excel(file_path)

In [9]:
def fix_format(df, metric):
    """Parse stringified lists in ``df[metric]`` into real Python lists.

    String cells are normalised (whitespace runs become commas, stray commas
    are removed) and evaluated with ``ast.literal_eval``. Non-string cells are
    left untouched.

    Args:
        df: DataFrame holding the metric column. When any cell is converted,
            the column is reassigned on this same object (in-place change).
        metric: Name of the column to convert.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If the first cell of the column is neither a string nor a
            list, or if a string cell cannot be parsed by ``ast.literal_eval``.
    """
    def fix_string_list(s):
        # Turn every whitespace run into a comma so space-separated values
        # become comma-separated.
        s = re.sub(r'\s+', ',', s.strip())
        # Drop the commas this produces right inside the brackets.
        s = s.replace('[,', '[').replace(',]', ']')
        # Input that was already ", "-separated now has doubled commas.
        s = re.sub(r',+', ',', s)
        return s

    def parse_cell(cell):
        # Only strings are parsed; anything else (lists, missing values)
        # passes through instead of crashing literal_eval.
        if isinstance(cell, str):
            return ast.literal_eval(fix_string_list(cell))
        return cell

    values = df[metric]

    # An empty column has nothing to convert (probing row 0 would raise IndexError).
    if values.empty:
        return df

    if not isinstance(values.iloc[0], (str, list)):
        raise ValueError("The metric column is neither a string nor a list")

    # Convert whenever any cell is a string, not just when the first one is,
    # so a column mixing lists and strings is fully normalised.
    if any(isinstance(cell, str) for cell in values):
        df[metric] = values.apply(parse_cell)
    return df

In [10]:
def _metric_axis_range(values, default=(0, 1.1), padding=0.1):
    """Return a ``[low, high]`` axis range covering ``default`` and all of ``values``."""
    low, high = default
    observed = values.dropna()
    if not observed.empty:
        smallest = float(observed.min())
        largest = float(observed.max())
        # Only widen the default window; data inside it keeps the default range.
        if smallest < low:
            low = smallest - padding
        if largest > high:
            high = largest + padding
    return [low, high]


def plotit(df, metric, y_range=None):
    """Show an interactive boxplot of ``metric`` per estimator / inner-selection pair.

    Args:
        df: DataFrame with the columns ``Estimator``, ``Inner_Selection`` and
            ``metric``; each ``metric`` cell is exploded into one row per value.
            The caller's frame is not modified (``explode`` returns a new frame).
        metric: Name of the column holding the metric values.
        y_range: Optional ``(low, high)`` for the y-axis. When omitted, the
            range is ``[0, 1.1]``, widened as needed so that values outside it
            (for example negative scores) are not clipped out of view.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # One row per individual metric value.
    df = df.explode(metric)

    # Exploded cells need an explicit numeric conversion before plotting.
    df[metric] = pd.to_numeric(df[metric])

    # Single label combining estimator and inner selection, used for grouping.
    df['Estimator_Selection'] = df['Estimator'] + ' (' + df['Inner_Selection'] + ')'

    df = df.sort_values('Estimator_Selection')

    if y_range is None:
        y_range = _metric_axis_range(df[metric])

    fig = px.box(df, x='Estimator_Selection', y=metric, color='Estimator_Selection')

    fig.update_layout(
        title=f'Boxplot of {metric.capitalize()} by Estimator and Inner Selection',
        xaxis_title='Estimator and Inner Selection',
        yaxis_title=metric.capitalize(),
        yaxis=dict(range=list(y_range)),
        xaxis=dict(tickangle=-45),
        height=1000,
        width=1500,
        legend_title_text='Estimator Selection'
    )

    fig.show()

In [13]:
# Analysis configuration: which results workbook to load and which metric to plot.
dataset_name = 'epic_lc_ms_pos'
excel_dir = 'Results/'
metric = 'Outer_matthews_corrcoef'


final = load_excel_to_dataframe(excel_dir, dataset_name)
# The loader returns None when no workbook matches; stop here with a clear
# error instead of failing later inside fix_format with an opaque TypeError.
if final is None:
    raise FileNotFoundError(
        f'No results workbook for dataset "{dataset_name}" found in "{excel_dir}".'
    )
final = fix_format(final, metric)

plotit(final, metric)

In [12]:
# Sanity check: show the metric entry at [0] to confirm fix_format produced a Python list of floats.
final.Outer_matthews_corrcoef[0]

[0.375,
 0.6875,
 0.5625,
 0.6,
 0.4666666666666667,
 0.5,
 0.5,
 0.4375,
 0.4666666666666667,
 0.4666666666666667,
 0.625,
 0.5,
 0.4375,
 0.6666666666666666,
 0.4666666666666667,
 0.5625,
 0.625,
 0.625,
 0.6,
 0.6,
 0.5625,
 0.625,
 0.4375,
 0.5333333333333333,
 0.4666666666666667,
 0.5625,
 0.5625,
 0.625,
 0.4666666666666667,
 0.4666666666666667,
 0.5625,
 0.6875,
 0.75,
 0.6,
 0.6,
 0.5,
 0.5625,
 0.5,
 0.6,
 0.6,
 0.5625,
 0.4375,
 0.5625,
 0.7333333333333333,
 0.5333333333333333,
 0.375,
 0.375,
 0.5625,
 0.5333333333333333,
 0.6666666666666666]