## Set the paths to the datasets

In [None]:
import os
import numpy as np
import pycatrobin.data.extract as ex
import pycatrobin.analysis.data_analysis as da

# Paths to the dataset folders, given relative to the notebook's working directory.
path_Rh_loading = "../dataset/refined/alldata_Rh_loading"
path_synth = "../dataset/refined/alldata_synth"
path_temp = "../dataset/refined/alldata_temp"
path_all = "../dataset/refined/alldata"

# Keywords passed as `exclude_keywords` by the plotting cells further down.
# It has to be defined here so that "Restart Kernel -> Run All" does not fail with a NameError.
# NOTE(review): the recorded outputs report "0 files were filtered out", which an
# empty list reproduces -- confirm that no keywords were intended.
exclude_keywords_overall = []

## Define a function that draws a violin plot of each feature's impact

In [2]:
def _load_dataset(path, exclude_keywords):
    """Create a ``DataForML`` from the Excel files under ``path`` and prepare it for target assignment."""
    dataset = ex.DataForML(path=path)
    dataset.find_excel_files()
    dataset.filter_excel_files(exclude_keywords=exclude_keywords, verbose=True)
    dataset.construct_dataframe(extensive=False)
    # Only 0.02 is allowed as a nominal Rh_total_mass value (Round Robin data).
    dataset.convert_measured_to_nominal(
        which_column="Rh_total_mass",
        allowed_values=np.array([0.02]),
    )
    dataset.apply_duplicate_groupid(
        exclude_columns=['filename', 'experiment_date', 'location'],
        verbose=False
    )
    return dataset


def plot(
        exclude_keywords: list,
        prefix: str,
        average_same_location: bool,
        path=path_all, path_all=path_all,
        snr: bool = True,
        feature_impact: bool = True,
        colors: list = None
):
    """Run the variability analysis for one dataset and draw the requested figures.

    Two ``DataForML`` objects are built with the same pipeline: one from
    ``path`` (filtered by ``exclude_keywords``) and one from ``path_all``
    (no keyword filtering). Target values are assigned to both, and the
    second one is handed to ``calculate_statistics_duplicate_group`` as
    ``dataset_all``.

    Args:
        exclude_keywords: Keywords forwarded to ``filter_excel_files`` for the
            dataset read from ``path``.
        prefix: Forwarded to the plotting calls, which run with
            ``save_fig=True`` (presumably used to name the saved figures --
            confirm in ``DataAnalysis``).
        average_same_location: Forwarded to
            ``calculate_statistics_duplicate_group``.
        path: Folder of the dataset under study. The default is the value the
            module-level ``path_all`` had when this function was defined.
        path_all: Folder of the reference ('all data') dataset. Same note on
            the default as for ``path``.
        snr: If true, draw the heatmap via ``plot_heatmap`` with
            ``which_to_plot='snr'``.
        feature_impact: If true, call ``compare_targets_std_dev`` (the violin
            plot of feature impact described in this notebook).
        colors: Forwarded to ``compare_targets_std_dev``.

    Returns:
        None. The function is used for its plotting side effects.
    """
    # Dataset under study first, then the unfiltered reference dataset.
    dataset = _load_dataset(path, exclude_keywords)
    dataset_all = _load_dataset(path_all, [])

    # Calculate and add target values into the DataFrame
    savgol = False
    methods = [
        'AUC',
        'final value',
        'initial value',
        'final slope',
        'initial slope',
        'overall slope',
    ]
    # Properties to analyse; the commented entries are the other available choices.
    target_columns = [
        # 'CO2 Conversion (%)',
        # 'CH4 Net Production Rate (mol/molRh/s)',
        'CO Net Production Rate (mol/molRh/s)',
        # 'Selectivity to CO (%)'
    ]
    for column in target_columns:
        # Same settings for both datasets so that their targets are comparable.
        for data in (dataset, dataset_all):
            data.assign_target_values(
                savgol=savgol, methods=methods,
                column=column, temp_threshold=3.5, init_tos_buffer=0.5, adjacency_slope=1.0,
            )

    # Construct unique DataFrame using group IDs
    dataset.construct_unique_dataframe(verbose=False)
    dataset_all.construct_unique_dataframe(verbose=False)

    # Create an instance of DataAnalysis
    analysis = da.DataAnalysis(dataset=dataset)
    # Calculate statistics DataFrame on the basis of GroupID; it determines which data to use as entire dataset
    analysis.calculate_statistics_duplicate_group(
        dataset_all=dataset_all,
        total='duplicate',
        verbose=False,
        average_same_location=average_same_location
    )

    # Plot heatmap of SNR values/Standard Deviation
    if snr:
        analysis.plot_heatmap(
            methods=methods,  # to show the rows in a defined order
            properties=list(target_columns),  # same properties the targets were assigned for
            which_to_plot='snr',
            snr_type='mu_sigma',
            cmap='Reds',
            vmax=5.3,
            save_fig=True,
            prefix=prefix
        )

    if feature_impact:
        analysis.compare_targets_std_dev(
            target_wise=True,
            snr_type='mu_sigma',  # alternatives noted by the author: 'range', 'std_dev'
            plot_hist=True,
            save_fig=True,
            prefix=prefix,
            colors=colors
        )

* For the cells below: once the variability window pops up, click the panel corresponding to the target metric of interest to save the violin-plot figure.

In [6]:
%matplotlib qt

# Violin plot for temperature effect
colors = ['#69BADD', '#ED712E', '#721495'] # customized colors for colorblindness
plot(exclude_keywords_overall, 'temp', average_same_location=False, path=path_temp, snr=False, feature_impact=True, colors=colors)

14 excel files were found:
0 files were filtered out:
data indexed 0 is not nominal:  0.0204 -> 0.02
data indexed 2 is not nominal:  0.027000000000000003 -> 0.02
data indexed 5 is not nominal:  0.021 -> 0.02
data indexed 9 is not nominal:  0.01824 -> 0.02
data indexed 12 is not nominal:  0.0202 -> 0.02
32 excel files were found:
0 files were filtered out:
data indexed 0 is not nominal:  0.020099999999999996 -> 0.02
data indexed 1 is not nominal:  0.0204 -> 0.02
data indexed 4 is not nominal:  0.027000000000000003 -> 0.02
data indexed 5 is not nominal:  0.020099999999999996 -> 0.02
data indexed 9 is not nominal:  0.0199 -> 0.02
data indexed 10 is not nominal:  0.0197 -> 0.02
data indexed 11 is not nominal:  0.0204 -> 0.02
data indexed 13 is not nominal:  0.020099999999999996 -> 0.02
data indexed 14 is not nominal:  0.0207 -> 0.02
data indexed 15 is not nominal:  0.021 -> 0.02
data indexed 16 is not nominal:  0.0204 -> 0.02
data indexed 17 is not nominal:  0.020099999999999996 -> 0.02
da

In [7]:
%matplotlib qt

# Violin plot for Rh loading effect
colors = ['#C6BB68', '#2A60DD', '#ED712E']
plot(exclude_keywords_overall, 'Rh_loading', average_same_location=False, path=path_Rh_loading, snr=False, feature_impact=True, colors=colors)

18 excel files were found:
0 files were filtered out:
data indexed 2 is not nominal:  0.0204 -> 0.02
data indexed 4 is not nominal:  0.020099999999999996 -> 0.02
data indexed 5 is not nominal:  0.0207 -> 0.02
data indexed 9 is not nominal:  0.01824 -> 0.02
data indexed 15 is not nominal:  0.0202 -> 0.02
data indexed 17 is not nominal:  0.0199 -> 0.02
32 excel files were found:
0 files were filtered out:
data indexed 0 is not nominal:  0.020099999999999996 -> 0.02
data indexed 1 is not nominal:  0.0204 -> 0.02
data indexed 4 is not nominal:  0.027000000000000003 -> 0.02
data indexed 5 is not nominal:  0.020099999999999996 -> 0.02
data indexed 9 is not nominal:  0.0199 -> 0.02
data indexed 10 is not nominal:  0.0197 -> 0.02
data indexed 11 is not nominal:  0.0204 -> 0.02
data indexed 13 is not nominal:  0.020099999999999996 -> 0.02
data indexed 14 is not nominal:  0.0207 -> 0.02
data indexed 15 is not nominal:  0.021 -> 0.02
data indexed 16 is not nominal:  0.0204 -> 0.02
data indexed 17

In [8]:
%matplotlib qt

# Violin plot for synthesis method effect
colors = ['#BBBBBB', '#2A60DD']
plot(exclude_keywords_overall, 'synth', average_same_location=False, path=path_synth, snr=False, feature_impact=True, colors=colors)

12 excel files were found:
0 files were filtered out:
data indexed 0 is not nominal:  0.020099999999999996 -> 0.02
data indexed 1 is not nominal:  0.020099999999999996 -> 0.02
data indexed 2 is not nominal:  0.0199 -> 0.02
data indexed 3 is not nominal:  0.0197 -> 0.02
data indexed 4 is not nominal:  0.0204 -> 0.02
data indexed 6 is not nominal:  0.020099999999999996 -> 0.02
data indexed 7 is not nominal:  0.0207 -> 0.02
data indexed 8 is not nominal:  0.0204 -> 0.02
data indexed 9 is not nominal:  0.020099999999999996 -> 0.02
data indexed 11 is not nominal:  0.0199 -> 0.02
32 excel files were found:
0 files were filtered out:
data indexed 0 is not nominal:  0.020099999999999996 -> 0.02
data indexed 1 is not nominal:  0.0204 -> 0.02
data indexed 4 is not nominal:  0.027000000000000003 -> 0.02
data indexed 5 is not nominal:  0.020099999999999996 -> 0.02
data indexed 9 is not nominal:  0.0199 -> 0.02
data indexed 10 is not nominal:  0.0197 -> 0.02
data indexed 11 is not nominal:  0.0204 