In [1]:
import os
import pandas as pd
import os
import pandas as pd
import numpy as np
from scipy import stats
import json

# Set1 model large dataset RNA binders vs Protein binders

In [2]:
# df_gated = pd.read_csv('set1_ml_models/gatedgraphconv/test_results_GatedGraphConv.csv')
# df_gatv2 = pd.read_csv('set1_ml_models/gatv2conv/test_results_GATv2Conv.csv')
# df_sage = pd.read_csv('set1_ml_models/sageconv/test_results_SageConv.csv')


In [3]:
# Define the base directory where CSV files are located
base_dir = 'set1_ml_models'

# Models and the associated directories
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each model and their associated directory
for model, dir_name in models.items():
    # Construct the path to the CSV file
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Add a column for the model
        df['Model'] = model
        # Append the DataFrame to the list
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)
# rename col F1-score to F1 score
all_data.rename(columns={'F1-Score': 'F1 Score'}, inplace=True)
all_data.rename(columns={'ROC-AUC': 'ROC AUC'}, inplace=True)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,3,0.293396,0.873934,0.874418,0.873289,0.873853,0.873934,GatedGraphConv
1,7,0.328256,0.857401,0.865136,0.84681,0.855875,0.857401,GatedGraphConv
2,1,0.318745,0.860049,0.865191,0.85301,0.859057,0.860049,GatedGraphConv
3,5,0.291239,0.876195,0.854362,0.907001,0.879895,0.876195,GatedGraphConv
4,4,0.285744,0.878714,0.854449,0.912942,0.882728,0.878714,GatedGraphConv
5,9,0.283506,0.881361,0.887416,0.873547,0.880427,0.881361,GatedGraphConv
6,2,0.3084,0.872255,0.88581,0.854689,0.869971,0.872255,GatedGraphConv
7,10,0.30482,0.877874,0.889911,0.862439,0.875959,0.877874,GatedGraphConv
8,8,0.300888,0.866249,0.890834,0.834797,0.861906,0.866249,GatedGraphConv
9,6,0.271804,0.888724,0.884601,0.894084,0.889317,0.888724,GatedGraphConv


In [4]:
large_gnn = all_data.copy()

In [5]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

print(f"Set1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best GatedGraphConv with the highest accuracy is from run 6 with an accuracy of 0.8887238439679669.


In [6]:
# Construct the path to the JSON file where the model's parameters are saved
best_model_dir = os.path.join(base_dir, f'sageconv/sample_{best_model_run_id:03d}')
best_model_json_path = os.path.join('set1_ml_models/sageconv/sage_best_hyperparams_run_4.json')

# Load the JSON file
with open(best_model_json_path, 'r') as file:
    model_params = json.load(file)

# Print the model parameters
print(f"Set1: Best GNN model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best GNN model parameters:
hidden_dim: 254
aggregator_type: mean
dropout_rate: 0.035695885234456326
lr: 0.002028717973972058
batch_size: 128


In [7]:
# Models and metrics to analyze
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{RNA Binders vs Protein Binders (Merged Dataset)}\n"
    "\\label{tab:model_performance}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)


\begin{table}[ht]
\centering
\caption{RNA Binders vs Protein Binders (Merged Dataset)}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & \textbf{87.33 $\pm$ 0.97\%} & \textbf{87.52 $\pm$ 1.44\%} & 87.13 $\pm$ 2.62\% & \textbf{87.29 $\pm$ 1.10\%} & \textbf{87.33 $\pm$ 0.97\%} \\
GATv2Conv & 82.98 $\pm$ 1.88\% & 82.18 $\pm$ 2.72\% & 84.34 $\pm$ 1.29\% & 83.23 $\pm$ 1.65\% & 82.98 $\pm$ 1.88\% \\
SageConv & 86.53 $\pm$ 0.64\% & 85.96 $\pm$ 1.40\% & \textbf{87.37 $\pm$ 1.56\%} & 86.64 $\pm$ 0.62\% & 86.53 $\pm$ 0.64\% \\\bottomrule
\end{tabularx}
\end{table}



# Set2 model small datasets model1 RNA binders RNA nonbinders (ROBIN)

In [8]:
# Define the base directory where CSV files are located
base_dir = 'set2_ml_models/model1_rna_b_nb'

# Models and the associated directories
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each model and their associated directory
for model, dir_name in models.items():
    # Construct the path to the CSV file
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Add a column for the model
        df['Model'] = model
        # Append the DataFrame to the list
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)
# rename col F1-score to F1 score
all_data.rename(columns={'F1-Score': 'F1 Score'}, inplace=True)
all_data.rename(columns={'ROC-AUC': 'ROC AUC'}, inplace=True)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.652818,0.620382,0.648903,0.526718,0.581461,0.620502,GatedGraphConv
1,1,0.6384,0.629299,0.660377,0.534351,0.590717,0.62942,GatedGraphConv
2,2,0.64584,0.630573,0.626238,0.645408,0.635678,0.630592,GatedGraphConv
3,3,0.679409,0.564331,0.542088,0.821429,0.653144,0.564658,GatedGraphConv
4,4,0.660885,0.598726,0.584416,0.687023,0.631579,0.598613,GatedGraphConv
5,5,0.654,0.610191,0.593148,0.704835,0.644186,0.61007,GatedGraphConv
6,6,0.650367,0.596178,0.606232,0.545918,0.574497,0.596114,GatedGraphConv
7,7,0.646491,0.607643,0.603448,0.625,0.614035,0.607665,GatedGraphConv
8,8,0.65413,0.626752,0.660194,0.520408,0.582026,0.626616,GatedGraphConv
9,9,0.660795,0.614013,0.655052,0.479592,0.553756,0.613842,GatedGraphConv


In [9]:
small_2_1 = all_data.copy()

In [10]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

print(f"Set1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best SageConv with the highest accuracy is from run 2 with an accuracy of 0.6611464968152866.


In [11]:
best_model_dir = os.path.join(base_dir, f'sageconv/sample_{best_model_run_id:03d}')
best_model_json_path = os.path.join('set2_ml_models/model1_rna_b_nb/sageconv/sage_best_hyperparams_run_2.json')

# Load the JSON file
with open(best_model_json_path, 'r') as file:
    model_params = json.load(file)

# Print the model parameters
print(f"Set1: Best GNN model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best GNN model parameters:
hidden_dim: 160
aggregator_type: lstm
dropout_rate: 0.008264390145987899
lr: 0.0006753929052352515
batch_size: 256


In [12]:
# Models and metrics to analyze
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[H]\n"
    "\\centering\n"
    "\\caption{GNN models on RNA Binders vs RNA Non-Binders ($Set2\_Small$)}\n"
    "\\label{tab:model_performance2}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)


\begin{table}[H]
\centering
\caption{GNN models on RNA Binders vs RNA Non-Binders ($Set2\_Small$)}
\label{tab:model_performance2}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 60.98 $\pm$ 2.00\% & 61.80 $\pm$ 3.92\% & 60.91 $\pm$ 10.69\% & 60.61 $\pm$ 3.40\% & 60.98 $\pm$ 2.00\% \\
GATv2Conv & 60.13 $\pm$ 2.92\% & 60.88 $\pm$ 5.71\% & \textbf{61.62 $\pm$ 8.74\%} & 60.51 $\pm$ 2.55\% & 60.13 $\pm$ 2.91\% \\
SageConv & \textbf{61.78 $\pm$ 2.32\%} & \textbf{62.70 $\pm$ 3.38\%} & 59.40 $\pm$ 6.98\% & \textbf{60.69 $\pm$ 3.31\%} & \textbf{61.78 $\pm$ 2.32\%} \\\bottomrule
\end{tabularx}
\end{table}



# Set2 model small dataset model2 RNA binder (ROBIN) vs Protein Binder (Probes & Drugs)

In [13]:
# Define the base directory where CSV files are located
base_dir = 'set2_ml_models/model2_rna_b_prot_b'

# Models and the associated directories
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each model and their associated directory
for model, dir_name in models.items():
    # Construct the path to the CSV file
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Add a column for the model
        df['Model'] = model
        # Append the DataFrame to the list
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)
# rename col F1-score to F1 score
all_data.rename(columns={'F1-Score': 'F1 Score'}, inplace=True)
all_data.rename(columns={'ROC-AUC': 'ROC AUC'}, inplace=True)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.263752,0.907006,0.906091,0.908397,0.907243,0.907005,GatedGraphConv
1,1,0.277084,0.887898,0.880299,0.898219,0.889169,0.887885,GatedGraphConv
2,2,0.35352,0.848408,0.90991,0.772959,0.835862,0.848312,GatedGraphConv
3,3,0.357674,0.867516,0.911429,0.813776,0.859838,0.867448,GatedGraphConv
4,4,0.288136,0.900637,0.92,0.877863,0.898438,0.900666,GatedGraphConv
5,5,0.340909,0.872611,0.867168,0.880407,0.873737,0.872602,GatedGraphConv
6,6,0.441551,0.857325,0.880435,0.826531,0.852632,0.857286,GatedGraphConv
7,7,0.514383,0.727389,0.709906,0.767857,0.737745,0.72744,GatedGraphConv
8,8,0.323726,0.887898,0.870732,0.910714,0.890274,0.887927,GatedGraphConv
9,9,0.413499,0.821656,0.925676,0.69898,0.796512,0.8215,GatedGraphConv


In [14]:
small_2_2 = all_data.copy()

In [15]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

print(f"Set1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best GatedGraphConv with the highest accuracy is from run 10 with an accuracy of 0.9070063694267516.


In [16]:
best_model_dir = os.path.join(base_dir, f'sageconv/sample_{best_model_run_id:03d}')
best_model_json_path = os.path.join('set2_ml_models/model2_rna_b_prot_b/gatedgraphconv/best_hyperparams_GatedGraphConv_10.json')

# Load the JSON file
with open(best_model_json_path, 'r') as file:
    model_params = json.load(file)

# Print the model parameters
print(f"Set1: Best GNN model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best GNN model parameters:
n_steps: 2
hidden_dim: 187
lr: 0.0008763688326293078
batch_size: 128
dropout_rate: 0.36540507458496707


In [17]:
# Models and metrics to analyze
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[H]\n"
    "\\centering\n"
    "\\caption{GNN models on RNA Binders vs Protein Binders ($Set2\_Small$)}\n"
    "\\label{tab:model_performance1}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)


\begin{table}[H]
\centering
\caption{GNN models on RNA Binders vs Protein Binders ($Set2\_Small$)}
\label{tab:model_performance1}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 85.78 $\pm$ 5.25\% & \textbf{87.82 $\pm$ 6.27\%} & 83.56 $\pm$ 7.18\% & 85.41 $\pm$ 5.26\% & 85.78 $\pm$ 5.25\% \\
GATv2Conv & 83.96 $\pm$ 3.74\% & 83.20 $\pm$ 4.67\% & 85.35 $\pm$ 3.65\% & 84.20 $\pm$ 3.53\% & 83.96 $\pm$ 3.74\% \\
SageConv & \textbf{87.19 $\pm$ 1.60\%} & 87.70 $\pm$ 2.02\% & \textbf{86.61 $\pm$ 3.70\%} & \textbf{87.09 $\pm$ 1.82\%} & \textbf{87.19 $\pm$ 1.60\%} \\\bottomrule
\end{tabularx}
\end{table}



# Set2 model small dataset model3 RNA Binder (ROBIN) vs Non-Binders (Merged small)

In [18]:
# Define the base directory where CSV files are located
base_dir = 'set2_ml_models/model3_binder_nonbinder'

# Models and the associated directories
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each model and their associated directory
for model, dir_name in models.items():
    # Construct the path to the CSV file
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Add a column for the model
        df['Model'] = model
        # Append the DataFrame to the list
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)
# rename col F1-score to F1 score
all_data.rename(columns={'F1-Score': 'F1 Score'}, inplace=True)
all_data.rename(columns={'ROC-AUC': 'ROC AUC'}, inplace=True)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.491645,0.764331,0.724138,0.854962,0.784131,0.764216,GatedGraphConv
1,1,0.57201,0.722293,0.702079,0.773537,0.736077,0.722228,GatedGraphConv
2,2,0.534359,0.765605,0.76943,0.757653,0.763496,0.765595,GatedGraphConv
3,3,0.538557,0.745223,0.713333,0.818878,0.76247,0.745317,GatedGraphConv
4,4,0.659731,0.616561,0.606977,0.664122,0.634265,0.6165,GatedGraphConv
5,5,0.518339,0.749045,0.708511,0.847328,0.771727,0.748919,GatedGraphConv
6,6,0.700063,0.499363,0.499363,1.0,0.6661,0.5,GatedGraphConv
7,7,0.471273,0.747771,0.708155,0.841837,0.769231,0.74789,GatedGraphConv
8,8,0.502531,0.750318,0.702479,0.867347,0.776256,0.750467,GatedGraphConv
9,9,0.523087,0.756688,0.70303,0.887755,0.784667,0.756855,GatedGraphConv


In [19]:
small_2_3 = all_data.copy()


In [20]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

print(f"Set1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best SageConv with the highest accuracy is from run 10 with an accuracy of 0.7770700636942676.


In [21]:
best_model_dir = os.path.join(base_dir, f'sageconv/sample_{best_model_run_id:03d}')
best_model_json_path = os.path.join('set2_ml_models/model2_rna_b_prot_b/sageconv/sage_best_hyperparams_run_10.json')

# Load the JSON file
with open(best_model_json_path, 'r') as file:
    model_params = json.load(file)

# Print the model parameters
print(f"Set1: Best GNN model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best GNN model parameters:
hidden_dim: 124
aggregator_type: mean
dropout_rate: 0.07518822477964826
lr: 0.0013725395748444443
batch_size: 128


In [22]:
# Models and metrics to analyze
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[H]\n"
    "\\centering\n"
    "\\caption{GNN models on RNA-Binders vs Non-Binders ($Set2\_Small$)}\n"
    "\\label{tab:model_performance3}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)


\begin{table}[H]
\centering
\caption{GNN models on RNA-Binders vs Non-Binders ($Set2\_Small$)}
\label{tab:model_performance3}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 71.17 $\pm$ 8.64\% & 68.37 $\pm$ 7.60\% & \textbf{83.13 $\pm$ 8.87\%} & 74.48 $\pm$ 5.23\% & 71.18 $\pm$ 8.63\% \\
GATv2Conv & 72.15 $\pm$ 4.32\% & 69.85 $\pm$ 3.84\% & 78.03 $\pm$ 5.48\% & 73.66 $\pm$ 4.17\% & 72.15 $\pm$ 4.32\% \\
SageConv & \textbf{74.68 $\pm$ 1.20\%} & \textbf{71.98 $\pm$ 1.21\%} & 80.97 $\pm$ 5.01\% & \textbf{76.12 $\pm$ 1.86\%} & \textbf{74.68 $\pm$ 1.20\%} \\\bottomrule
\end{tabularx}
\end{table}



# All results

In [23]:
large_gnn


Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,3,0.293396,0.873934,0.874418,0.873289,0.873853,0.873934,GatedGraphConv
1,7,0.328256,0.857401,0.865136,0.84681,0.855875,0.857401,GatedGraphConv
2,1,0.318745,0.860049,0.865191,0.85301,0.859057,0.860049,GatedGraphConv
3,5,0.291239,0.876195,0.854362,0.907001,0.879895,0.876195,GatedGraphConv
4,4,0.285744,0.878714,0.854449,0.912942,0.882728,0.878714,GatedGraphConv
5,9,0.283506,0.881361,0.887416,0.873547,0.880427,0.881361,GatedGraphConv
6,2,0.3084,0.872255,0.88581,0.854689,0.869971,0.872255,GatedGraphConv
7,10,0.30482,0.877874,0.889911,0.862439,0.875959,0.877874,GatedGraphConv
8,8,0.300888,0.866249,0.890834,0.834797,0.861906,0.866249,GatedGraphConv
9,6,0.271804,0.888724,0.884601,0.894084,0.889317,0.888724,GatedGraphConv


In [24]:
small_2_1


Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.652818,0.620382,0.648903,0.526718,0.581461,0.620502,GatedGraphConv
1,1,0.6384,0.629299,0.660377,0.534351,0.590717,0.62942,GatedGraphConv
2,2,0.64584,0.630573,0.626238,0.645408,0.635678,0.630592,GatedGraphConv
3,3,0.679409,0.564331,0.542088,0.821429,0.653144,0.564658,GatedGraphConv
4,4,0.660885,0.598726,0.584416,0.687023,0.631579,0.598613,GatedGraphConv
5,5,0.654,0.610191,0.593148,0.704835,0.644186,0.61007,GatedGraphConv
6,6,0.650367,0.596178,0.606232,0.545918,0.574497,0.596114,GatedGraphConv
7,7,0.646491,0.607643,0.603448,0.625,0.614035,0.607665,GatedGraphConv
8,8,0.65413,0.626752,0.660194,0.520408,0.582026,0.626616,GatedGraphConv
9,9,0.660795,0.614013,0.655052,0.479592,0.553756,0.613842,GatedGraphConv


In [25]:
small_2_2


Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.263752,0.907006,0.906091,0.908397,0.907243,0.907005,GatedGraphConv
1,1,0.277084,0.887898,0.880299,0.898219,0.889169,0.887885,GatedGraphConv
2,2,0.35352,0.848408,0.90991,0.772959,0.835862,0.848312,GatedGraphConv
3,3,0.357674,0.867516,0.911429,0.813776,0.859838,0.867448,GatedGraphConv
4,4,0.288136,0.900637,0.92,0.877863,0.898438,0.900666,GatedGraphConv
5,5,0.340909,0.872611,0.867168,0.880407,0.873737,0.872602,GatedGraphConv
6,6,0.441551,0.857325,0.880435,0.826531,0.852632,0.857286,GatedGraphConv
7,7,0.514383,0.727389,0.709906,0.767857,0.737745,0.72744,GatedGraphConv
8,8,0.323726,0.887898,0.870732,0.910714,0.890274,0.887927,GatedGraphConv
9,9,0.413499,0.821656,0.925676,0.69898,0.796512,0.8215,GatedGraphConv


In [26]:
small_2_3


Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.491645,0.764331,0.724138,0.854962,0.784131,0.764216,GatedGraphConv
1,1,0.57201,0.722293,0.702079,0.773537,0.736077,0.722228,GatedGraphConv
2,2,0.534359,0.765605,0.76943,0.757653,0.763496,0.765595,GatedGraphConv
3,3,0.538557,0.745223,0.713333,0.818878,0.76247,0.745317,GatedGraphConv
4,4,0.659731,0.616561,0.606977,0.664122,0.634265,0.6165,GatedGraphConv
5,5,0.518339,0.749045,0.708511,0.847328,0.771727,0.748919,GatedGraphConv
6,6,0.700063,0.499363,0.499363,1.0,0.6661,0.5,GatedGraphConv
7,7,0.471273,0.747771,0.708155,0.841837,0.769231,0.74789,GatedGraphConv
8,8,0.502531,0.750318,0.702479,0.867347,0.776256,0.750467,GatedGraphConv
9,9,0.523087,0.756688,0.70303,0.887755,0.784667,0.756855,GatedGraphConv


In [None]:
# you do not have to extract the data, create one data structure where you will keep all the results and upon all the results you will perform the analysis, here are the provided results divided to ensamble and GNN results: name of the tables you can create how do you want, and all these provided tables are pandas dataframes:

# ENSAMBLE:
# large_en
# Model	Accuracy	Precision	Recall	F1 Score	ROC AUC	run_id
# 0	LightGBM	0.886657	0.891768	0.880134	0.885913	0.952861	1
# 1	XGBoost	0.885688	0.893205	0.876130	0.884585	0.954190	1
# 2	Random Forest	0.868316	0.888859	0.841901	0.864743	0.942123	1
# 3	XGBoost	0.888659	0.894158	0.881684	0.887877	0.955541	2
# 4	LightGBM	0.884784	0.889208	0.879101	0.884126	0.953526	2
# 5	Random Forest	0.869478	0.887130	0.846680	0.866433	0.942562	2
# 6	XGBoost	0.882201	0.889035	0.873418	0.881157	0.950897	3
# 7	LightGBM	0.879036	0.889967	0.865022	0.877317	0.947109	3
# 8	Random Forest	0.861405	0.883078	0.833118	0.857371	0.938457	3
# 9	LightGBM	0.887755	0.892418	0.881813	0.887084	0.955327	4
# 10	XGBoost	0.884461	0.894918	0.871222	0.882911	0.953811	4
# 11	Random Forest	0.863730	0.874767	0.849005	0.861694	0.939616	4
# 12	LightGBM	0.886593	0.890731	0.881297	0.885989	0.955495	5
# 13	XGBoost	0.885688	0.888398	0.882201	0.885288	0.955937	5
# 14	Random Forest	0.866507	0.873896	0.856626	0.865175	0.941865	5
# 15	LightGBM	0.883751	0.885444	0.881555	0.883495	0.954107	6
# 16	XGBoost	0.883234	0.887337	0.877939	0.882613	0.951257	6
# 17	Random Forest	0.866249	0.882917	0.844485	0.863273	0.941159	6
# 18	XGBoost	0.883428	0.889209	0.876001	0.882556	0.953301	7
# 19	LightGBM	0.880199	0.882521	0.877164	0.879834	0.949809	7
# 20	Random Forest	0.863730	0.883442	0.838026	0.860135	0.940244	7
# 21	XGBoost	0.890468	0.898813	0.880005	0.889309	0.956515	8
# 22	LightGBM	0.881943	0.883329	0.880134	0.881729	0.951932	8
# 23	Random Forest	0.868897	0.875987	0.859468	0.867649	0.945801	8
# 24	XGBoost	0.883945	0.887397	0.879489	0.883425	0.955280	9
# 25	LightGBM	0.879682	0.887133	0.870059	0.878513	0.952499	9
# 26	Random Forest	0.866314	0.884178	0.843064	0.863131	0.941605	9
# 27	LightGBM	0.886076	0.888687	0.882718	0.885692	0.953686	10
# 28	XGBoost	0.878972	0.888815	0.866314	0.877420	0.951393	10
# 29	Random Forest	0.866184	0.878000	0.850555	0.864060	0.941800	10

# small_en_2_1
# Model	Accuracy	Precision	Recall	F1 Score	ROC AUC	run_id
# 0	Random Forest	0.626752	0.638655	0.581633	0.608812	0.669795	1
# 1	LightGBM	0.597452	0.603825	0.563776	0.583113	0.630001	1
# 2	XGBoost	0.589809	0.598315	0.543367	0.569519	0.650556	1
# 3	LightGBM	0.617834	0.628492	0.573980	0.600000	0.657728	2
# 4	Random Forest	0.615287	0.627119	0.566327	0.595174	0.665959	2
# 5	XGBoost	0.606369	0.612466	0.576531	0.593955	0.650075	2
# 6	Random Forest	0.625478	0.640000	0.571429	0.603774	0.673236	3
# 7	XGBoost	0.614013	0.624650	0.568878	0.595461	0.650945	3
# 8	LightGBM	0.592357	0.601695	0.543367	0.571046	0.624721	3
# 9	XGBoost	0.611465	0.620499	0.571429	0.594954	0.663850	4
# 10	Random Forest	0.608917	0.616438	0.573980	0.594452	0.668108	4
# 11	LightGBM	0.602548	0.608108	0.573980	0.590551	0.630099	4
# 12	Random Forest	0.620382	0.632768	0.571429	0.600536	0.669977	5
# 13	XGBoost	0.616561	0.625344	0.579082	0.601325	0.656287	5
# 14	LightGBM	0.584713	0.590164	0.551020	0.569921	0.645032	5
# 15	Random Forest	0.621656	0.636888	0.563776	0.598106	0.668296	6
# 16	XGBoost	0.615287	0.628571	0.561224	0.592992	0.656495	6
# 17	LightGBM	0.579618	0.583784	0.551020	0.566929	0.628557	6
# 18	Random Forest	0.617834	0.636905	0.545918	0.587912	0.671217	7
# 19	LightGBM	0.608917	0.620397	0.558673	0.587919	0.654684	7
# 20	XGBoost	0.597452	0.604396	0.561224	0.582011	0.642857	7
# 21	Random Forest	0.619108	0.632479	0.566327	0.597577	0.669672	8
# 22	XGBoost	0.602548	0.610497	0.563776	0.586207	0.663655	8
# 23	LightGBM	0.585987	0.591781	0.551020	0.570674	0.629914	8
# 24	Random Forest	0.622930	0.635593	0.573980	0.603217	0.669094	9
# 25	LightGBM	0.614013	0.623955	0.571429	0.596538	0.657485	9
# 26	XGBoost	0.612739	0.622905	0.568878	0.594667	0.661967	9
# 27	Random Forest	0.611465	0.627566	0.545918	0.583902	0.664278	10
# 28	LightGBM	0.600000	0.607735	0.561224	0.583554	0.646447	10
# 29	XGBoost	0.597452	0.605556	0.556122	0.579787	0.652613	10

# small_en_2_2
# Model	Accuracy	Precision	Recall	F1 Score	ROC AUC	run_id
# 0	LightGBM	0.884076	0.886889	0.880102	0.883483	0.949823	1
# 1	XGBoost	0.864968	0.874346	0.852041	0.863049	0.941372	1
# 2	Random Forest	0.859873	0.854271	0.867347	0.860759	0.949875	1
# 3	LightGBM	0.878981	0.887728	0.867347	0.877419	0.946143	2
# 4	XGBoost	0.870064	0.879581	0.857143	0.868217	0.944955	2
# 5	Random Forest	0.864968	0.857500	0.875000	0.866162	0.950005	2
# 6	LightGBM	0.878981	0.891821	0.862245	0.876783	0.943624	3
# 7	Random Forest	0.870064	0.862500	0.880102	0.871212	0.946523	3
# 8	XGBoost	0.867516	0.878947	0.852041	0.865285	0.942346	3
# 9	XGBoost	0.871338	0.879896	0.859694	0.869677	0.944579	4
# 10	Random Forest	0.868790	0.863980	0.875000	0.869455	0.946724	4
# 11	LightGBM	0.862420	0.867876	0.854592	0.861183	0.943521	4
# 12	LightGBM	0.877707	0.885417	0.867347	0.876289	0.944176	5
# 13	XGBoost	0.868790	0.883289	0.849490	0.866060	0.939516	5
# 14	Random Forest	0.859873	0.857868	0.862245	0.860051	0.948473	5
# 15	XGBoost	0.876433	0.881137	0.869898	0.875481	0.946980	6
# 16	LightGBM	0.873885	0.886544	0.857143	0.871595	0.946364	6
# 17	Random Forest	0.864968	0.861111	0.869898	0.865482	0.948369	6
# 18	LightGBM	0.875159	0.886842	0.859694	0.873057	0.942592	7
# 19	XGBoost	0.872611	0.888298	0.852041	0.869792	0.945766	7
# 20	Random Forest	0.863694	0.857143	0.872449	0.864728	0.947000	7
# 21	LightGBM	0.878981	0.893899	0.859694	0.876463	0.939860	8
# 22	XGBoost	0.875159	0.884817	0.862245	0.873385	0.946481	8
# 23	Random Forest	0.854777	0.845771	0.867347	0.856423	0.945721	8
# 24	XGBoost	0.877707	0.891534	0.859694	0.875325	0.945591	9
# 25	LightGBM	0.876433	0.889182	0.859694	0.874189	0.946773	9
# 26	Random Forest	0.864968	0.859296	0.872449	0.865823	0.945656	9
# 27	LightGBM	0.882803	0.888601	0.875000	0.881748	0.944053	10
# 28	XGBoost	0.867516	0.880952	0.849490	0.864935	0.943274	10
# 29	Random Forest	0.862420	0.858586	0.867347	0.862944	0.946266	10

# small_en_2_3
# Model	Accuracy	Precision	Recall	F1 Score	ROC AUC	run_id
# 0	XGBoost	0.724841	0.723350	0.727041	0.725191	0.803260	1
# 1	LightGBM	0.722293	0.710145	0.750000	0.729529	0.791913	1
# 2	Random Forest	0.721019	0.724675	0.711735	0.718147	0.800254	1
# 3	Random Forest	0.749045	0.738386	0.770408	0.754057	0.801705	2
# 4	XGBoost	0.724841	0.715686	0.744898	0.730000	0.795964	2
# 5	LightGBM	0.718471	0.718670	0.716837	0.717752	0.803364	2
# 6	Random Forest	0.732484	0.727500	0.742347	0.734848	0.798697	3
# 7	XGBoost	0.719745	0.711823	0.737245	0.724311	0.797424	3
# 8	LightGBM	0.717197	0.709360	0.734694	0.721805	0.792056	3
# 9	Random Forest	0.742675	0.736318	0.755102	0.745592	0.803013	4
# 10	XGBoost	0.722293	0.712195	0.744898	0.728180	0.799787	4
# 11	LightGBM	0.715924	0.704600	0.742347	0.722981	0.784468	4
# 12	Random Forest	0.741401	0.732187	0.760204	0.745932	0.798093	5
# 13	XGBoost	0.726115	0.713253	0.755102	0.733581	0.799469	5
# 14	LightGBM	0.710828	0.696897	0.744898	0.720099	0.787155	5
# 15	Random Forest	0.742675	0.736318	0.755102	0.745592	0.802163	6
# 16	XGBoost	0.737580	0.730198	0.752551	0.741206	0.803928	6
# 17	LightGBM	0.700637	0.686461	0.737245	0.710947	0.776594	6
# 18	Random Forest	0.741401	0.736842	0.750000	0.743363	0.801598	7
# 19	XGBoost	0.736306	0.728395	0.752551	0.740276	0.803461	7
# 20	LightGBM	0.717197	0.713568	0.724490	0.718987	0.799693	7
# 21	Random Forest	0.724841	0.725641	0.721939	0.723785	0.799508	8
# 22	XGBoost	0.721019	0.711491	0.742347	0.726592	0.802078	8
# 23	LightGBM	0.719745	0.710784	0.739796	0.725000	0.802767	8
# 24	LightGBM	0.733758	0.717340	0.770408	0.742927	0.794490	9
# 25	Random Forest	0.729936	0.728426	0.732143	0.730280	0.795483	9
# 26	XGBoost	0.723567	0.712895	0.747449	0.729763	0.797905	9
# 27	Random Forest	0.733758	0.736434	0.727041	0.731707	0.795289	10
# 28	XGBoost	0.728662	0.716707	0.755102	0.735404	0.801371	10
# 29	LightGBM	0.726115	0.712230	0.757653	0.734240	0.792238	10

# GNN:
# large_gnn
# 	Run ID	Test Loss	Accuracy	Precision	Recall	F1 Score	ROC AUC	Model
# 0	3	0.293396	0.873934	0.874418	0.873289	0.873853	0.873934	GatedGraphConv
# 1	7	0.328256	0.857401	0.865136	0.846810	0.855875	0.857401	GatedGraphConv
# 2	1	0.318745	0.860049	0.865191	0.853010	0.859057	0.860049	GatedGraphConv
# 3	5	0.291239	0.876195	0.854362	0.907001	0.879895	0.876195	GatedGraphConv
# 4	4	0.285744	0.878714	0.854449	0.912942	0.882728	0.878714	GatedGraphConv
# 5	9	0.283506	0.881361	0.887416	0.873547	0.880427	0.881361	GatedGraphConv
# 6	2	0.308400	0.872255	0.885810	0.854689	0.869971	0.872255	GatedGraphConv
# 7	10	0.304820	0.877874	0.889911	0.862439	0.875959	0.877874	GatedGraphConv
# 8	8	0.300888	0.866249	0.890834	0.834797	0.861906	0.866249	GatedGraphConv
# 9	6	0.271804	0.888724	0.884601	0.894084	0.889317	0.888724	GatedGraphConv
# 10	10	0.417017	0.804185	0.776994	0.853268	0.813346	0.804185	GATv2Conv
# 11	1	0.341633	0.847778	0.841212	0.857401	0.849229	0.847778	GATv2Conv
# 12	2	0.334040	0.852493	0.849692	0.856497	0.853081	0.852493	GATv2Conv
# 13	3	0.382486	0.823172	0.802320	0.857660	0.829067	0.823172	GATv2Conv
# 14	4	0.347145	0.841514	0.845260	0.836089	0.840649	0.841514	GATv2Conv
# 15	5	0.386508	0.822333	0.815527	0.833118	0.824228	0.822333	GATv2Conv
# 16	6	0.423961	0.798889	0.786457	0.820589	0.803161	0.798889	GATv2Conv
# 17	7	0.377356	0.825304	0.815246	0.841255	0.828047	0.825304	GATv2Conv
# 18	8	0.323404	0.851718	0.855669	0.846164	0.850890	0.851718	GATv2Conv
# 19	9	0.366551	0.830858	0.830048	0.832085	0.831065	0.830858	GATv2Conv
# 20	10	0.307996	0.862762	0.846771	0.885818	0.865854	0.862762	SageConv
# 21	1	0.315688	0.865345	0.846842	0.892018	0.868843	0.865345	SageConv
# 22	2	0.320625	0.870447	0.880270	0.857530	0.868752	0.870447	SageConv
# 23	4	0.327156	0.859145	0.874579	0.838543	0.856182	0.859145	SageConv
# 24	4	0.287643	0.872901	0.872420	0.873547	0.872983	0.872901	SageConv
# 25	5	0.331425	0.853332	0.837591	0.876647	0.856674	0.853332	SageConv
# 26	6	0.301856	0.868639	0.856038	0.886334	0.870923	0.868639	SageConv
# 27	7	0.304149	0.872061	0.867270	0.878584	0.872891	0.872061	SageConv
# 28	8	0.309401	0.868251	0.862538	0.876130	0.869281	0.868251	SageConv
# 29	10	0.321687	0.859855	0.851679	0.871480	0.861466	0.859855	SageConv

# small_2_1
# Run ID	Test Loss	Accuracy	Precision	Recall	F1 Score	ROC AUC	Model
# 0	10	0.652818	0.620382	0.648903	0.526718	0.581461	0.620502	GatedGraphConv
# 1	1	0.638400	0.629299	0.660377	0.534351	0.590717	0.629420	GatedGraphConv
# 2	2	0.645840	0.630573	0.626238	0.645408	0.635678	0.630592	GatedGraphConv
# 3	3	0.679409	0.564331	0.542088	0.821429	0.653144	0.564658	GatedGraphConv
# 4	4	0.660885	0.598726	0.584416	0.687023	0.631579	0.598613	GatedGraphConv
# 5	5	0.654000	0.610191	0.593148	0.704835	0.644186	0.610070	GatedGraphConv
# 6	6	0.650367	0.596178	0.606232	0.545918	0.574497	0.596114	GatedGraphConv
# 7	7	0.646491	0.607643	0.603448	0.625000	0.614035	0.607665	GatedGraphConv
# 8	8	0.654130	0.626752	0.660194	0.520408	0.582026	0.626616	GatedGraphConv
# 9	9	0.660795	0.614013	0.655052	0.479592	0.553756	0.613842	GatedGraphConv
# 10	10	0.697589	0.574522	0.562633	0.674300	0.613426	0.574395	GATv2Conv
# 11	1	0.632906	0.652229	0.681818	0.572519	0.622407	0.652331	GATv2Conv
# 12	2	0.626302	0.640764	0.727273	0.448980	0.555205	0.640520	GATv2Conv
# 13	3	0.668492	0.601274	0.593381	0.640306	0.615951	0.601324	GATv2Conv
# 14	4	0.667448	0.591083	0.586538	0.620865	0.603214	0.591045	GATv2Conv
# 15	5	0.656072	0.601274	0.623457	0.513995	0.563459	0.601385	GATv2Conv
# 16	6	0.660970	0.603822	0.602015	0.609694	0.605830	0.603829	GATv2Conv
# 17	7	0.673880	0.554140	0.539179	0.737245	0.622845	0.554373	GATv2Conv
# 18	8	0.655826	0.611465	0.606880	0.630102	0.618273	0.611489	GATv2Conv
# 19	9	0.666808	0.582166	0.564516	0.714286	0.630631	0.582334	GATv2Conv
# 20	10	0.669233	0.589809	0.583529	0.631043	0.606357	0.589756	SageConv
# 21	1	0.646270	0.630573	0.638814	0.603053	0.620419	0.630608	SageConv
# 22	2	0.598308	0.661146	0.694444	0.573980	0.628492	0.661036	SageConv
# 23	3	0.649718	0.622930	0.614833	0.655612	0.634568	0.622972	SageConv
# 24	4	0.661069	0.592357	0.576520	0.699746	0.632184	0.592220	SageConv
# 25	5	0.626888	0.647134	0.644279	0.659033	0.651572	0.647119	SageConv
# 26	6	0.630458	0.603822	0.635452	0.484694	0.549928	0.603670	SageConv
# 27	7	0.649932	0.610191	0.641447	0.497449	0.560345	0.610048	SageConv
# 28	8	0.700046	0.617834	0.632948	0.558673	0.593496	0.617759	SageConv
# 29	9	0.659240	0.602548	0.607527	0.576531	0.591623	0.602515	SageConv

# small_2_2
# Run ID	Test Loss	Accuracy	Precision	Recall	F1 Score	ROC AUC	Model
# 0	10	0.263752	0.907006	0.906091	0.908397	0.907243	0.907005	GatedGraphConv
# 1	1	0.277084	0.887898	0.880299	0.898219	0.889169	0.887885	GatedGraphConv
# 2	2	0.353520	0.848408	0.909910	0.772959	0.835862	0.848312	GatedGraphConv
# 3	3	0.357674	0.867516	0.911429	0.813776	0.859838	0.867448	GatedGraphConv
# 4	4	0.288136	0.900637	0.920000	0.877863	0.898438	0.900666	GatedGraphConv
# 5	5	0.340909	0.872611	0.867168	0.880407	0.873737	0.872602	GatedGraphConv
# 6	6	0.441551	0.857325	0.880435	0.826531	0.852632	0.857286	GatedGraphConv
# 7	7	0.514383	0.727389	0.709906	0.767857	0.737745	0.727440	GatedGraphConv
# 8	8	0.323726	0.887898	0.870732	0.910714	0.890274	0.887927	GatedGraphConv
# 9	9	0.413499	0.821656	0.925676	0.698980	0.796512	0.821500	GatedGraphConv
# 10	10	0.332486	0.864968	0.863291	0.867684	0.865482	0.864965	GATv2Conv
# 11	1	0.298247	0.877707	0.889764	0.862595	0.875969	0.877726	GATv2Conv
# 12	2	0.331508	0.863694	0.857143	0.872449	0.864728	0.863705	GATv2Conv
# 13	3	0.360170	0.850955	0.831325	0.880102	0.855019	0.850992	GATv2Conv
# 14	4	0.386297	0.830573	0.787611	0.905852	0.842604	0.830477	GATv2Conv
# 15	5	0.456259	0.780892	0.763723	0.814249	0.788177	0.780849	GATv2Conv
# 16	6	0.367779	0.858599	0.878706	0.831633	0.854522	0.858564	GATv2Conv
# 17	7	0.336335	0.863694	0.855362	0.875000	0.865069	0.863709	GATv2Conv
# 18	8	0.459974	0.766879	0.759305	0.780612	0.769811	0.766896	GATv2Conv
# 19	9	0.402424	0.838217	0.833753	0.844388	0.839037	0.838224	GATv2Conv
# 20	10	0.288921	0.866242	0.839623	0.905852	0.871481	0.866192	SageConv
# 21	1	0.275781	0.880255	0.882353	0.877863	0.880102	0.880258	SageConv
# 22	2	0.323228	0.880255	0.896277	0.859694	0.877604	0.880229	SageConv
# 23	3	0.333340	0.889172	0.867788	0.918575	0.892460	0.889134	SageConv
# 24	5	0.312523	0.866242	0.851220	0.888041	0.869240	0.866214	SageConv
# 25	4	0.394317	0.842038	0.868132	0.806122	0.835979	0.841993	SageConv
# 26	6	0.361369	0.877707	0.900000	0.849490	0.874016	0.877671	SageConv
# 27	7	0.341622	0.878981	0.896000	0.857143	0.876141	0.878953	SageConv
# 28	4	0.383598	0.845860	0.879552	0.801020	0.838451	0.845803	SageConv
# 29	8	0.303424	0.891720	0.898701	0.882653	0.890605	0.891708	SageConv
# 30	9	0.328705	0.872611	0.866834	0.880102	0.873418	0.872621	SageConv

# small_2_3
# 	Run ID	Test Loss	Accuracy	Precision	Recall	F1 Score	ROC AUC	Model
# 0	10	0.491645	0.764331	0.724138	0.854962	0.784131	0.764216	GatedGraphConv
# 1	1	0.572010	0.722293	0.702079	0.773537	0.736077	0.722228	GatedGraphConv
# 2	2	0.534359	0.765605	0.769430	0.757653	0.763496	0.765595	GatedGraphConv
# 3	3	0.538557	0.745223	0.713333	0.818878	0.762470	0.745317	GatedGraphConv
# 4	4	0.659731	0.616561	0.606977	0.664122	0.634265	0.616500	GatedGraphConv
# 5	5	0.518339	0.749045	0.708511	0.847328	0.771727	0.748919	GatedGraphConv
# 6	6	0.700063	0.499363	0.499363	1.000000	0.666100	0.500000	GatedGraphConv
# 7	7	0.471273	0.747771	0.708155	0.841837	0.769231	0.747890	GatedGraphConv
# 8	8	0.502531	0.750318	0.702479	0.867347	0.776256	0.750467	GatedGraphConv
# 9	9	0.523087	0.756688	0.703030	0.887755	0.784667	0.756855	GatedGraphConv
# 10	10	0.505683	0.743949	0.727488	0.781170	0.753374	0.743902	GATv2Conv
# 11	1	0.570026	0.735032	0.709751	0.796438	0.750600	0.734954	GATv2Conv
# 12	2	0.555538	0.756688	0.736471	0.798469	0.766218	0.756741	GATv2Conv
# 13	3	0.510488	0.761783	0.721382	0.852041	0.781287	0.761898	GATv2Conv
# 14	4	0.586823	0.731210	0.696121	0.821883	0.753792	0.731095	GATv2Conv
# 15	5	0.547950	0.737580	0.710112	0.804071	0.754177	0.737495	GATv2Conv
# 16	6	0.566853	0.703185	0.699248	0.711735	0.705436	0.703196	GATv2Conv
# 17	7	0.638325	0.622930	0.603004	0.716837	0.655012	0.623049	GATv2Conv
# 18	8	0.643817	0.675159	0.669136	0.691327	0.680050	0.675180	GATv2Conv
# 19	9	0.516459	0.747771	0.712719	0.829082	0.766509	0.747874	GATv2Conv
# 20	10	0.521268	0.777070	0.720648	0.905852	0.802706	0.776906	SageConv
# 21	1	0.544140	0.738854	0.732673	0.753181	0.742785	0.738835	SageConv
# 22	2	0.573661	0.750318	0.718750	0.821429	0.766667	0.750409	SageConv
# 23	3	0.519175	0.749045	0.705263	0.854592	0.772780	0.749179	SageConv
# 24	4	0.581503	0.743949	0.710526	0.824427	0.763251	0.743846	SageConv
# 25	5	0.538134	0.735032	0.703297	0.814249	0.754717	0.734931	SageConv
# 26	6	0.504349	0.746497	0.718821	0.808673	0.761104	0.746576	SageConv
# 27	7	0.443641	0.749045	0.731591	0.785714	0.757688	0.749091	SageConv
# 28	8	0.519326	0.742675	0.715909	0.803571	0.757212	0.742753	SageConv
# 29	9	0.515803	0.735032	0.740260	0.725191	0.732648	0.735044	SageConv

In [28]:
import pandas as pd
from scipy.stats import shapiro, levene, f_oneway, kruskal
import statsmodels.stats.multicomp as mc

# Sample data for example
# Replace this with the actual data
# Ensemble Models Data

gnn_data = {
    'Model': [],
    'Accuracy': []
}


# Add GNN model accuracies
for model, df in {'large_gnn': large_gnn, 'small_2_1': small_2_1, 'small_2_2': small_2_2, 'small_2_3': small_2_3}.items():
    for index, row in df.iterrows():
        gnn_data['Model'].append(row['Model'])
        gnn_data['Accuracy'].append(row['Accuracy'])

df_gnn = pd.DataFrame(gnn_data)
df_gnn['Type'] = 'GNN'

In [29]:
df_gnn

Unnamed: 0,Model,Accuracy,Type
0,GatedGraphConv,0.873934,GNN
1,GatedGraphConv,0.857401,GNN
2,GatedGraphConv,0.860049,GNN
3,GatedGraphConv,0.876195,GNN
4,GatedGraphConv,0.878714,GNN
...,...,...,...
116,SageConv,0.735032,GNN
117,SageConv,0.746497,GNN
118,SageConv,0.749045,GNN
119,SageConv,0.742675,GNN


In [30]:
# save as csv
df_gnn.to_csv('gnn_data.csv', index=False)
