In [1]:
import os
import pandas as pd
import os
import pandas as pd
import numpy as np
from scipy import stats


# Set1 model large dataset RNA binders vs Protein binders

In [2]:
# Superseded: the loop in the next cell builds these same three paths
# (base directory / model sub-directory / test_results_<Model>.csv) and loads them.
# Kept for reference only.
# df_gated = pd.read_csv('set1_ml_models/gatedgraphconv/test_results_GatedGraphConv.csv')
# df_gatv2 = pd.read_csv('set1_ml_models/gatv2conv/test_results_GATv2Conv.csv')
# df_sage = pd.read_csv('set1_ml_models/sageconv/test_results_SageConv.csv')


In [3]:
# Load the per-run test metrics of every Set1 model into one DataFrame.
# Expected layout: <base_dir>/<model sub-directory>/test_results_<Model>.csv
base_dir = 'set1_ml_models'

# Model name (as it appears in the CSV file name) -> sub-directory with its results
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

dataframes = []     # one DataFrame per model whose CSV was found
missing_paths = []  # CSVs that were not found; reported if nothing could be loaded

for model, dir_name in models.items():
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    if not os.path.exists(csv_path):
        # A missing model is reported but does not abort: the other models still load.
        # NOTE(review): every downstream table then silently omits this model.
        print(f"File not found: {csv_path}")
        missing_paths.append(csv_path)
        continue
    df = pd.read_csv(csv_path)
    df['Model'] = model  # tag every run with the model that produced it
    dataframes.append(df)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the paths that were tried.
if not dataframes:
    raise FileNotFoundError(
        f"No test result CSVs found under '{base_dir}'. Looked for: {missing_paths}"
    )

# Stack all runs and normalise the hyphenated metric names used in the CSVs
all_data = pd.concat(dataframes, ignore_index=True).rename(
    columns={'F1-Score': 'F1 Score', 'ROC-AUC': 'ROC AUC'}
)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,3,0.293396,0.873934,0.874418,0.873289,0.873853,0.873934,GatedGraphConv
1,7,0.328256,0.857401,0.865136,0.84681,0.855875,0.857401,GatedGraphConv
2,1,0.318745,0.860049,0.865191,0.85301,0.859057,0.860049,GatedGraphConv
3,5,0.291239,0.876195,0.854362,0.907001,0.879895,0.876195,GatedGraphConv
4,4,0.285744,0.878714,0.854449,0.912942,0.882728,0.878714,GatedGraphConv
5,9,0.283506,0.881361,0.887416,0.873547,0.880427,0.881361,GatedGraphConv
6,2,0.3084,0.872255,0.88581,0.854689,0.869971,0.872255,GatedGraphConv
7,10,0.30482,0.877874,0.889911,0.862439,0.875959,0.877874,GatedGraphConv
8,8,0.300888,0.866249,0.890834,0.834797,0.861906,0.866249,GatedGraphConv
9,6,0.271804,0.888724,0.884601,0.894084,0.889317,0.888724,GatedGraphConv


In [4]:
# Pick the single best run across all loaded models, ranked by the 'Accuracy' column
best_idx = all_data['Accuracy'].idxmax()
best_model_row = all_data.loc[best_idx]

# Pull out the three fields reported in the summary line
best_accuracy, best_model_run_id, model = (
    best_model_row[field] for field in ('Accuracy', 'Run ID', 'Model')
)

summary = f"Set1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}."
print(summary)


Set1: Best GatedGraphConv with the highest accuracy is from run 6 with an accuracy of 0.8887238439679669.


In [17]:
# Summarise each model's test metrics over its runs as "mean ± std%" and print a
# LaTeX table (booktabs rules, tabularx layout) with the best mean per metric in bold.
# NOTE(review): the saved output of this cell is identical to the Set2 model3 table
# further down, so it was last run against the wrong `all_data`; re-run it right
# after the Set1 loading cell.
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []          # per model: column name -> LaTeX-ready cell text
displayed_means = []  # per model: metric -> mean as printed (2 decimals); parallel to `results`

for model in models:
    model_data = all_data[all_data['Model'] == model]
    if model_data.empty:
        # The loading cell tolerates a missing CSV; skip the model rather than
        # emitting a row of "nan ± nan%".
        print(f"No results for {model}; leaving it out of the table.")
        continue
    result_row = {'Model': model}
    shown = {}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # fraction -> percentage
        std = model_data[metric].std() * 100    # pandas default: sample std (ddof=1)
        mean_text = f"{mean:.2f}"
        # '%' is a comment character in LaTeX, so it is written escaped
        result_row[metric] = f"{mean_text} ± {std:.2f}\\%"
        shown[metric] = float(mean_text)
    results.append(result_row)
    displayed_means.append(shown)

# Bold the best score per metric. The comparison uses the means as displayed, so
# models that tie on the printed value are all highlighted.
for metric in metrics:
    candidates = [shown[metric] for shown in displayed_means if pd.notna(shown[metric])]
    if not candidates:
        continue
    best_mean = max(candidates)
    for result_row, shown in zip(results, displayed_means):
        if shown[metric] == best_mean:
            result_row[metric] = f"\\textbf{{{result_row[metric]}}}"

# Keep the formatted results available as a DataFrame for inspection
results_df = pd.DataFrame(results, columns=['Model'] + metrics)

# Build the table body directly instead of stripping the tabular wrapper from
# DataFrame.to_latex(), which depended on its exact output and column count.
table_rows = [
    " & ".join(result_row[column] for column in results_df.columns) + " \\\\"
    for result_row in results
]
table_contents = "\n".join(table_rows).replace("±", "$\\pm$")

# Assemble the full table; the first column is 'X' so tabularx can stretch it
latex_lines = [
    "\\begin{table}[ht]",
    "\\centering",
    "\\caption{RNA Binders vs Protein Binders (Merged Dataset)}",
    "\\label{tab:model_performance}",
    "\\scriptsize",
    "\\begin{tabularx}{\\textwidth}{X" + "c" * len(metrics) + "}",
    "\\toprule",
    "Model & " + " & ".join(metrics) + " \\\\",
    "\\midrule",
]
if table_contents:
    latex_lines.append(table_contents)
latex_lines += ["\\bottomrule", "\\end{tabularx}", "\\end{table}"]
latex_table = "\n".join(latex_lines) + "\n"

print(latex_table)


\begin{table}[ht]
\centering
\caption{RNA Binders vs Protein Binders (Merged Dataset)}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 71.17 $\pm$ 8.64\% & 68.37 $\pm$ 7.60\% & \textbf{83.13 $\pm$ 8.87\%} & 74.48 $\pm$ 5.23\% & 71.18 $\pm$ 8.63\% \\
GATv2Conv & 72.15 $\pm$ 4.32\% & 69.85 $\pm$ 3.84\% & 78.03 $\pm$ 5.48\% & 73.66 $\pm$ 4.17\% & 72.15 $\pm$ 4.32\% \\
SageConv & \textbf{74.68 $\pm$ 1.20\%} & \textbf{71.98 $\pm$ 1.21\%} & 80.97 $\pm$ 5.01\% & \textbf{76.12 $\pm$ 1.86\%} & \textbf{74.68 $\pm$ 1.20\%} \\\bottomrule
\end{tabularx}
\end{table}



# Set2 model small datasets model1 RNA binders vs RNA nonbinders (ROBIN)

In [18]:
# Load the per-run test metrics of every Set2 model1 (RNA binders vs RNA non-binders)
# model into one DataFrame.
# Expected layout: <base_dir>/<model sub-directory>/test_results_<Model>.csv
base_dir = 'set2_ml_models/model1_rna_b_nb'

# Model name (as it appears in the CSV file name) -> sub-directory with its results
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

dataframes = []     # one DataFrame per model whose CSV was found
missing_paths = []  # CSVs that were not found; reported if nothing could be loaded

for model, dir_name in models.items():
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    if not os.path.exists(csv_path):
        # A missing model is reported but does not abort: the other models still load.
        # NOTE(review): every downstream table then silently omits this model.
        print(f"File not found: {csv_path}")
        missing_paths.append(csv_path)
        continue
    df = pd.read_csv(csv_path)
    df['Model'] = model  # tag every run with the model that produced it
    dataframes.append(df)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the paths that were tried.
if not dataframes:
    raise FileNotFoundError(
        f"No test result CSVs found under '{base_dir}'. Looked for: {missing_paths}"
    )

# Stack all runs and normalise the hyphenated metric names used in the CSVs
all_data = pd.concat(dataframes, ignore_index=True).rename(
    columns={'F1-Score': 'F1 Score', 'ROC-AUC': 'ROC AUC'}
)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.652818,0.620382,0.648903,0.526718,0.581461,0.620502,GatedGraphConv
1,1,0.6384,0.629299,0.660377,0.534351,0.590717,0.62942,GatedGraphConv
2,2,0.64584,0.630573,0.626238,0.645408,0.635678,0.630592,GatedGraphConv
3,3,0.679409,0.564331,0.542088,0.821429,0.653144,0.564658,GatedGraphConv
4,4,0.660885,0.598726,0.584416,0.687023,0.631579,0.598613,GatedGraphConv
5,5,0.654,0.610191,0.593148,0.704835,0.644186,0.61007,GatedGraphConv
6,6,0.650367,0.596178,0.606232,0.545918,0.574497,0.596114,GatedGraphConv
7,7,0.646491,0.607643,0.603448,0.625,0.614035,0.607665,GatedGraphConv
8,8,0.65413,0.626752,0.660194,0.520408,0.582026,0.626616,GatedGraphConv
9,9,0.660795,0.614013,0.655052,0.479592,0.553756,0.613842,GatedGraphConv


In [19]:
# Find the single best run across all loaded models, ranked by the 'Accuracy' column
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best run
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

# This section analyses Set2 model1; the message previously said "Set1" (copy-paste error)
print(f"Set2 model1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best SageConv with the highest accuracy is from run 2 with an accuracy of 0.6611464968152866.


In [20]:
# Summarise each model's test metrics over its runs as "mean ± std%" and print a
# LaTeX table (booktabs rules, tabularx layout) with the best mean per metric in bold.
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []          # per model: column name -> LaTeX-ready cell text
displayed_means = []  # per model: metric -> mean as printed (2 decimals); parallel to `results`

for model in models:
    model_data = all_data[all_data['Model'] == model]
    if model_data.empty:
        # The loading cell tolerates a missing CSV; skip the model rather than
        # emitting a row of "nan ± nan%".
        print(f"No results for {model}; leaving it out of the table.")
        continue
    result_row = {'Model': model}
    shown = {}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # fraction -> percentage
        std = model_data[metric].std() * 100    # pandas default: sample std (ddof=1)
        mean_text = f"{mean:.2f}"
        # '%' is a comment character in LaTeX, so it is written escaped
        result_row[metric] = f"{mean_text} ± {std:.2f}\\%"
        shown[metric] = float(mean_text)
    results.append(result_row)
    displayed_means.append(shown)

# Bold the best score per metric. The comparison uses the means as displayed, so
# models that tie on the printed value are all highlighted.
for metric in metrics:
    candidates = [shown[metric] for shown in displayed_means if pd.notna(shown[metric])]
    if not candidates:
        continue
    best_mean = max(candidates)
    for result_row, shown in zip(results, displayed_means):
        if shown[metric] == best_mean:
            result_row[metric] = f"\\textbf{{{result_row[metric]}}}"

# Keep the formatted results available as a DataFrame for inspection
results_df = pd.DataFrame(results, columns=['Model'] + metrics)

# Build the table body directly instead of stripping the tabular wrapper from
# DataFrame.to_latex(), which depended on its exact output and column count.
table_rows = [
    " & ".join(result_row[column] for column in results_df.columns) + " \\\\"
    for result_row in results
]
table_contents = "\n".join(table_rows).replace("±", "$\\pm$")

# Assemble the full table; the first column is 'X' so tabularx can stretch it.
# Backslashes are doubled throughout ("\\_" rather than "\_") to avoid
# invalid-escape warnings.
latex_lines = [
    "\\begin{table}[H]",
    "\\centering",
    "\\caption{GNN models on RNA Binders vs RNA Non-Binders ($Set2\\_Small$)}",
    "\\label{tab:model_performance2}",
    "\\scriptsize",
    "\\begin{tabularx}{\\textwidth}{X" + "c" * len(metrics) + "}",
    "\\toprule",
    "Model & " + " & ".join(metrics) + " \\\\",
    "\\midrule",
]
if table_contents:
    latex_lines.append(table_contents)
latex_lines += ["\\bottomrule", "\\end{tabularx}", "\\end{table}"]
latex_table = "\n".join(latex_lines) + "\n"

print(latex_table)


\begin{table}[H]
\centering
\caption{GNN models on RNA Binders vs RNA Non-Binders ($Set2\_Small$)}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 60.98 $\pm$ 2.00\% & 61.80 $\pm$ 3.92\% & 60.91 $\pm$ 10.69\% & 60.61 $\pm$ 3.40\% & 60.98 $\pm$ 2.00\% \\
GATv2Conv & 60.13 $\pm$ 2.92\% & 60.88 $\pm$ 5.71\% & \textbf{61.62 $\pm$ 8.74\%} & 60.51 $\pm$ 2.55\% & 60.13 $\pm$ 2.91\% \\
SageConv & \textbf{61.78 $\pm$ 2.32\%} & \textbf{62.70 $\pm$ 3.38\%} & 59.40 $\pm$ 6.98\% & \textbf{60.69 $\pm$ 3.31\%} & \textbf{61.78 $\pm$ 2.32\%} \\\bottomrule
\end{tabularx}
\end{table}



# Set2 model small dataset model2 RNA binder (ROBIN) vs Protein Binder (Probes & Drugs)

In [21]:
# Load the per-run test metrics of every Set2 model2 (RNA binders vs protein binders)
# model into one DataFrame.
# Expected layout: <base_dir>/<model sub-directory>/test_results_<Model>.csv
base_dir = 'set2_ml_models/model2_rna_b_prot_b'

# Model name (as it appears in the CSV file name) -> sub-directory with its results
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

dataframes = []     # one DataFrame per model whose CSV was found
missing_paths = []  # CSVs that were not found; reported if nothing could be loaded

for model, dir_name in models.items():
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    if not os.path.exists(csv_path):
        # A missing model is reported but does not abort: the other models still load.
        # NOTE(review): every downstream table then silently omits this model.
        print(f"File not found: {csv_path}")
        missing_paths.append(csv_path)
        continue
    df = pd.read_csv(csv_path)
    df['Model'] = model  # tag every run with the model that produced it
    dataframes.append(df)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the paths that were tried.
if not dataframes:
    raise FileNotFoundError(
        f"No test result CSVs found under '{base_dir}'. Looked for: {missing_paths}"
    )

# Stack all runs and normalise the hyphenated metric names used in the CSVs
all_data = pd.concat(dataframes, ignore_index=True).rename(
    columns={'F1-Score': 'F1 Score', 'ROC-AUC': 'ROC AUC'}
)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.263752,0.907006,0.906091,0.908397,0.907243,0.907005,GatedGraphConv
1,1,0.277084,0.887898,0.880299,0.898219,0.889169,0.887885,GatedGraphConv
2,2,0.35352,0.848408,0.90991,0.772959,0.835862,0.848312,GatedGraphConv
3,3,0.357674,0.867516,0.911429,0.813776,0.859838,0.867448,GatedGraphConv
4,4,0.288136,0.900637,0.92,0.877863,0.898438,0.900666,GatedGraphConv
5,5,0.340909,0.872611,0.867168,0.880407,0.873737,0.872602,GatedGraphConv
6,6,0.441551,0.857325,0.880435,0.826531,0.852632,0.857286,GatedGraphConv
7,7,0.514383,0.727389,0.709906,0.767857,0.737745,0.72744,GatedGraphConv
8,8,0.323726,0.887898,0.870732,0.910714,0.890274,0.887927,GatedGraphConv
9,9,0.413499,0.821656,0.925676,0.69898,0.796512,0.8215,GatedGraphConv


In [22]:
# Find the single best run across all loaded models, ranked by the 'Accuracy' column
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best run
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

# This section analyses Set2 model2; the message previously said "Set1" (copy-paste error)
print(f"Set2 model2: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best GatedGraphConv with the highest accuracy is from run 10 with an accuracy of 0.9070063694267516.


In [23]:
# Summarise each model's test metrics over its runs as "mean ± std%" and print a
# LaTeX table (booktabs rules, tabularx layout) with the best mean per metric in bold.
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []          # per model: column name -> LaTeX-ready cell text
displayed_means = []  # per model: metric -> mean as printed (2 decimals); parallel to `results`

for model in models:
    model_data = all_data[all_data['Model'] == model]
    if model_data.empty:
        # The loading cell tolerates a missing CSV; skip the model rather than
        # emitting a row of "nan ± nan%".
        print(f"No results for {model}; leaving it out of the table.")
        continue
    result_row = {'Model': model}
    shown = {}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # fraction -> percentage
        std = model_data[metric].std() * 100    # pandas default: sample std (ddof=1)
        mean_text = f"{mean:.2f}"
        # '%' is a comment character in LaTeX, so it is written escaped
        result_row[metric] = f"{mean_text} ± {std:.2f}\\%"
        shown[metric] = float(mean_text)
    results.append(result_row)
    displayed_means.append(shown)

# Bold the best score per metric. The comparison uses the means as displayed, so
# models that tie on the printed value are all highlighted.
for metric in metrics:
    candidates = [shown[metric] for shown in displayed_means if pd.notna(shown[metric])]
    if not candidates:
        continue
    best_mean = max(candidates)
    for result_row, shown in zip(results, displayed_means):
        if shown[metric] == best_mean:
            result_row[metric] = f"\\textbf{{{result_row[metric]}}}"

# Keep the formatted results available as a DataFrame for inspection
results_df = pd.DataFrame(results, columns=['Model'] + metrics)

# Build the table body directly instead of stripping the tabular wrapper from
# DataFrame.to_latex(), which depended on its exact output and column count.
table_rows = [
    " & ".join(result_row[column] for column in results_df.columns) + " \\\\"
    for result_row in results
]
table_contents = "\n".join(table_rows).replace("±", "$\\pm$")

# Assemble the full table; the first column is 'X' so tabularx can stretch it.
# Backslashes are doubled throughout ("\\_" rather than "\_") to avoid
# invalid-escape warnings.
latex_lines = [
    "\\begin{table}[H]",
    "\\centering",
    "\\caption{GNN models on RNA Binders vs Protein Binders ($Set2\\_Small$)}",
    "\\label{tab:model_performance1}",
    "\\scriptsize",
    "\\begin{tabularx}{\\textwidth}{X" + "c" * len(metrics) + "}",
    "\\toprule",
    "Model & " + " & ".join(metrics) + " \\\\",
    "\\midrule",
]
if table_contents:
    latex_lines.append(table_contents)
latex_lines += ["\\bottomrule", "\\end{tabularx}", "\\end{table}"]
latex_table = "\n".join(latex_lines) + "\n"

print(latex_table)


\begin{table}[H]
\centering
\caption{GNN models on RNA Binders vs Protein Binders ($Set2\_Small$)}
\label{tab:model_performance1}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 85.78 $\pm$ 5.25\% & \textbf{87.82 $\pm$ 6.27\%} & 83.56 $\pm$ 7.18\% & 85.41 $\pm$ 5.26\% & 85.78 $\pm$ 5.25\% \\
GATv2Conv & 83.96 $\pm$ 3.74\% & 83.20 $\pm$ 4.67\% & 85.35 $\pm$ 3.65\% & 84.20 $\pm$ 3.53\% & 83.96 $\pm$ 3.74\% \\
SageConv & \textbf{87.19 $\pm$ 1.60\%} & 87.70 $\pm$ 2.02\% & \textbf{86.61 $\pm$ 3.70\%} & \textbf{87.09 $\pm$ 1.82\%} & \textbf{87.19 $\pm$ 1.60\%} \\\bottomrule
\end{tabularx}
\end{table}



# Set2 model small dataset model3 RNA Binder (ROBIN) vs Non-Binders (Merged small)

In [24]:
# Load the per-run test metrics of every Set2 model3 (RNA binders vs non-binders)
# model into one DataFrame.
# Expected layout: <base_dir>/<model sub-directory>/test_results_<Model>.csv
base_dir = 'set2_ml_models/model3_binder_nonbinder'

# Model name (as it appears in the CSV file name) -> sub-directory with its results
models = {
    'GatedGraphConv': 'gatedgraphconv',
    'GATv2Conv': 'gatv2conv',
    'SageConv': 'sageconv'
}

dataframes = []     # one DataFrame per model whose CSV was found
missing_paths = []  # CSVs that were not found; reported if nothing could be loaded

for model, dir_name in models.items():
    csv_path = os.path.join(base_dir, dir_name, f'test_results_{model}.csv')
    if not os.path.exists(csv_path):
        # A missing model is reported but does not abort: the other models still load.
        # NOTE(review): every downstream table then silently omits this model.
        print(f"File not found: {csv_path}")
        missing_paths.append(csv_path)
        continue
    df = pd.read_csv(csv_path)
    df['Model'] = model  # tag every run with the model that produced it
    dataframes.append(df)

# pd.concat() fails with an opaque "No objects to concatenate" on an empty list,
# so stop here with a message that names the paths that were tried.
if not dataframes:
    raise FileNotFoundError(
        f"No test result CSVs found under '{base_dir}'. Looked for: {missing_paths}"
    )

# Stack all runs and normalise the hyphenated metric names used in the CSVs
all_data = pd.concat(dataframes, ignore_index=True).rename(
    columns={'F1-Score': 'F1 Score', 'ROC-AUC': 'ROC AUC'}
)
# Display the combined DataFrame
all_data

Unnamed: 0,Run ID,Test Loss,Accuracy,Precision,Recall,F1 Score,ROC AUC,Model
0,10,0.491645,0.764331,0.724138,0.854962,0.784131,0.764216,GatedGraphConv
1,1,0.57201,0.722293,0.702079,0.773537,0.736077,0.722228,GatedGraphConv
2,2,0.534359,0.765605,0.76943,0.757653,0.763496,0.765595,GatedGraphConv
3,3,0.538557,0.745223,0.713333,0.818878,0.76247,0.745317,GatedGraphConv
4,4,0.659731,0.616561,0.606977,0.664122,0.634265,0.6165,GatedGraphConv
5,5,0.518339,0.749045,0.708511,0.847328,0.771727,0.748919,GatedGraphConv
6,6,0.700063,0.499363,0.499363,1.0,0.6661,0.5,GatedGraphConv
7,7,0.471273,0.747771,0.708155,0.841837,0.769231,0.74789,GatedGraphConv
8,8,0.502531,0.750318,0.702479,0.867347,0.776256,0.750467,GatedGraphConv
9,9,0.523087,0.756688,0.70303,0.887755,0.784667,0.756855,GatedGraphConv


In [25]:
# Find the single best run across all loaded models, ranked by the 'Accuracy' column
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best run
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['Run ID']
model = best_model_row['Model']

# This section analyses Set2 model3; the message previously said "Set1" (copy-paste error)
print(f"Set2 model3: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best SageConv with the highest accuracy is from run 10 with an accuracy of 0.7770700636942676.


In [27]:
# Summarise each model's test metrics over its runs as "mean ± std%" and print a
# LaTeX table (booktabs rules, tabularx layout) with the best mean per metric in bold.
models = ['GatedGraphConv', 'GATv2Conv', 'SageConv']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []          # per model: column name -> LaTeX-ready cell text
displayed_means = []  # per model: metric -> mean as printed (2 decimals); parallel to `results`

for model in models:
    model_data = all_data[all_data['Model'] == model]
    if model_data.empty:
        # The loading cell tolerates a missing CSV; skip the model rather than
        # emitting a row of "nan ± nan%".
        print(f"No results for {model}; leaving it out of the table.")
        continue
    result_row = {'Model': model}
    shown = {}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # fraction -> percentage
        std = model_data[metric].std() * 100    # pandas default: sample std (ddof=1)
        mean_text = f"{mean:.2f}"
        # '%' is a comment character in LaTeX, so it is written escaped
        result_row[metric] = f"{mean_text} ± {std:.2f}\\%"
        shown[metric] = float(mean_text)
    results.append(result_row)
    displayed_means.append(shown)

# Bold the best score per metric. The comparison uses the means as displayed, so
# models that tie on the printed value are all highlighted.
for metric in metrics:
    candidates = [shown[metric] for shown in displayed_means if pd.notna(shown[metric])]
    if not candidates:
        continue
    best_mean = max(candidates)
    for result_row, shown in zip(results, displayed_means):
        if shown[metric] == best_mean:
            result_row[metric] = f"\\textbf{{{result_row[metric]}}}"

# Keep the formatted results available as a DataFrame for inspection
results_df = pd.DataFrame(results, columns=['Model'] + metrics)

# Build the table body directly instead of stripping the tabular wrapper from
# DataFrame.to_latex(), which depended on its exact output and column count.
table_rows = [
    " & ".join(result_row[column] for column in results_df.columns) + " \\\\"
    for result_row in results
]
table_contents = "\n".join(table_rows).replace("±", "$\\pm$")

# Assemble the full table; the first column is 'X' so tabularx can stretch it.
# Backslashes are doubled throughout ("\\_" rather than "\_") to avoid
# invalid-escape warnings.
latex_lines = [
    "\\begin{table}[H]",
    "\\centering",
    "\\caption{GNN models on RNA-Binders vs Non-Binders ($Set2\\_Small$)}",
    "\\label{tab:model_performance3}",
    "\\scriptsize",
    "\\begin{tabularx}{\\textwidth}{X" + "c" * len(metrics) + "}",
    "\\toprule",
    "Model & " + " & ".join(metrics) + " \\\\",
    "\\midrule",
]
if table_contents:
    latex_lines.append(table_contents)
latex_lines += ["\\bottomrule", "\\end{tabularx}", "\\end{table}"]
latex_table = "\n".join(latex_lines) + "\n"

print(latex_table)


\begin{table}[H]
\centering
\caption{GNN models on RNA-Binders vs Non-Binders ($Set2\_Small$)}
\label{tab:model_performance3}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
GatedGraphConv & 71.17 $\pm$ 8.64\% & 68.37 $\pm$ 7.60\% & \textbf{83.13 $\pm$ 8.87\%} & 74.48 $\pm$ 5.23\% & 71.18 $\pm$ 8.63\% \\
GATv2Conv & 72.15 $\pm$ 4.32\% & 69.85 $\pm$ 3.84\% & 78.03 $\pm$ 5.48\% & 73.66 $\pm$ 4.17\% & 72.15 $\pm$ 4.32\% \\
SageConv & \textbf{74.68 $\pm$ 1.20\%} & \textbf{71.98 $\pm$ 1.21\%} & 80.97 $\pm$ 5.01\% & \textbf{76.12 $\pm$ 1.86\%} & \textbf{74.68 $\pm$ 1.20\%} \\\bottomrule
\end{tabularx}
\end{table}

