In [1]:
import os
import pandas as pd
import os
import pandas as pd
import numpy as np
from scipy import stats
# import joblib
import joblib


In [2]:
# Define the base directory where CSV files are located
base_dir = '/home/ubuntu/diplomka/notebooks_ipynb/a_results_ml_set1/ml_output/output'

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each sample directory
for i in range(1, 11):  # Assuming there are 10 samples as mentioned
    # Format the directory name and path to the CSV
    dir_name = f'sample_{i:03d}'
    csv_path = os.path.join(base_dir, dir_name, 'results', f'sorted_metrics_df_{i:03d}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Append the DataFrame to the list with an additional column for the run identifier
        df['run_id'] = i
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)

# Models and metrics to analyze
models = ['XGBoost', 'LightGBM', 'Random Forest']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{RNA Binders vs Protein Binders (Merged Dataset)}\n"
    "\\label{tab:model_performance}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)



\begin{table}[ht]
\centering
\caption{RNA Binders vs Protein Binders (Merged Dataset)}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
XGBoost & \textbf{88.47 $\pm$ 0.32\%} & \textbf{89.11 $\pm$ 0.39\%} & 87.64 $\pm$ 0.50\% & \textbf{88.37 $\pm$ 0.34\%} & \textbf{95.38 $\pm$ 0.21\%} \\
LightGBM & 88.36 $\pm$ 0.32\% & 88.81 $\pm$ 0.34\% & \textbf{87.79 $\pm$ 0.58\%} & 88.30 $\pm$ 0.34\% & 95.26 $\pm$ 0.26\% \\
Random Forest & 86.61 $\pm$ 0.25\% & 88.12 $\pm$ 0.52\% & 84.63 $\pm$ 0.80\% & 86.34 $\pm$ 0.30\% & 94.15 $\pm$ 0.20\% \\\bottomrule
\end{tabularx}
\end{table}



In [3]:
all_data

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,run_id
0,LightGBM,0.886657,0.891768,0.880134,0.885913,0.952861,1
1,XGBoost,0.885688,0.893205,0.87613,0.884585,0.95419,1
2,Random Forest,0.868316,0.888859,0.841901,0.864743,0.942123,1
3,XGBoost,0.888659,0.894158,0.881684,0.887877,0.955541,2
4,LightGBM,0.884784,0.889208,0.879101,0.884126,0.953526,2
5,Random Forest,0.869478,0.88713,0.84668,0.866433,0.942562,2
6,XGBoost,0.882201,0.889035,0.873418,0.881157,0.950897,3
7,LightGBM,0.879036,0.889967,0.865022,0.877317,0.947109,3
8,Random Forest,0.861405,0.883078,0.833118,0.857371,0.938457,3
9,LightGBM,0.887755,0.892418,0.881813,0.887084,0.955327,4


In [4]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['run_id']
model = best_model_row['Model']

print(f"Set1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set1: Best XGBoost with the highest accuracy is from run 8 with an accuracy of 0.8904675794368381.


In [5]:
# load the best model and provide the model parameters
# Load the best model
best_model_dir = os.path.join(base_dir, f'sample_{best_model_run_id:03d}', 'models/')
best_model_path = os.path.join(best_model_dir, f'best_xgb.joblib')

# Load the model
best_model = joblib.load(best_model_path)

# Extract the model parameters
model_params = best_model.get_params()

# Print the model parameters
print(f"Set1: Best {model} model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")
    

Set1: Best XGBoost model parameters:
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: 0.5080878720498611
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: logloss
feature_types: None
gamma: 0.029686411895295084
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.23041030623581468
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 9
max_leaves: None
min_child_weight: 1
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 616
n_jobs: None
num_parallel_tree: None
random_state: 888
reg_alpha: 0.0834895743821972
reg_lambda: 0.01085950121847113
sampling_method: None
scale_pos_weight: None
subsample: 0.8946189513327496
tree_method: None
validate_parameters: None
verbosity: None
use_label_encoder: False


In [6]:
import os
import pandas as pd
import numpy as np

# Define the base directory where CSV files are located
base_dir = '/home/ubuntu/diplomka/notebooks_ipynb/ml/model1_rna_b_rna_n/output'

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each sample directory
for i in range(1, 11):  # Assuming there are 10 samples as mentioned
    # Format the directory name and path to the CSV
    dir_name = f'sample_{i:03d}'
    csv_path = os.path.join(base_dir, dir_name, 'results', f'sorted_metrics_df_{i:03d}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Append the DataFrame to the list with an additional column for the run identifier
        df['run_id'] = i
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)

# Models and metrics to analyze
models = ['XGBoost', 'LightGBM', 'Random Forest']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{RNA Binders vs RNA Non-Binders (ROBIN)}\n"
    "\\label{tab:model_performance}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)



\begin{table}[ht]
\centering
\caption{RNA Binders vs RNA Non-Binders (ROBIN)}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
XGBoost & 60.64 $\pm$ 0.92\% & 61.53 $\pm$ 1.04\% & 56.51 $\pm$ 1.04\% & 58.91 $\pm$ 0.96\% & 65.49 $\pm$ 0.68\% \\
LightGBM & 59.83 $\pm$ 1.28\% & 60.60 $\pm$ 1.50\% & 55.99 $\pm$ 1.08\% & 58.20 $\pm$ 1.19\% & 64.05 $\pm$ 1.32\% \\
Random Forest & \textbf{61.90 $\pm$ 0.58\%} & \textbf{63.24 $\pm$ 0.71\%} & \textbf{56.61 $\pm$ 1.17\%} & \textbf{59.73 $\pm$ 0.75\%} & \textbf{66.90 $\pm$ 0.25\%} \\\bottomrule
\end{tabularx}
\end{table}



In [7]:
all_data

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,run_id
0,Random Forest,0.626752,0.638655,0.581633,0.608812,0.669795,1
1,LightGBM,0.597452,0.603825,0.563776,0.583113,0.630001,1
2,XGBoost,0.589809,0.598315,0.543367,0.569519,0.650556,1
3,LightGBM,0.617834,0.628492,0.57398,0.6,0.657728,2
4,Random Forest,0.615287,0.627119,0.566327,0.595174,0.665959,2
5,XGBoost,0.606369,0.612466,0.576531,0.593955,0.650075,2
6,Random Forest,0.625478,0.64,0.571429,0.603774,0.673236,3
7,XGBoost,0.614013,0.62465,0.568878,0.595461,0.650945,3
8,LightGBM,0.592357,0.601695,0.543367,0.571046,0.624721,3
9,XGBoost,0.611465,0.620499,0.571429,0.594954,0.66385,4


In [8]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['run_id']
model = best_model_row['Model']

print(f"Set2 model1: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set2 model1: Best Random Forest with the highest accuracy is from run 1 with an accuracy of 0.6267515923566879.


In [9]:
best_model_dir = os.path.join(base_dir, f'sample_{best_model_run_id:03d}', 'models/')
best_model_path = os.path.join(best_model_dir, f'best_xgb.joblib')

# Load the model
best_model = joblib.load(best_model_path)

# Extract the model parameters
model_params = best_model.get_params()

# Print the model parameters
print(f"Set1: Best {model} model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best Random Forest model parameters:
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: 0.7070250833551986
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: logloss
feature_types: None
gamma: 3.058769756211789
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.13470618568464995
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 8
max_leaves: None
min_child_weight: 6
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 226
n_jobs: None
num_parallel_tree: None
random_state: 42
reg_alpha: 0.0106775032295119
reg_lambda: 2.143538894750219
sampling_method: None
scale_pos_weight: None
subsample: 0.7995046659482125
tree_method: None
validate_parameters: None
verbosity: None
use_label_encoder: False


In [10]:
import os
import pandas as pd
import numpy as np

# Define the base directory where CSV files are located
base_dir = '/home/ubuntu/diplomka/notebooks_ipynb/ml/model2_rna_prot_bin/output'

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each sample directory
for i in range(1, 11):  # Assuming there are 10 samples as mentioned
    # Format the directory name and path to the CSV
    dir_name = f'sample_{i:03d}'
    csv_path = os.path.join(base_dir, dir_name, 'results', f'sorted_metrics_df_{i:03d}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Append the DataFrame to the list with an additional column for the run identifier
        df['run_id'] = i
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)

# Models and metrics to analyze
models = ['XGBoost', 'LightGBM', 'Random Forest']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{RNA-Binders vs Probes & Drugs (Protein-Binders)}\n"
    "\\label{tab:model_performance}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)



\begin{table}[ht]
\centering
\caption{RNA-Binders vs Probes & Drugs (Protein-Binders)}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
XGBoost & 87.12 $\pm$ 0.42\% & 88.23 $\pm$ 0.49\% & 85.64 $\pm$ 0.66\% & 86.91 $\pm$ 0.44\% & 94.41 $\pm$ 0.24\% \\
LightGBM & \textbf{87.69 $\pm$ 0.60\%} & \textbf{88.65 $\pm$ 0.70\%} & 86.43 $\pm$ 0.81\% & \textbf{87.52 $\pm$ 0.61\%} & 94.47 $\pm$ 0.27\% \\
Random Forest & 86.34 $\pm$ 0.45\% & 85.78 $\pm$ 0.51\% & \textbf{87.09 $\pm$ 0.51\%} & 86.43 $\pm$ 0.44\% & \textbf{94.75 $\pm$ 0.16\%} \\\bottomrule
\end{tabularx}
\end{table}



In [11]:
all_data

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,run_id
0,LightGBM,0.884076,0.886889,0.880102,0.883483,0.949823,1
1,XGBoost,0.864968,0.874346,0.852041,0.863049,0.941372,1
2,Random Forest,0.859873,0.854271,0.867347,0.860759,0.949875,1
3,LightGBM,0.878981,0.887728,0.867347,0.877419,0.946143,2
4,XGBoost,0.870064,0.879581,0.857143,0.868217,0.944955,2
5,Random Forest,0.864968,0.8575,0.875,0.866162,0.950005,2
6,LightGBM,0.878981,0.891821,0.862245,0.876783,0.943624,3
7,Random Forest,0.870064,0.8625,0.880102,0.871212,0.946523,3
8,XGBoost,0.867516,0.878947,0.852041,0.865285,0.942346,3
9,XGBoost,0.871338,0.879896,0.859694,0.869677,0.944579,4


In [12]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['run_id']
model = best_model_row['Model']

print(f"Set2 model2: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set2 model2: Best LightGBM with the highest accuracy is from run 1 with an accuracy of 0.8840764331210191.


In [13]:
best_model_dir = os.path.join(base_dir, f'sample_{best_model_run_id:03d}', 'models/')
best_model_path = os.path.join(best_model_dir, f'best_xgb.joblib')

# Load the model
best_model = joblib.load(best_model_path)

# Extract the model parameters
model_params = best_model.get_params()

# Print the model parameters
print(f"Set1: Best {model} model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best LightGBM model parameters:
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: 0.6587520309276079
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: logloss
feature_types: None
gamma: 0.8458819434232958
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.08536385335889851
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 9
max_leaves: None
min_child_weight: 1
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 425
n_jobs: None
num_parallel_tree: None
random_state: 42
reg_alpha: 0.48802394866358306
reg_lambda: 3.8034022960292613
sampling_method: None
scale_pos_weight: None
subsample: 0.8160143522695071
tree_method: None
validate_parameters: None
verbosity: None
use_label_encoder: False


In [14]:
import os
import pandas as pd
import numpy as np

# Define the base directory where CSV files are located
base_dir = '/home/ubuntu/diplomka/notebooks_ipynb/ml/model3_binder_nonbinder/output'

# Prepare to collect all DataFrame objects
dataframes = []

# Iterate over each sample directory
for i in range(1, 11):  # Assuming there are 10 samples as mentioned
    # Format the directory name and path to the CSV
    dir_name = f'sample_{i:03d}'
    csv_path = os.path.join(base_dir, dir_name, 'results', f'sorted_metrics_df_{i:03d}.csv')
    
    # Check if the file exists
    if os.path.exists(csv_path):
        # Load the CSV file
        df = pd.read_csv(csv_path)
        # Append the DataFrame to the list with an additional column for the run identifier
        df['run_id'] = i
        dataframes.append(df)
    else:
        print(f"File not found: {csv_path}")

# Concatenate all DataFrames into a single DataFrame
all_data = pd.concat(dataframes, ignore_index=True)

# Models and metrics to analyze
models = ['XGBoost', 'LightGBM', 'Random Forest']
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC']

results = []

# Calculate metrics for each model
for model in models:
    model_data = all_data[all_data['Model'] == model]
    result_row = {'Model': model}
    for metric in metrics:
        mean = model_data[metric].mean() * 100  # Convert to percentage
        std = model_data[metric].std() * 100  # Convert to percentage
        # Format as mean ± std%
        result_row[metric] = f"{mean:.2f} ± {std:.2f}%"
    results.append(result_row)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Identify the best scores and bold them
for metric in metrics:
    best_score = max(results_df[metric], key=lambda x: float(x.split(' ± ')[0].replace('%','')))
    results_df[metric] = results_df[metric].apply(
        lambda x: f"\\textbf{{{x}}}" if x == best_score else x
    )

# Prepare the DataFrame to LaTeX table contents without adding LaTeX table commands
# Escape LaTeX special characters
results_df.replace('%', r'\%', regex=True, inplace=True)
table_contents = results_df.to_latex(index=False, escape=False, header=False)

# Strip unwanted LaTeX commands from the table contents
table_contents = table_contents.replace("\\begin{tabular}{llllll}", "") \
                               .replace("\\toprule", "") \
                               .replace("\\midrule", "") \
                               .replace("\\bottomrule", "") \
                               .replace("\\end{tabular}", "")\
                               .replace("±", "$\pm$")

# Manually construct the LaTeX table using tabularx to fit the width of the text
latex_table = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{RNA-Binders vs Non-Binders}\n"
    "\\label{tab:model_performance}\n"
    "\\scriptsize\n"
    "\\begin{tabularx}{\\textwidth}{Xccccc}\n"  # Changed 'l' to 'X' for the first column
    "\\toprule\n"
    "Model & " + " & ".join(metrics) + " \\\\\n"
    "\\midrule\n"
    + table_contents.strip() +  # Remove leading/trailing whitespace
    "\\bottomrule\n"
    "\\end{tabularx}\n"
    "\\end{table}\n"
)

print(latex_table)


\begin{table}[ht]
\centering
\caption{RNA-Binders vs Non-Binders}
\label{tab:model_performance}
\scriptsize
\begin{tabularx}{\textwidth}{Xccccc}
\toprule
Model & Accuracy & Precision & Recall & F1 Score & ROC AUC \\
\midrule
XGBoost & 72.65 $\pm$ 0.61\% & 71.76 $\pm$ 0.71\% & \textbf{74.59 $\pm$ 0.89\%} & 73.15 $\pm$ 0.60\% & \textbf{80.05 $\pm$ 0.28\%} \\
LightGBM & 71.82 $\pm$ 0.88\% & 70.80 $\pm$ 0.98\% & 74.18 $\pm$ 1.54\% & 72.44 $\pm$ 0.91\% & 79.25 $\pm$ 0.83\% \\
Random Forest & \textbf{73.59 $\pm$ 0.89\%} & \textbf{73.23 $\pm$ 0.52\%} & 74.26 $\pm$ 1.88\% & \textbf{73.73 $\pm$ 1.14\%} & 79.96 $\pm$ 0.27\% \\\bottomrule
\end{tabularx}
\end{table}



In [15]:
all_data

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC AUC,run_id
0,XGBoost,0.724841,0.72335,0.727041,0.725191,0.80326,1
1,LightGBM,0.722293,0.710145,0.75,0.729529,0.791913,1
2,Random Forest,0.721019,0.724675,0.711735,0.718147,0.800254,1
3,Random Forest,0.749045,0.738386,0.770408,0.754057,0.801705,2
4,XGBoost,0.724841,0.715686,0.744898,0.73,0.795964,2
5,LightGBM,0.718471,0.71867,0.716837,0.717752,0.803364,2
6,Random Forest,0.732484,0.7275,0.742347,0.734848,0.798697,3
7,XGBoost,0.719745,0.711823,0.737245,0.724311,0.797424,3
8,LightGBM,0.717197,0.70936,0.734694,0.721805,0.792056,3
9,Random Forest,0.742675,0.736318,0.755102,0.745592,0.803013,4


In [16]:
# Assuming 'accuracy' is the column name that contains accuracy scores
# Find the row with the maximum accuracy
best_model_row = all_data.loc[all_data['Accuracy'].idxmax()]

# Extract information about the best model
best_accuracy = best_model_row['Accuracy']
best_model_run_id = best_model_row['run_id']
model = best_model_row['Model']

print(f"Set2 model3: Best {model} with the highest accuracy is from run {best_model_run_id} with an accuracy of {best_accuracy}.")


Set2 model3: Best Random Forest with the highest accuracy is from run 2 with an accuracy of 0.7490445859872611.


In [17]:
best_model_dir = os.path.join(base_dir, f'sample_{best_model_run_id:03d}', 'models/')
best_model_path = os.path.join(best_model_dir, f'best_xgb.joblib')

# Load the model
best_model = joblib.load(best_model_path)

# Extract the model parameters
model_params = best_model.get_params()

# Print the model parameters
print(f"Set1: Best {model} model parameters:")
for param, value in model_params.items():
    print(f"{param}: {value}")

Set1: Best Random Forest model parameters:
objective: binary:logistic
base_score: None
booster: None
callbacks: None
colsample_bylevel: None
colsample_bynode: None
colsample_bytree: 0.6276078668490618
device: None
early_stopping_rounds: None
enable_categorical: False
eval_metric: logloss
feature_types: None
gamma: 2.130304930228856
grow_policy: None
importance_type: None
interaction_constraints: None
learning_rate: 0.03705547031848796
max_bin: None
max_cat_threshold: None
max_cat_to_onehot: None
max_delta_step: None
max_depth: 8
max_leaves: None
min_child_weight: 1
missing: nan
monotone_constraints: None
multi_strategy: None
n_estimators: 165
n_jobs: None
num_parallel_tree: None
random_state: 420
reg_alpha: 0.07485896891098083
reg_lambda: 1.2648674244551141
sampling_method: None
scale_pos_weight: None
subsample: 0.6349367806676871
tree_method: None
validate_parameters: None
verbosity: None
use_label_encoder: False
