# Run Random Forest (RFC) and MLP baselines per tissue

In [52]:
import os
import torch
import pickle
import time
import pandas as pd
import torch.nn.functional as F
from omegaconf import OmegaConf, DictConfig
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, mean_squared_error, accuracy_score,
    precision_score, recall_score, f1_score
)

from utils.knowledge_db import CODON_MAP_DNA, TISSUES, TOKENS
from utils.utils import set_project_path, set_log_file, check_config
from data_handling.data_loader import RNADataset

# Lookup table: integer tissue id (position in TISSUES) -> tissue name.
TISSUES_DICT = dict(enumerate(TISSUES))

In [59]:
from sklearn.neural_network import MLPClassifier

# NOTE(review): `dev_config` is not defined in any cell visible in this notebook;
# it has to exist in the kernel before this cell runs. Confirm it is created in
# an earlier cell so that Restart Kernel -> Run All works.

# Probability cut-off used to turn positive-class probabilities into 0/1 labels.
DECISION_THRESHOLD = 0.5


def evaluate_classifier(model, x_train, y_train, x_val, y_val, tissue_name,
                        threshold=DECISION_THRESHOLD):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict_proba``.
        x_train, y_train: Training features and binary targets.
        x_val, y_val: Evaluation features and binary targets (reported as "Test").
        tissue_name: Label stored in the "Tissue" column of the result row.
        threshold: Positive-class probability at or above which a sample is
            predicted as class 1.

    Returns:
        dict: One result row with the keys "Tissue", then AUC / F1 / Accuracy /
        Precision / Recall prefixed with "Train " and "Test ", and finally
        "Train Time (s)" (wall-clock duration of ``fit`` only).
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() can.
    fit_start = time.perf_counter()
    model.fit(x_train, y_train)
    fit_seconds = time.perf_counter() - fit_start

    row = {"Tissue": tissue_name}
    for split, features, targets in (("Train", x_train, y_train), ("Test", x_val, y_val)):
        # Column 1 of predict_proba is the probability of the positive class.
        proba = model.predict_proba(features)[:, 1]
        pred = (proba >= threshold).astype(int)
        row[f"{split} AUC"] = roc_auc_score(targets, proba)
        row[f"{split} F1"] = f1_score(targets, pred, zero_division=0)
        row[f"{split} Accuracy"] = accuracy_score(targets, pred)
        row[f"{split} Precision"] = precision_score(targets, pred, zero_division=0)
        row[f"{split} Recall"] = recall_score(targets, pred, zero_division=0)
    row["Train Time (s)"] = fit_seconds
    return row


# Random Forest results
results_rf = []

# MLP results
results_mlp = []

# Iterate over every known tissue instead of a hard-coded count, so the loop
# cannot drift out of sync with TISSUES (ids are 0..n-1 by construction).
for tissue_id, tissue_name in TISSUES_DICT.items():
    # The tissue id is written into the shared config and also passed explicitly.
    dev_config["tissue_id"] = tissue_id

    train_dataset = RNADataset(dev_config, tissue_id)
    val_dataset = RNADataset(dev_config, tissue_id, test=True)

    x_train = train_dataset.freqs
    y_train = train_dataset.targets_bin
    x_val = val_dataset.freqs
    y_val = val_dataset.targets_bin

    # --- Random Forest ---
    rf_model = RandomForestClassifier(random_state=42)
    results_rf.append(
        evaluate_classifier(rf_model, x_train, y_train, x_val, y_val, tissue_name)
    )

    # --- MLP ---
    # `(512,)` is a one-element tuple (a single hidden layer of 512 units);
    # the previous `(512)` was just the integer 512 in parentheses.
    mlp_model = MLPClassifier(hidden_layer_sizes=(512,), activation='relu', max_iter=500, random_state=42)
    results_mlp.append(
        evaluate_classifier(mlp_model, x_train, y_train, x_val, y_val, tissue_name)
    )

# Convert results to DataFrames (one row per tissue)
df_results_rf = pd.DataFrame(results_rf)
df_results_mlp = pd.DataFrame(results_mlp)

2025-06-20 13:24:25,891 - root - INFO - Loading train data from: train_9.0k_data.pkl
2025-06-20 13:24:42,374 - root - INFO - Loading val data from: val_9.0k_data.pkl
2025-06-20 13:24:45,793 - root - INFO - Adding validation dataset with 72 samples to train
2025-06-20 13:24:45,795 - root - INFO - Train dataset with 330 samples loaded
2025-06-20 13:24:45,796 - root - INFO - # Stats of dataset after filtering
2025-06-20 13:24:45,800 - root - INFO -   Class distribution:
tissue_id  targets_bin
0          0              145
           1              185
Name: targets, dtype: int64
2025-06-20 13:24:45,802 - root - INFO -   Seq len distribution:
(array([36, 61, 46, 59, 40, 31, 19, 14, 17,  7]), array([ 496. , 1343.5, 2191. , 3038.5, 3886. , 4733.5, 5581. , 6428.5,
       7276. , 8123.5, 8971. ]))
2025-06-20 13:24:46,145 - root - INFO - Loading test data from: test_9.0k_data.pkl
2025-06-20 13:24:49,698 - root - INFO - Test dataset with 47 samples loaded
2025-06-20 13:24:49,704 - root - INFO - 

In [60]:
# Per-tissue Random Forest metrics: train/test AUC, F1, accuracy, precision, recall and fit time.
df_results_rf

Unnamed: 0,Tissue,Train AUC,Train F1,Train Accuracy,Train Precision,Train Recall,Test AUC,Test F1,Test Accuracy,Test Precision,Test Recall,Train Time (s)
0,Adrenal,1.0,1.0,1.0,1.0,1.0,0.462862,0.590164,0.468085,0.473684,0.782609,0.30421
1,Appendices,1.0,1.0,1.0,1.0,1.0,0.567808,0.045455,0.681818,1.0,0.023256,0.552087
2,Brain,1.0,1.0,1.0,1.0,1.0,0.543047,0.808081,0.680672,0.683761,0.987654,0.737857
3,Colon,1.0,1.0,1.0,1.0,1.0,0.789911,0.704225,0.752941,0.833333,0.609756,0.386968
4,Duodenum,1.0,1.0,1.0,1.0,1.0,0.565882,0.372093,0.542373,0.444444,0.32,0.306681
5,Uterus,1.0,1.0,1.0,1.0,1.0,0.581178,0.724638,0.641509,0.625,0.862069,0.260196
6,Esophagus,1.0,1.0,1.0,1.0,1.0,0.477583,0.093023,0.566667,0.166667,0.064516,0.397382
7,Fallopiantube,1.0,1.0,1.0,1.0,1.0,0.627539,0.769231,0.662921,0.735294,0.806452,0.353287
8,Fat,1.0,1.0,1.0,1.0,1.0,0.573465,0.093023,0.732877,0.4,0.052632,0.711632
9,Gallbladder,1.0,1.0,1.0,1.0,1.0,0.608182,0.434783,0.623188,0.47619,0.4,0.311719


In [61]:
# Per-tissue MLP metrics: train/test AUC, F1, accuracy, precision, recall and fit time.
df_results_mlp

Unnamed: 0,Tissue,Train AUC,Train F1,Train Accuracy,Train Precision,Train Recall,Test AUC,Test F1,Test Accuracy,Test Precision,Test Recall,Train Time (s)
0,Adrenal,0.78822,0.763819,0.715152,0.713615,0.821622,0.539855,0.618182,0.553191,0.53125,0.73913,2.699573
1,Appendices,0.712819,0.264706,0.717115,0.654545,0.165899,0.559185,0.0,0.651515,0.0,0.0,4.276603
2,Brain,0.704905,0.841148,0.736964,0.742892,0.969355,0.67024,0.806283,0.689076,0.7,0.950617,5.518711
3,Colon,0.832051,0.712531,0.753684,0.783784,0.653153,0.746674,0.65,0.670588,0.666667,0.634146,3.881353
4,Duodenum,0.757527,0.654321,0.686275,0.679487,0.630952,0.509412,0.382979,0.508475,0.409091,0.36,2.756858
5,Uterus,0.812236,0.81268,0.758364,0.746032,0.892405,0.568966,0.666667,0.584906,0.594595,0.758621,2.361644
6,Esophagus,0.760746,0.513889,0.718876,0.698113,0.406593,0.43521,0.208333,0.577778,0.294118,0.16129,3.956932
7,Fallopiantube,0.78568,0.794224,0.734884,0.745763,0.849421,0.646356,0.745763,0.662921,0.785714,0.709677,3.339609
8,Fat,0.731447,0.214286,0.748731,0.692308,0.126761,0.52729,0.046512,0.719178,0.2,0.026316,5.921046
9,Gallbladder,0.774634,0.635379,0.715493,0.692913,0.586667,0.505455,0.423077,0.565217,0.407407,0.44,2.752788


In [62]:
# Mean test-split ROC AUC of the Random Forest over all rows (tissues) in the table.
df_results_rf.loc[:, "Test AUC"].mean()

0.605856476720072

In [63]:
# Mean test-split ROC AUC of the MLP over all rows (tissues) in the table.
df_results_mlp.loc[:, "Test AUC"].mean()

0.5932103863181245

In [49]:
# Persist the Random Forest per-tissue scores. The previous `df_results` name is
# not defined anywhere in this notebook (stale cell), so it raised NameError on
# a fresh kernel; the RFC file name corresponds to `df_results_rf`.
df_results_rf.to_csv("visualisation/data/rfc_per_tissue_scores.csv")