In [2]:
import pandas as pd
import numpy as np

In [11]:
easy_path = "chem_map_all_easy_preds_enriched.csv"
hard_path = "chem_map_all_hard_preds_enriched.csv"

easy_df = pd.read_csv(easy_path)
hard_df = pd.read_csv(hard_path)

easy_df["label"] = 1
hard_df["label"] = 0

df = pd.concat([easy_df, hard_df], ignore_index=True)

df.head()

Unnamed: 0,dataset,datapoint,sequence,model,prediction,score,sequence_length,gc_content,sequence_entropy,mfe,...,rate_of_bps_predicted,hairpin_count,junction_count,helix_count,singlestrand_count,mway_junction_count,AU_pairs_in_helix_terminal_ends,helices_with_reverse_complement,hairpins_with_gt4_unpaired_nts,label
0,EternaData,ETERNA_R00_0000_ANNOTATION_1540,GGAAAAAAGGGUUGAUACGAUCGCUUGAUCCUGAAGGAAGCUUCAG...,ContextFold,..........(((((((....((((((...((((((....))))))...,0.953595,107,0.411215,0.953882,-36.9,...,0.485981,2,2,4,3,0,0.5,0.75,1.0,1
1,EternaData,ETERNA_R00_0000_ANNOTATION_1540,GGAAAAAAGGGUUGAUACGAUCGCUUGAUCCUGAAGGAAGCUUCAG...,ContraFold,..........((((((((((.((((((.(.((((((....))))))...,0.960087,107,0.411215,0.953882,-36.9,...,0.560748,2,4,6,3,0,0.5,0.833333,1.0,1
2,EternaData,ETERNA_R00_0000_ANNOTATION_1540,GGAAAAAAGGGUUGAUACGAUCGCUUGAUCCUGAAGGAAGCUUCAG...,EternaFold,..........((((((((((.((((((.(.((((((....))))))...,0.960087,107,0.411215,0.953882,-36.9,...,0.560748,2,4,6,3,0,0.5,0.833333,1.0,1
3,EternaData,ETERNA_R00_0000_ANNOTATION_1540,GGAAAAAAGGGUUGAUACGAUCGCUUGAUCCUGAAGGAAGCUUCAG...,IPKnot,..........((((((((...((((((...((((((....))))))...,0.963431,107,0.411215,0.953882,-36.9,...,0.504673,2,2,4,3,0,0.25,0.75,1.0,1
4,EternaData,ETERNA_R00_0000_ANNOTATION_1540,GGAAAAAAGGGUUGAUACGAUCGCUUGAUCCUGAAGGAAGCUUCAG...,MXFold,(.........((((((((...((((((...((((((....))))))...,0.948925,107,0.411215,0.953882,-36.9,...,0.523364,2,3,5,0,1,0.2,0.8,1.0,1


In [16]:
feature_cols = [
    # "score", # Not sure I want this
    "sequence_length",
    "gc_content",
    "sequence_entropy",
    "mfe",
    "ens_def",
    "longest_sequential_A",
    "longest_sequential_C",
    "longest_sequential_U",
    "longest_sequential_G",
    "longest_GC_helix",
    "GU_pairs",
    "rate_of_bps_predicted",
    "hairpin_count",
    "junction_count",
    "helix_count",
    "singlestrand_count",
    "mway_junction_count",
    "AU_pairs_in_helix_terminal_ends",
    "helices_with_reverse_complement",
    "hairpins_with_gt4_unpaired_nts",
]

for fc in feature_cols:
    if fc not in df.columns:
        print(f"{fc} not in feature columns")

In [17]:
# Ensure labels are 0 or 1
df["label"] = df["label"].astype(int)
assert set(df["label"].unique()) <= {0, 1}, "Labels must be 0/1 only."

In [18]:
const_cols = []
for col in feature_cols:
    if df[col].nunique() <= 1:
        const_cols.append(col)

if const_cols:
    print("Dropping constant feature columns:", const_cols)
    feature_cols = [c for c in feature_cols if c not in const_cols]

X = df[feature_cols].values.astype(np.float32)
y = df["label"].values.astype(np.int64)

# Quick sanity check
print("Any NaN in X?", np.isnan(X).any())
print("Any inf in X?", np.isinf(X).any())

Any NaN in X? False
Any inf in X? False


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Double-check after scaling
print("Any NaN in X_train_scaled?", np.isnan(X_train_scaled).any())
print("Any inf in X_train_scaled?", np.isinf(X_train_scaled).any())

Any NaN in X_train_scaled? False
Any inf in X_train_scaled? False


In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class RNAPredictionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).long()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_dataset = RNAPredictionDataset(X_train_scaled, y_train)
val_dataset   = RNAPredictionDataset(X_val_scaled, y_val)
test_dataset  = RNAPredictionDataset(X_test_scaled, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=256, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [22]:
class QualityClassifier(nn.Module):
    def __init__(self, in_features: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),   # single logit
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = QualityClassifier(in_features=len(feature_cols)).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [23]:
def run_epoch(loader, model, optimizer=None):
    if optimizer is None:
        model.eval()
    else:
        model.train()

    total_loss = 0.0
    total_correct = 0
    total_examples = 0

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device).float()  # must be float for BCEWithLogitsLoss

        logits = model(X_batch)
        loss = criterion(logits, y_batch)

        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += float(loss.item()) * X_batch.size(0)

        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).long()
        total_correct += (preds == y_batch.long()).sum().item()
        total_examples += X_batch.size(0)

    avg_loss = total_loss / total_examples
    avg_acc  = total_correct / total_examples
    return avg_loss, avg_acc

In [24]:
n_epochs = 20

for epoch in range(1, n_epochs + 1):
    train_loss, train_acc = run_epoch(train_loader, model, optimizer)
    val_loss, val_acc     = run_epoch(val_loader, model, optimizer=None)

    print(
        f"Epoch {epoch:02d} | "
        f"train loss: {train_loss:.4f}, train acc: {train_acc:.3f} | "
        f"val loss: {val_loss:.4f}, val acc: {val_acc:.3f}"
    )

Epoch 01 | train loss: 0.2468, train acc: 0.893 | val loss: 0.1627, val acc: 0.936
Epoch 02 | train loss: 0.1462, train acc: 0.945 | val loss: 0.1298, val acc: 0.951
Epoch 03 | train loss: 0.1193, train acc: 0.956 | val loss: 0.1191, val acc: 0.953
Epoch 04 | train loss: 0.1036, train acc: 0.963 | val loss: 0.1026, val acc: 0.965
Epoch 05 | train loss: 0.0937, train acc: 0.967 | val loss: 0.0956, val acc: 0.967
Epoch 06 | train loss: 0.0866, train acc: 0.970 | val loss: 0.0907, val acc: 0.969
Epoch 07 | train loss: 0.0812, train acc: 0.972 | val loss: 0.0870, val acc: 0.971
Epoch 08 | train loss: 0.0753, train acc: 0.973 | val loss: 0.0815, val acc: 0.974
Epoch 09 | train loss: 0.0710, train acc: 0.976 | val loss: 0.0782, val acc: 0.974
Epoch 10 | train loss: 0.0679, train acc: 0.976 | val loss: 0.0745, val acc: 0.977
Epoch 11 | train loss: 0.0648, train acc: 0.977 | val loss: 0.0744, val acc: 0.975
Epoch 12 | train loss: 0.0629, train acc: 0.978 | val loss: 0.0697, val acc: 0.978
Epoc

In [25]:
test_loss, test_acc = run_epoch(test_loader, model, optimizer=None)
print(f"Test loss: {test_loss:.4f}, Test acc: {test_acc:.3f}")

Test loss: 0.0471, Test acc: 0.984


In [26]:
test_loader

<torch.utils.data.dataloader.DataLoader at 0x2e1f3903750>

In [27]:
for tl in test_loader:
    print(tl)

[tensor([[-0.3316,  1.0142,  0.6632,  ..., -0.1706,  0.7511,  0.3233],
        [-0.3875, -1.1903, -0.1594,  ...,  1.0795, -0.6394,  0.3233],
        [-0.0241,  0.6765,  0.3724,  ..., -0.4206,  0.7511,  0.3233],
        ...,
        [-0.4434, -1.6303, -1.2750,  ...,  1.9130, -2.7251,  0.3233],
        [-0.3595, -0.7420, -0.1901,  ..., -0.5873,  0.7511,  0.3233],
        [ 0.2834,  0.1387,  0.6547,  ..., -0.5873,  0.7511,  0.3233]]), tensor([1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
        1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,
        0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
    