In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score,
    matthews_corrcoef, confusion_matrix, accuracy_score)

In [11]:
data = pd.read_csv(r'C:\Users\User\PROJECTS\chem_ai_project\data\strong_or_weak_promoter_nt_embeddings.csv')
data['label'].value_counts()

label
0    1791
1    1591
Name: count, dtype: int64

In [12]:
X = data.iloc[:, 2:].values  
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [13]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)  

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [14]:
class DNK_MLP(nn.Module):
    def __init__(self, input_dim):
        super(DNK_MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),      
        )
        
    def forward(self, x):
        return self.model(x)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DNK_MLP(input_dim=1024).to(device)

criterion = nn.BCEWithLogitsLoss()  
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [24]:
n_epochs = 20

for epoch in range(n_epochs):
    model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device).unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X = batch_X.to(device)
            outputs = model(batch_X)
            preds = torch.sigmoid(outputs).cpu().numpy() > 0.5
            all_preds.extend(preds.flatten())
            all_targets.extend(batch_y.numpy())

    acc = accuracy_score(all_targets, all_preds)
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {acc:.4f}")

y_true = np.array(all_targets).astype(int)
y_pred = np.array(all_preds).astype(int)

Epoch 1/20, Loss: 12.3063, Accuracy: 0.5716
Epoch 2/20, Loss: 11.9680, Accuracy: 0.5820
Epoch 3/20, Loss: 12.3260, Accuracy: 0.5716
Epoch 4/20, Loss: 12.3862, Accuracy: 0.5583
Epoch 5/20, Loss: 13.9599, Accuracy: 0.5465
Epoch 6/20, Loss: 11.9304, Accuracy: 0.5524
Epoch 7/20, Loss: 12.2819, Accuracy: 0.5702
Epoch 8/20, Loss: 12.3722, Accuracy: 0.5613
Epoch 9/20, Loss: 11.5881, Accuracy: 0.5554
Epoch 10/20, Loss: 12.5871, Accuracy: 0.5391
Epoch 11/20, Loss: 13.1193, Accuracy: 0.5524
Epoch 12/20, Loss: 12.0676, Accuracy: 0.5687
Epoch 13/20, Loss: 11.1775, Accuracy: 0.5805
Epoch 14/20, Loss: 11.4144, Accuracy: 0.5702
Epoch 15/20, Loss: 10.1369, Accuracy: 0.5731
Epoch 16/20, Loss: 11.1474, Accuracy: 0.5805
Epoch 17/20, Loss: 11.1800, Accuracy: 0.5583
Epoch 18/20, Loss: 11.0115, Accuracy: 0.5554
Epoch 19/20, Loss: 9.9570, Accuracy: 0.5702
Epoch 20/20, Loss: 10.8983, Accuracy: 0.5672


In [23]:
with torch.no_grad():
    all_probs = []
    for batch_X, _ in test_loader:
        batch_X = batch_X.to(device)
        outputs = model(batch_X)
        probs = torch.sigmoid(outputs).cpu().numpy()
        all_probs.extend(probs.flatten())

all_probs = np.array(all_probs)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

metrics = {
    'roc_auc': roc_auc_score(y_true, all_probs),
    'f1': f1_score(y_true, y_pred),
    'f1_macro': f1_score(y_true, y_pred, average='macro'),
    'f1_micro': f1_score(y_true, y_pred, average='micro'),
    'precision': precision_score(y_true, y_pred),
    'recall': recall_score(y_true, y_pred),
    'mcc': matthews_corrcoef(y_true, y_pred),
    'TN': tn,
    'FP': fp,
    'FN': fn,
    'TP': tp
}

metrics_df = pd.DataFrame([metrics])
metrics_df

Unnamed: 0,roc_auc,f1,f1_macro,f1_micro,precision,recall,mcc,TN,FP,FN,TP
0,0.597256,0.505017,0.556741,0.562777,0.604,0.433908,0.137735,230,99,197,151
