In [34]:
import numpy as np
import pandas as pd

import zipfile

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler

import torch
from torch.utils.data import DataLoader, TensorDataset

from src.model_implementations import split_by_labels, Triple_Net
from src.implementations import *

In [35]:
labeled_data = pd.read_csv('labeled_data_BERT.csv')

# <u><b> Prediction Models: </b></u>

In [36]:
N_SPLIT = 9
KSPLIT = N_SPLIT - 1

In [37]:
GAGPOL_data = labeled_data[labeled_data['Target Name'] == 'Gag-Pol polyprotein [489-587]']

In [38]:
train, test = split_by_labels(GAGPOL_data,N_SPLIT)
y_train, y_test = np.array(train['Ki (nM)'], dtype=np.float64()).reshape(-1, 1), np.array(test['Ki (nM)'], dtype=np.float64()).reshape(-1, 1)
X_train, X_test = train.drop(columns=['Ki (nM)','Ligand SMILES','Labels', 'Target Name'], axis=1), test.drop(columns=['Ki (nM)','Ligand SMILES','Labels', 'Target Name'], axis=1)

scaler = StandardScaler()
pca = PCA(n_components=256)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

y_train_scaled, y_test_scaled = np.log10(y_train), np.log10(y_test)


'DataFrame.swapaxes' is deprecated and will be removed in a future version. Please use 'DataFrame.transpose' instead.



In [78]:
def plot_error_histogram(predictions, y_test, title="Test Error of Random Assignment", xlabel="Absolute Error of Log Ki", save_path=None):
    """
    Function to plot a histogram of the absolute percentage error with log scale on the x-axis.
    
    Parameters:
    - predictions: ndarray
        The predicted values (1D array).
    - y_test: ndarray
        The true values (1D array).
    - title: str, optional
        The title of the plot (default: 'Test Error of Random Assignment').
    - xlabel: str, optional
        The label for the x-axis (default: 'Absolute Percentage Error').
    - save_path: str, optional
        The path to save the figure as an HTML file (default: None, meaning no save).
    """

    error = np.abs(predictions.ravel() - y_test.ravel())

    error = error[np.isfinite(error)]

    fig = px.histogram(
        x=error,
        nbins=100, 
        labels={'x': xlabel},
        title=title
    )

    fig.update_layout(
        title=dict(text=title, x=0.5, font=dict(size=20)),
        xaxis=dict(
            title=xlabel,
            tickfont=dict(size=12)
        ),
        yaxis=dict(
            title='Count',
            tickfont=dict(size=12)
        ),
        font=dict(size=12),
        width=1000,
        height=600
    )

    fig.show()

    if save_path:
        if not save_path.endswith(".html"):
            save_path += ".html" 
        fig.write_html(save_path, full_html=False, include_plotlyjs='cdn')
        print(f"Figure saved to {save_path}")

In [84]:
pred_random = np.random.choice(np.squeeze(y_test_scaled),len(y_test_scaled),replace=False)

TypeError: choice() got an unexpected keyword argument 'random_state'

In [41]:
lm = Ridge(random_state=42)

gridsearch_lm = GridSearchCV(estimator=lm, param_grid={'alpha':np.logspace(-4,0,20)}, scoring='neg_mean_squared_error', cv=KSPLIT) # no shuffling so ok

gridsearch_lm.fit(X_train, y_train_scaled)
pred_LM = gridsearch_lm.predict(X_test)

In [42]:
rf = RandomForestRegressor(n_estimators=100, max_depth=10,random_state=42)
rf.fit(X_train, y_train_scaled.ravel())
pred_RF = rf.predict(X_test)

In [43]:
XGBoost = GradientBoostingRegressor(n_estimators=100, max_depth=10, random_state=42)
XGBoost.fit(X_train, y_train_scaled.ravel())
pred_XGB = XGBoost.predict(X_test)

In [44]:
num_epochs = 200

X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_train_t = torch.tensor(y_train_scaled, dtype=torch.float32)

dataset = TensorDataset(X_train_t, y_train_t)
dataloader = DataLoader(dataset, batch_size=len(X_train_t), shuffle=False)

TN = Triple_Net(input_size=X_train.shape[1], K=4096, dropout_rate=0.0)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(TN.parameters(), lr=2e-3, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer,step_size=50,gamma=0.8)

for epoch in range(num_epochs):
    for batch_idx, (data, target) in enumerate(dataloader):

        optimizer.zero_grad()
        output = TN(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.5f}")
    scheduler.step()

Epoch 1/200, Loss: 4.93900
Epoch 2/200, Loss: 180.48598
Epoch 3/200, Loss: 111.79277
Epoch 4/200, Loss: 36.03146
Epoch 5/200, Loss: 11.82403
Epoch 6/200, Loss: 8.02455
Epoch 7/200, Loss: 8.85591
Epoch 8/200, Loss: 9.27364
Epoch 9/200, Loss: 8.40772
Epoch 10/200, Loss: 7.01667
Epoch 11/200, Loss: 5.92645
Epoch 12/200, Loss: 5.40966
Epoch 13/200, Loss: 5.24398
Epoch 14/200, Loss: 5.12298
Epoch 15/200, Loss: 4.88508
Epoch 16/200, Loss: 4.50616
Epoch 17/200, Loss: 4.04624
Epoch 18/200, Loss: 3.59412
Epoch 19/200, Loss: 3.22706
Epoch 20/200, Loss: 2.98446
Epoch 21/200, Loss: 2.86136
Epoch 22/200, Loss: 2.81665
Epoch 23/200, Loss: 2.79385
Epoch 24/200, Loss: 2.74421
Epoch 25/200, Loss: 2.64109
Epoch 26/200, Loss: 2.48107
Epoch 27/200, Loss: 2.28562
Epoch 28/200, Loss: 2.09009
Epoch 29/200, Loss: 1.92762
Epoch 30/200, Loss: 1.81407
Epoch 31/200, Loss: 1.74665
Epoch 32/200, Loss: 1.71169
Epoch 33/200, Loss: 1.69142
Epoch 34/200, Loss: 1.67099
Epoch 35/200, Loss: 1.64172
Epoch 36/200, Loss: 1.5

In [72]:
TN.eval()
with torch.no_grad():
    pred_TN = TN(X_test_t)
    pred_TN = pred_TN.detach().cpu().numpy()

In [90]:
plot_error_histogram(pred_random, y_test_scaled, title="Test Error of Random Assignment (logscaled)", save_path='plots\\random_test')

Figure saved to plots\random_test.html


In [91]:
plot_error_histogram(pred_LM, y_test_scaled, title="Test Error of Linear Model", save_path='plots\\LM_test')

Figure saved to plots\LM_test.html


In [92]:
plot_error_histogram(pred_RF, y_test_scaled, title="Test Error of Random Forest", save_path='plots\\random_forest_test')

Figure saved to plots\random_forest_test.html


In [93]:
plot_error_histogram(pred_XGB, y_test_scaled, title="Test Error of XGBoost", save_path='plots\\XGB_test')

Figure saved to plots\XGB_test.html


In [94]:
plot_error_histogram(pred_TN, y_test_scaled, title="Test Error of Neural Network", xlabel="Absolute Percentage Error", save_path='plots\\random_test')

Figure saved to plots\random_test.html
