In [1]:
import math
from pathlib import Path
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import KFold

from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from NegativeClassOptimization import ml

ModuleNotFoundError: No module named 'captum'

In [14]:
class CNN(nn.Module):
    """Small two-stage convolutional binary classifier for single-channel 2-D inputs.

    Pipeline: conv1 -> ReLU -> 2x2 max-pool -> conv2 -> ReLU -> 2x2 max-pool
    -> flatten -> fc1 (10 units) -> ReLU -> fc2 (1 unit) -> optional sigmoid.

    Inputs to ``forward`` must have shape ``(batch, 1, input_height, input_width)``.

    Args:
        conv1_num_filters (int): output channels of the first convolution.
        conv1_filter_size (int): square kernel size of the first convolution.
        conv2_num_filters (int): output channels of the second convolution.
        conv2_filter_size (int): square kernel size of the second convolution.
        input_height (int): size of the third input axis (default 67).
        input_width (int): size of the fourth input axis (default 100).

    Raises:
        ValueError: if the input is too small for the requested kernel sizes,
            i.e. a spatial axis would shrink below 1 before the dense layers.
    """

    def __init__(
        self,
        conv1_num_filters=5,
        conv1_filter_size=3,
        conv2_num_filters=3,
        conv2_filter_size=3,
        input_height=67,
        input_width=100,
    ):
        super().__init__()

        # Remembered so that compute_metrics_closed_testset can reshape flat inputs.
        self.input_height = input_height
        self.input_width = input_width

        # Output sizes follow the standard formulas, see e.g.
        # https://madebyollin.github.io/convnet-calculator/
        self.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=conv1_num_filters,
            kernel_size=conv1_filter_size,
        )
        # The same pooling layer (no parameters) is applied after both convolutions.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(
            in_channels=conv1_num_filters,
            out_channels=conv2_num_filters,
            kernel_size=conv2_filter_size,
        )

        out_h = self._axis_size_after_stages(
            input_height, (conv1_filter_size, conv2_filter_size)
        )
        out_w = self._axis_size_after_stages(
            input_width, (conv1_filter_size, conv2_filter_size)
        )
        fc1_in_features = out_h * out_w * conv2_num_filters

        self.fc1 = nn.Linear(fc1_in_features, 10)
        self.fc2 = nn.Linear(10, 1)
        self.sigmoid = nn.Sigmoid()

    @staticmethod
    def _axis_size_after_stages(size, kernel_sizes):
        """Length of one spatial axis after each (conv, 2x2 max-pool) stage.

        Convolutions are unpadded with stride 1 (``size - k + 1``); pooling uses
        kernel 2 / stride 2 with floor rounding (``(size - 2) // 2 + 1``).
        """
        initial = size
        for kernel_size in kernel_sizes:
            size = size - kernel_size + 1  # convolution
            size = (size - 2) // 2 + 1  # max-pool
            if size < 1:
                raise ValueError(
                    f"Input axis of length {initial} is too small for "
                    f"kernel sizes {tuple(kernel_sizes)}."
                )
        return size

    def forward(self, x, forward_logits=False):
        """Compute predictions for a batch.

        Args:
            x (torch.Tensor): batch of shape ``(batch, 1, input_height, input_width)``.
            forward_logits (bool): if True, return the raw output of ``fc2``;
                otherwise return its sigmoid.

        Returns:
            torch.Tensor: tensor of shape ``(batch, 1)``.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        if forward_logits:
            return x
        return self.sigmoid(x)

    def forward_logits(self, x):
        """Return the pre-sigmoid output of the network for ``x``."""
        return self.forward(x, forward_logits=True)

    def compute_metrics_closed_testset(self, x_test, y_test):
        """Reshape ``x_test`` to the network's input layout and delegate to ``ml.SN10``.

        ``x_test`` is reshaped (not transposed) to
        ``(-1, 1, input_height, input_width)`` before being passed, together with
        this model and ``y_test``, to
        ``ml.SN10.compute_metrics_closed_testset_static``, whose result is returned.
        """
        x_test_cnn = x_test.reshape((-1, 1, self.input_height, self.input_width))
        return ml.SN10.compute_metrics_closed_testset_static(self, x_test_cnn, y_test)


In [15]:
def construct_optimizer(
    optimizer_type,
    learning_rate,
    momentum,
    weight_decay,
    model,
) -> torch.optim.Optimizer:
    """Build a torch optimizer over ``model.parameters()``.

    Args:
        optimizer_type: ``"SGD"`` or ``"Adam"``.
        learning_rate: passed as ``lr``.
        momentum: SGD momentum; for Adam it is used as ``beta1``
            (``beta2`` is fixed at 0.999).
        weight_decay: passed as ``weight_decay``.
        model: module whose parameters are optimized.

    Returns:
        The configured optimizer.

    Raises:
        ValueError: if ``optimizer_type`` is neither ``"SGD"`` nor ``"Adam"``.
    """
    shared_kwargs = {"lr": learning_rate, "weight_decay": weight_decay}

    if optimizer_type == "SGD":
        return torch.optim.SGD(
            model.parameters(), momentum=momentum, **shared_kwargs
        )

    if optimizer_type == "Adam":
        # beta1 plays the role of momentum in Adam.
        return torch.optim.Adam(
            model.parameters(), betas=(momentum, 0.999), **shared_kwargs
        )

    raise ValueError(f"optimizer_type `{optimizer_type}` not recognized.")


In [25]:
def train_loop(loader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``loader``.

    Every batch ``X`` is reshaped to ``(-1, 1, 67, 100)`` before the forward
    pass, and the optimizer takes one step per batch.

    Args:
        loader: iterable of ``(X, y)`` batches, e.g. a ``DataLoader``.
        model (nn.Module): network to train; switched to training mode.
        loss_fn (Callable): called as ``loss_fn(prediction, y)``.
        optimizer (torch.optim.Optimizer): optimizer over the model parameters.

    Returns:
        list[float]: the loss of every 100th batch (batches 0, 100, 200, ...).
    """
    # Make sure layers such as dropout / batch-norm are in training mode even
    # if an evaluation step left the model in eval mode.
    model.train()

    losses = []
    for batch_idx, (X, y) in enumerate(loader):
        y_pred = model(X.reshape(-1, 1, 67, 100))
        loss = loss_fn(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Only a sparse sample of the losses is recorded.
        if batch_idx % 100 == 0:
            losses.append(loss.item())
    return losses

def train_for_ndb1(
    epochs,
    learning_rate,
    train_loader,
    model,
    optimizer_type: str,
    momentum: float = 0,
    weight_decay: float = 0,
    callback_on_model_end_epoch: callable = None,
):
    """Train ``model`` with binary cross-entropy for a number of epochs.

    Args:
        epochs: number of passes over ``train_loader``.
        learning_rate: forwarded to ``construct_optimizer``.
        train_loader: loader handed to ``train_loop`` each epoch.
        model: network to train; its outputs are fed to ``nn.BCELoss``.
        optimizer_type: forwarded to ``construct_optimizer``.
        momentum: forwarded to ``construct_optimizer``.
        weight_decay: forwarded to ``construct_optimizer``.
        callback_on_model_end_epoch: optional ``f(model, epoch_index)`` invoked
            after each epoch.

    Returns:
        list[dict]: one ``{"train_losses": [...]}`` entry per epoch, holding the
        list returned by ``train_loop``.
    """
    criterion = nn.BCELoss()
    optimizer = construct_optimizer(
        optimizer_type, learning_rate, momentum, weight_decay, model
    )
    run_callback = callback_on_model_end_epoch is not None

    history = []
    for epoch in range(epochs):
        epoch_losses = train_loop(train_loader, model, criterion, optimizer)
        history.append({"train_losses": epoch_losses})

        if run_callback:
            callback_on_model_end_epoch(model, epoch)

    return history

In [26]:
# SMILES character -> 1-based column index (values cover 1..67).
# The column actually set in the one-hot matrix is ``value - 1``.
SMISET = {
    "C": 67, "l": 1, ".": 2, "c": 3, "1": 4, "2": 5, "(": 6,
    "N": 7, "=": 8, "3": 9, ")": 10, "n": 11, "[": 12, "H": 13,
    "]": 14, "O": 15, "@": 16, "s": 17, "+": 18, "/": 19, "S": 20,
    "F": 21, "-": 22, "4": 23, "B": 24, "r": 25, "o": 26, "\\": 27,
    "#": 28, "5": 29, "a": 30, "P": 31, "e": 32, "6": 33, "7": 34,
    "I": 35, "A": 36, "i": 37, "8": 38, "9": 39, "Z": 40, "K": 41,
    "L": 42, "%": 43, "0": 44, "T": 45, "g": 46, "G": 47, "d": 48,
    "M": 49, "b": 50, "u": 51, "t": 52, "R": 53, "p": 54, "m": 55,
    "W": 56, "Y": 57, "V": 58, "~": 59, "U": 60, "E": 61, "f": 62,
    "X": 63, "D": 64, "y": 65, "h": 66,
}


def one_hot_matrix_smiles(line, MAX_SMI_LEN=100):
    """Encode a SMILES string as a one-hot matrix.

    Row ``i`` describes character ``i`` of ``line``; column ``SMISET[ch] - 1``
    is set to 1 for that character. Strings longer than ``MAX_SMI_LEN`` are
    truncated, shorter ones leave the remaining rows at zero. Characters that
    are not in ``SMISET`` are reported on stdout and leave their row at zero.

    Args:
        line (str): SMILES string.
        MAX_SMI_LEN (int): number of rows of the result.

    Returns:
        np.ndarray: float array of shape ``(MAX_SMI_LEN, len(SMISET))``,
        i.e. ``(100, 67)`` with the defaults.

    Raises:
        TypeError: if ``line`` is not a string (e.g. a missing value read from
            a DataFrame). Previously this printed a message and then typically
            failed with a less helpful TypeError when slicing.
    """
    if not isinstance(line, str):
        raise TypeError(
            f"SMILE format is not str! Got {type(line).__name__}: {line!r}"
        )

    X = np.zeros((MAX_SMI_LEN, len(SMISET)))
    for i, ch in enumerate(line[:MAX_SMI_LEN]):
        column = SMISET.get(ch)
        # Compare against None explicitly so an index of 0 could never be
        # mistaken for "character unknown".
        if column is not None:
            X[i, column - 1] = 1
        else:
            print(line,'exits not in SMISET character',ch)
    return X

In [27]:
def load_custom_data(df, X_col='X', y_col='Y_binary'):
    """Turn two DataFrame columns into a ``TensorDataset``.

    Args:
        df: DataFrame holding the samples.
        X_col: column with the per-sample feature arrays.
        y_col: column with the per-sample labels.

    Returns:
        TensorDataset: float32 features stacked along a new first axis, paired
        with float32 labels viewed as a column of shape ``(n, 1)``.
    """
    # Convert both columns to numpy first, then to float32 tensors.
    feature_array, label_array = (
        np.array(df[column].tolist()) for column in (X_col, y_col)
    )
    features = torch.tensor(feature_array, dtype=torch.float32)
    labels = torch.tensor(label_array, dtype=torch.float32)
    return TensorDataset(features, labels.view(-1, 1))

In [28]:
# Start with a single dataset: one target, one task, one split.
target = "P06239"
task = "vs_Weak"
split = 0

# Layout: ./data/processed/<target>/<task>/split_<split>/{train,test}.pkl
path_to_target = Path('./data/processed', target)
path_vs_task_split = path_to_target.joinpath(task, f'split_{split}')
path_to_train = path_vs_task_split.joinpath('train.pkl')
path_to_test = path_vs_task_split.joinpath('test.pkl')

In [29]:
# Load the train / test frames of the selected split.
# NOTE: read_pickle unpickles the file, so only load files from a trusted source.
df_train, df_test = (
    pd.read_pickle(pickle_path) for pickle_path in (path_to_train, path_to_test)
)

In [30]:
# Add an "X_matrix" column holding the one-hot encoding of each "Drug" SMILES
# string (both frames are modified in place).
for _frame in (df_train, df_test):
    _frame["X_matrix"] = _frame["Drug"].apply(one_hot_matrix_smiles)

In [31]:
# Wrap the encoded frames as tensor datasets.
dataset_train, dataset_test = (
    load_custom_data(frame, X_col="X_matrix") for frame in (df_train, df_test)
)

# Mini-batches of 8 samples; only the training data is shuffled.
train_loader = DataLoader(dataset=dataset_train, shuffle=True, batch_size=8)
test_loader = DataLoader(dataset=dataset_test, shuffle=False, batch_size=8)


In [34]:
# Hyper-parameters for this run.
SEED = 0
epochs = 20
learning_rate = 0.001
optimizer_type = "Adam"
momentum = 0.9  # used as Adam's beta1 by construct_optimizer

# Seed torch's global RNG before creating the model so that weight
# initialisation and the shuffling of the training loader start from a fixed
# state and the run can be reproduced.
torch.manual_seed(SEED)
model = CNN()

train_output = train_for_ndb1(
    epochs=epochs,
    learning_rate=learning_rate,
    train_loader=train_loader,
    model=model,
    optimizer_type=optimizer_type,
    momentum=momentum,
)

In [37]:
# Metrics recorded for the first epoch (index 0): a dict with the key
# "train_losses" holding the losses sampled by train_loop (every 100th batch).
train_output[0]

{'train_losses': [0.6677110195159912]}

In [None]:
def test_loop(loader, model, loss_fn) -> dict:
    """Evaluate ``model`` on ``loader``.

    Args:
        loader (DataLoader): batches of ``(X, y)`` to evaluate on.
        model (nn.Module): model under evaluation.
        loss_fn (Callable): loss passed on to ``compute_avg_test_loss``.

    Returns:
        dict: ``{"test_loss": <average loss>}`` merged with the metrics returned
        by ``compute_metrics_closed_testset``; on a key collision the latter wins.
    """
    test_loss = compute_avg_test_loss(loader, model, loss_fn)

    # NOTE(review): Xy_from_loader and compute_metrics_closed_testset are not
    # defined in the visible part of this notebook - confirm they are in scope.
    x_test, y_test = Xy_from_loader(loader=loader)
    closed_metrics: dict = compute_metrics_closed_testset(model, x_test, y_test)

    return {
        "test_loss": test_loss,
        **closed_metrics,
    }

def compute_avg_test_loss(loader, model, loss_fn):
    """Average the per-batch loss over ``loader`` with gradients disabled.

    The result is the sum of ``compute_loss(model, loss_fn, X, y).item()`` over
    all batches divided by ``len(loader)`` (every batch has equal weight).
    """
    batch_count = len(loader)
    with torch.no_grad():
        total_loss = sum(
            compute_loss(model, loss_fn, X, y).item() for X, y in loader
        )
    return total_loss / batch_count