# Compound Classification Challenge
- input: fingerprint
- model: MLP

In [None]:
!pip install wandb

In [None]:
!pip install rdkit-pypi

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import wandb

import rdkit.Chem as Chem
import rdkit.Chem.AllChem as AllChem

from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm

import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score

# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'{device} is available')

# Log in to Weights & Biases; the training cells below log metrics there.
wandb.login()

cuda:0 is available


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Connect Google Drive to the Colab runtime
from google.colab import drive
drive.mount('/gdrive')

# Directory on the mounted drive: cmpd.csv is read from here and model checkpoints are saved here.
data_dir = '/gdrive/My Drive/data/compound/'

Mounted at /gdrive


# Data 로드 및 탐색

Let's load the compound data file.

In [None]:
cmpd_df = pd.read_csv(data_dir+'cmpd.csv')

# Parse every SMILES string into an RDKit molecule. MolFromSmiles returns None
# for strings it cannot parse, and AddHs would then fail with an opaque error,
# so report the offending rows explicitly before going further.
parsed_mols = cmpd_df.smiles.apply(Chem.MolFromSmiles)
invalid_mask = parsed_mols.isna()
if invalid_mask.any():
    bad_examples = cmpd_df.loc[invalid_mask, 'smiles'].head().tolist()
    raise ValueError(
        f'{int(invalid_mask.sum())} SMILES could not be parsed by RDKit, e.g. {bad_examples}'
    )

# Add explicit hydrogens to each parsed molecule.
cmpd_df['mol'] = parsed_mols.apply(Chem.AddHs)

cmpd_df.head()

Unnamed: 0,inchikey,smiles,group,activity,mol
0,FNHKPVJBJVTLMP-UHFFFAOYSA-N,CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)...,train,active,<rdkit.Chem.rdchem.Mol object at 0x7ff6657310d0>
1,CUDVHEFYRIWYQD-UHFFFAOYSA-N,CNC(=O)c1cccc2cc(Oc3ccnc4cc(OCC5(N)CC5)c(OC)cc...,train,active,<rdkit.Chem.rdchem.Mol object at 0x7ff665731120>
2,TTZSNFLLYPYKIL-UHFFFAOYSA-N,Cc1cc2cc(Oc3ccnc(Nc4cccc(CS(=O)(=O)NCCN(C)C)c4...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7ff6657315d0>
3,UOVCGJXDGOGOCZ-UHFFFAOYSA-N,COc1cc2c(cc1F)C(c1ccccc1Cl)=Nc1c(n[nH]c1C)N2,train,active,<rdkit.Chem.rdchem.Mol object at 0x7ff665731a30>
4,CUIHSIWYWATEQL-UHFFFAOYSA-N,Cc1ccc(Nc2nccc(N(C)c3ccc4c(C)n(C)nc4c3)n2)cc1S...,test,active,<rdkit.Chem.rdchem.Mol object at 0x7ff6657319e0>


# dataset, dataloader 준비

In [None]:
def get_Xy(df, y_dim=None, radius=4, n_bits=2048):
    """Build the fingerprint feature matrix and binary label vector for a compound table.

    Args:
        df: DataFrame with a ``mol`` column of RDKit molecules and an
            ``activity`` column of strings.
        y_dim: Unused. Kept (now optional) so existing calls that pass it keep working.
        radius: Radius passed to the Morgan fingerprint (default 4, the value
            that used to be hard-coded).
        n_bits: Length of the fingerprint bit vector (default 2048, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(X, y)``. ``X`` is a 2-D integer array with one ``n_bits``-long
        row of 0/1 values per compound. ``y`` is a 1-D float array holding 1.0
        where ``activity == 'active'`` and 0.0 everywhere else.
    """
    fingerprints = [
        list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))
        for mol in df.mol
    ]
    if fingerprints:
        X = np.vstack(fingerprints)
    else:
        # np.vstack raises on an empty sequence; return a well-shaped empty matrix instead.
        X = np.empty((0, n_bits), dtype=int)
    y = df.activity.eq('active').astype(float).to_numpy()
    return X, y

In [None]:
# Split on the predefined `group` column and featurize each split (get_Xy accepts y_dim but does not use it).
X_train, y_train = get_Xy(cmpd_df[cmpd_df.group.eq('train')], y_dim=2)
X_test, y_test = get_Xy(cmpd_df[cmpd_df.group.eq('test')], y_dim=2)

In [None]:
# Convert the NumPy arrays to float32 tensors
train_X, train_Y = (torch.from_numpy(arr).float() for arr in (X_train, y_train))
test_X, test_Y = (torch.from_numpy(arr).float() for arr in (X_test, y_test))

# The training set pairs features with labels; the test set wraps features only.
train_data = TensorDataset(train_X, train_Y)
test_data = TensorDataset(test_X)

# 모델 만들기
- 모델구조
  - 3 fully connected layers
  - batchnorm
  - relu
  - dropout (p=0.1)

In [None]:
class MLP(nn.Module):
    """Three-layer fully connected binary classifier that emits one raw logit per sample.

    Layout: Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> BatchNorm ->
    Dropout -> Linear. The output is NOT passed through an activation.

    Args:
        in_features: Width of the input vectors. When omitted, falls back to
            ``train_X.shape[1]`` from the notebook's global scope, so the
            original zero-argument ``MLP()`` call keeps working.
    """

    def __init__(self, in_features=None):
        super(MLP, self).__init__()

        if in_features is None:
            in_features = train_X.shape[1]

        self.fc1 = nn.Linear(in_features, 64, bias=True)
        self.fc2 = nn.Linear(64, 64, bias=True)
        self.fc3 = nn.Linear(64, 1)

        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Return a 1-D tensor of logits, one per row of ``x``."""
        x = F.relu(self.fc1(x))
        x = self.batchnorm1(x)
        x = F.relu(self.fc2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        # No activation here: the training loss (BCEWithLogitsLoss) and the
        # evaluation code apply the sigmoid themselves. A ReLU on this layer
        # would clamp every logit to >= 0, so the predicted probability could
        # never drop below 0.5.
        x = self.fc3(x)
        return x.squeeze(1)

In [None]:
def reinitialize_weights(m):
  """
  Re-initialize a module's weights with Xavier-normal and set its bias to 0.

  Intended to be passed to ``nn.Module.apply``. Only ``nn.Conv2d`` and
  ``nn.Linear`` modules are touched; every other module type is left as is.
  Layers built with ``bias=False`` are supported (their missing bias is skipped).
  """
  if isinstance(m, (nn.Conv2d, nn.Linear)):
    nn.init.xavier_normal_(m.weight.data)
    # bias is None when the layer was constructed with bias=False
    if m.bias is not None:
      m.bias.data.fill_(0)

def calculate_acc(y_pred, y_test):
  """
  Accuracy of one batch, as a percentage rounded to a whole number.

  ``y_pred`` holds logits: they are passed through a sigmoid and rounded to
  0/1 labels, which are then compared element-wise with ``y_test``. The
  number of matches is divided by ``y_test.shape[0]``. Returns a 0-dim tensor.
  """
  predicted_labels = torch.sigmoid(y_pred).round()
  n_correct = predicted_labels.eq(y_test).float().sum()
  fraction_correct = n_correct / y_test.shape[0]
  return torch.round(fraction_correct * 100)

# Train

In [None]:
def train(model, train_loader, criterion, optimizer, config):
  """
  Train ``model`` for ``config.epochs`` epochs and log per-epoch averages to wandb.

  Args:
    model: Module whose forward pass returns one logit per sample.
    train_loader: Iterable of ``(X_batch, y_batch)`` pairs that supports ``len()``.
    criterion: Loss called as ``criterion(y_pred, y_batch)``.
    optimizer: Optimizer already bound to ``model``'s parameters.
    config: Object exposing an ``epochs`` attribute.

  Returns:
    List with the average loss of every 10th epoch.

  Raises:
    ValueError: if ``train_loader`` yields no batches.
  """
  n_batches = len(train_loader)
  if n_batches == 0:
    # Without any batch there is nothing to average; fail up front with a clear message.
    raise ValueError('train_loader yields no batches')

  wandb.watch(model, criterion, log="all", log_freq=10)
  losses = []

  model.train()
  for epoch in tqdm(range(1, config.epochs+1)):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
      X_batch, y_batch = X_batch.to(device), y_batch.to(device)
      optimizer.zero_grad()

      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)

      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      # detach: the accuracy is a metric only and must not extend the autograd graph
      epoch_acc += calculate_acc(y_pred.detach(), y_batch).item()

    # Average over the batches of this epoch and log once per epoch, rather
    # than logging partial running averages on every batch at the same step.
    avg_loss = epoch_loss / n_batches
    avg_acc = epoch_acc / n_batches
    wandb.log({'acc': avg_acc, 'loss': avg_loss}, step=epoch)

    if epoch % 10 == 0:
      losses.append(avg_loss)
      print(f'Epoch {epoch+0:03}: | Loss: {avg_loss:.5f} | Acc: {avg_acc:.3f}')

  return losses

In [None]:
def run(data, config):
  """
  Start a wandb run, build an MLP, train it on ``data`` and return it.

  Args:
    data: Dataset handed to the training ``DataLoader``.
    config: Hyperparameters passed to ``wandb.init``; ``batch_size``,
      ``learning_rate`` and (inside ``train``) ``epochs`` are read back
      from ``wandb.config``.

  Returns:
    The trained model, already moved to ``device``.
  """
  wandb.init(config=config)

  # Read hyperparameters back through wandb's config object (attribute access).
  run_config = wandb.config

  # Shuffled mini-batches over the training data
  loader = DataLoader(data, shuffle=True, batch_size=run_config.batch_size)

  net = MLP()
  net.apply(reinitialize_weights)
  net.to(device)
  print(net)

  loss_fn = nn.BCEWithLogitsLoss()
  sgd = optim.SGD(net.parameters(), momentum=0.9, lr=run_config.learning_rate)

  train(net, loader, loss_fn, sgd, run_config)
  return net

In [None]:
# Hyperparameter config
config = {
    'epochs': 100,  # number of passes over the training loader (read in train)
    'batch_size': 32,  # read in run() for the training DataLoader
    'learning_rate': 0.05,  # read in run() for the SGD optimizer
    'dataset': 'compound_df',  # not read by the code in this notebook; handed to wandb.init with the rest
    'architecture': 'MLP',  # not read by the code in this notebook; handed to wandb.init with the rest
    }

In [None]:
# Train an MLP on the training set; run() returns the trained model.
model = run(train_data, config=config)

[34m[1mwandb[0m: Currently logged in as: [33mrimiiii[0m (use `wandb login --relogin` to force relogin)


MLP(
  (fc1): Linear(in_features=2048, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 010: | Loss: 0.43334 | Acc: 86.544
Epoch 020: | Loss: 0.40034 | Acc: 90.048
Epoch 030: | Loss: 0.38709 | Acc: 90.656
Epoch 040: | Loss: 0.38275 | Acc: 91.672
Epoch 050: | Loss: 0.37587 | Acc: 92.376
Epoch 060: | Loss: 0.38713 | Acc: 92.616
Epoch 070: | Loss: 0.37102 | Acc: 93.144
Epoch 080: | Loss: 0.36968 | Acc: 93.544
Epoch 090: | Loss: 0.36903 | Acc: 93.664
Epoch 100: | Loss: 0.36165 | Acc: 94.688


In [None]:
# Save the model (only the state_dict, i.e. parameters and buffers, is written, not the module object)
file_name = 'MLP_netv3.pth'
torch.save(model.state_dict(), data_dir+file_name)

In [None]:
# Load the model
file_name = 'MLP_netv3.pth'
net = MLP()
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written from a GPU session still loads on a CPU-only runtime.
net.load_state_dict(torch.load(data_dir+file_name, map_location=device))
# Put the reloaded model in eval mode (dropout off, batchnorm uses running
# statistics). .eval() returns the module, so the cell still displays it.
net.to(device).eval()

MLP(
  (fc1): Linear(in_features=2048, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

# Test

In [None]:
# Evaluate in mini-batches rather than one compound per forward pass. shuffle stays
# off so predictions come out in the same order as the test labels.
test_loader = DataLoader(test_data, batch_size=256, shuffle=False)

In [None]:
# Evaluate performance on the test data
batch_probs = []

# Score the checkpoint reloaded into `net`, so this section can be run from the
# saved weights and does not depend on the in-session training result.
net.eval()
with torch.no_grad():
    # test_data wraps only the features, so each batch is a one-element sequence.
    for (X_batch,) in test_loader:
        X_batch = X_batch.to(device)
        # The model returns logits; sigmoid turns them into probabilities.
        probs = torch.sigmoid(net(X_batch))
        batch_probs.append(probs.cpu().numpy())

# Flatten the per-batch arrays into one 1-D array of predicted probabilities.
y_pred_list = np.hstack(batch_probs)

In [None]:
# ROC-AUC score on the test set: true labels vs. the predicted probabilities collected above
roc_auc_score(y_test, y_pred_list)

0.8426955566362277