<a href="https://colab.research.google.com/github/gopinathak-geek/novozymes-enzyme-stability-prediction/blob/main/NovoModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Raw training data, read straight from the project's GitHub repository.
train_csv = "https://raw.githubusercontent.com/gopinathak-geek/novozymes-enzyme-stability-prediction/main/data/train.csv"
train_df = pd.read_csv(train_csv)

# Update file used by the following cells to delete / patch training rows.
train_updates_csv = "https://raw.githubusercontent.com/gopinathak-geek/novozymes-enzyme-stability-prediction/main/data/train_updates_20220929.csv"
train_updates_df = pd.read_csv(train_updates_csv)

# Test data.
test_csv = "https://raw.githubusercontent.com/gopinathak-geek/novozymes-enzyme-stability-prediction/main/data/test.csv"
test_df = pd.read_csv(test_csv)

In [3]:
# Split the update file by whether a pH value is present: ids of rows
# without one go to the delete list, ids of rows with one to the replace list.
update_has_ph = train_updates_df["pH"].notnull()
seq_ids_to_delete = train_updates_df.loc[~update_has_ph, "seq_id"].values
seq_ids_to_replace = train_updates_df.loc[update_has_ph, "seq_id"].values

In [4]:
# Remove every training row whose seq_id is in the delete list.
rows_to_delete = train_df.loc[train_df.seq_id.isin(seq_ids_to_delete)].index
train_df.drop(index=rows_to_delete, inplace=True)

In [5]:
# Patch pH / tm for the corrected sequences, pairing rows by seq_id rather
# than by position so the result does not depend on both frames sharing the
# same row order.
replacement_values = (
    train_updates_df[train_updates_df.seq_id.isin(seq_ids_to_replace)]
    .set_index("seq_id")[["pH", "tm"]]
)
rows_to_patch = train_df.seq_id.isin(seq_ids_to_replace)
# Look the replacements up in the order the matching ids appear in train_df.
train_df.loc[rows_to_patch, ["pH", "tm"]] = replacement_values.loc[
    train_df.loc[rows_to_patch, "seq_id"]
].to_numpy()

In [6]:
# Collect the seq_ids of any remaining rows that still have no pH value.
null_seq_ids = train_df.loc[train_df["pH"].isnull(), "seq_id"].values

In [7]:
# Drop the rows whose seq_id is listed in null_seq_ids.
rows_without_ph = train_df.loc[train_df.seq_id.isin(null_seq_ids)].index
train_df.drop(index=rows_without_ph, inplace=True)

In [8]:
# Sanity-check the cleaned frame: per-column dtypes and non-null counts.
train_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28695 entries, 0 to 31389
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_id            28695 non-null  int64  
 1   protein_sequence  28695 non-null  object 
 2   pH                28695 non-null  float64
 3   data_source       27727 non-null  object 
 4   tm                28695 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.3+ MB


In [9]:
# Summary statistics of the numeric columns, one row per column.
train_df.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seq_id,28695.0,15631.432166,9235.099078,0.0,7447.5,15443.0,23758.5,31389.0
pH,28695.0,6.872467,0.793184,1.99,7.0,7.0,7.0,11.0
tm,28695.0,51.385604,12.076609,25.1,43.7,48.8,54.6,130.0


In [10]:
import sys

# Record the interpreter version this notebook was run with.
print(sys.version)


3.7.15 (default, Oct 12 2022, 19:14:55) 
[GCC 7.5.0]


In [11]:
import torch
import random
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy import stats
import torchvision.transforms as T


In [12]:
def getMolecularWeight(aminoacid):
  """Return the integer weight assigned to a one-letter amino-acid code.

  Codes are matched exactly (case-sensitively) against the upper-case
  letters in the table below; any other value yields 0.
  """
  weightByCode = {
      "A": 89,  "R": 174, "N": 132, "D": 133, "B": 133,
      "C": 121, "Q": 146, "E": 147, "Z": 147, "G": 75,
      "H": 155, "I": 131, "L": 131, "K": 146, "M": 149,
      "F": 165, "P": 115, "S": 105, "T": 119, "W": 204,
      "Y": 181, "V": 117,
  }
  return weightByCode.get(aminoacid, 0)

In [13]:
def proteinSequenceToAmioAcidMolecularWeightWithPh(data):
  """Encode each protein as a two-row array of scaled residue weights and pH.

  Args:
    data: DataFrame with 'protein_sequence', 'pH' and 'tm' columns.

  Returns:
    [img, label]: `img` is a list holding one array of shape
    (1, 2, sequence_length) per input row -- row 0 is each residue's
    getMolecularWeight value divided by 110, row 1 repeats that row's pH
    once per residue. `label` is the list of the matching 'tm' values.
  """
  img = []
  label = []
  # Walk the three columns in lockstep instead of building a Series per row
  # with iterrows().
  for sequence, ph, tm in zip(data['protein_sequence'], data['pH'], data['tm']):
    scaledWeights = [getMolecularWeight(aminoacid) / 110 for aminoacid in sequence]
    phPerResidue = [ph] * len(scaledWeights)
    # Prepend a size-1 axis so each sample is shaped (1, 2, sequence_length).
    img.append(np.array([scaledWeights, phPerResidue])[None, :])
    label.append(tm)
  return [img, label]

In [14]:
# Encode the cleaned training frame: `img` holds the per-protein arrays,
# `label` the matching tm values.
img, label = proteinSequenceToAmioAcidMolecularWeightWithPh(train_df)

In [15]:
import torchvision.transforms as T
from PIL import Image

# Resize every encoded protein to a fixed 2 x 10 grid so that
# variable-length sequences end up with one common shape.
preprocess = T.Compose([T.Resize((2, 10))])

def change_shape(data):
    """Run every array in `data` through `preprocess` and return NumPy arrays.

    Each item is converted to a float tensor, passed through the module-level
    `preprocess` pipeline, and converted back to a NumPy array. The results
    are returned as a list in input order.
    """
    return [
        preprocess(torch.FloatTensor(item)).cpu().detach().numpy()
        for item in data
    ]

In [16]:
# Rebinds `img` to the resized arrays; the variable-length originals are
# no longer reachable under this name after this cell runs.
img = change_shape(img)

In [17]:
class TrainData(Dataset):
    """Dataset pairing each input in `X_data` with the target at the same index in `y_data`."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and indexed in parallel.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        """Number of samples, taken from the length of the inputs."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the `(input, target)` pair stored at `index`."""
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target

In [18]:
# Stack the per-protein arrays into a single ndarray first: building a tensor
# directly from a list of ndarrays is very slow and makes PyTorch emit a warning.
train_data = TrainData(torch.FloatTensor(np.array(img)), torch.FloatTensor(label))

  """Entry point for launching an IPython kernel.


In [19]:
# Peek at one sample. The target is bound to `sample_label` so that the
# `label` list built earlier is not overwritten by a single tensor, which
# would break re-running the cell that constructs `train_data`.
image, sample_label = train_data[2]
print(image.shape, sample_label)

torch.Size([1, 2, 10]) tensor(40.5000)


In [20]:
# 80/20 train/validation split. A seeded generator makes the partition
# reproducible across kernel restarts.
SPLIT_SEED = 42
train_size = int(0.8 * len(train_data))
val_size = len(train_data) - train_size
training_data, validation_data = torch.utils.data.random_split(
    train_data,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [21]:
BATCH_SIZE = 128

# Only the training loader shuffles; the validation loader keeps a fixed order.
training_loader = DataLoader(
    dataset=training_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
validation_loader = DataLoader(
    dataset=validation_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [22]:
class NovoNet(nn.Module):
    """Small CNN regressor: one 2x2 convolution followed by three linear layers.

    With input of shape (batch, 1, 2, 10) the convolution yields
    (batch, 10, 1, 9), which is flattened to the 90 features `fc1` consumes;
    the output then has shape (batch, 1).

    Args:
        name: Label used in the parameter-count message printed on
            construction. Falls back to the class name when omitted or empty.
    """

    def __init__(self, name=None):
        super(NovoNet, self).__init__()
        # `self.name` is read unconditionally below, so always define it;
        # leaving it unset when `name` is None would raise AttributeError.
        self.name = name if name else type(self).__name__
        self.conv1 = nn.Conv2d(1, 10, 2)
        self.fc1 = nn.Linear(10 * 9 * 1, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)

        # compute the total number of parameters
        total_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        print(self.name + ': total params:', total_params)

    def forward(self, x):
        """Return one prediction per row of the flattened convolution output."""
        x = F.relu(self.conv1(x))
        # Flatten the conv feature maps into rows of 90 features for fc1.
        x = x.view(-1, 9 * 1 * 10)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [23]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda:0


In [24]:
# Build the model (the constructor prints its trainable-parameter count) and
# move it to the selected device; the bare last expression displays the module.
net = NovoNet(name='Novonet')
net.to(device)

Novonet: total params: 21219


NovoNet(
  (conv1): Conv2d(1, 10, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=90, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=1, bias=True)
)

In [25]:
LEARNING_RATE = 0.001

# Mean-squared-error loss, minimised with Adam over all of the network's parameters.
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [28]:
EPOCHS = 50
for epoch in range(EPOCHS):
    # ---------------- training pass ----------------
    # Set the mode once per pass instead of once per batch.
    net.train()
    training_score = []
    # Loop variables are named batch_* so the `img` / `label` lists built
    # earlier in the notebook are not overwritten by the last batch.
    for batch_img, batch_label in training_loader:
        batch_img = batch_img.to(device)
        # Add a trailing axis so targets line up with the (batch, 1) predictions.
        batch_label = batch_label.to(device).unsqueeze(1)

        # Forward pass
        preds = net(batch_img)
        loss = criterion(preds, batch_label)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Spearman rank correlation of this batch only.
        train_result = stats.spearmanr(preds.detach().cpu().numpy(), batch_label.cpu().numpy())
        training_score.append(train_result.correlation)

    # ---------------- validation pass ----------------
    net.eval()
    validation_score = []
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for batch_img, batch_label in validation_loader:
            batch_img = batch_img.to(device)
            batch_label = batch_label.to(device).unsqueeze(1)
            val_preds = net(batch_img)
            val_result = stats.spearmanr(val_preds.cpu().numpy(), batch_label.cpu().numpy())
            validation_score.append(val_result.correlation)

    # NOTE: the reported scores are the mean of per-batch correlations, not a
    # single correlation computed over the whole split.
    print(f'{epoch+1:03} EPOCH scores -  Training score : {np.mean(training_score):.5f} | Validation score : {np.mean(validation_score):.5f}')

001 EPOCH scores -  Training score : 0.16902 | Validation score : 0.15480
002 EPOCH scores -  Training score : 0.16847 | Validation score : 0.16094
003 EPOCH scores -  Training score : 0.17212 | Validation score : 0.14883
004 EPOCH scores -  Training score : 0.16994 | Validation score : 0.15505
005 EPOCH scores -  Training score : 0.17205 | Validation score : 0.15965
006 EPOCH scores -  Training score : 0.17112 | Validation score : 0.15678
007 EPOCH scores -  Training score : 0.17321 | Validation score : 0.15777
008 EPOCH scores -  Training score : 0.17161 | Validation score : 0.15678
009 EPOCH scores -  Training score : 0.17295 | Validation score : 0.15970
010 EPOCH scores -  Training score : 0.17352 | Validation score : 0.15615
011 EPOCH scores -  Training score : 0.17103 | Validation score : 0.16033
012 EPOCH scores -  Training score : 0.17254 | Validation score : 0.14904
013 EPOCH scores -  Training score : 0.17424 | Validation score : 0.15744
014 EPOCH scores -  Training score : 0