<a href="https://colab.research.google.com/github/gopinathak-geek/novozymes-enzyme-stability-prediction/blob/main/NovoModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
print (sys.version)


3.7.15 (default, Oct 12 2022, 19:14:55) 
[GCC 7.5.0]


In [2]:
import torch
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy import stats
import pandas as pd
import torchvision.transforms as T


In [3]:
def getMolecularWeight(aminoacid):
  if aminoacid == "A": return 89
  elif aminoacid == "R": return 174
  elif aminoacid == "N": return 132
  elif aminoacid == "D": return 133
  elif aminoacid == "B": return 133
  elif aminoacid == "C": return 121
  elif aminoacid == "Q": return 146
  elif aminoacid == "E": return 147
  elif aminoacid == "Z": return 147
  elif aminoacid == "G": return 75
  elif aminoacid == "H": return 155
  elif aminoacid == "I": return 131
  elif aminoacid == "L": return 131
  elif aminoacid == "K": return 146
  elif aminoacid == "M": return 149
  elif aminoacid == "F": return 165
  elif aminoacid == "P": return 115
  elif aminoacid == "S": return 105
  elif aminoacid == "T": return 119
  elif aminoacid == "W": return 204
  elif aminoacid == "Y": return 181
  elif aminoacid == "V": return 117
  else: return 0

In [4]:
train_csv = "https://raw.githubusercontent.com/gopinathak-geek/novozymes-enzyme-stability-prediction/main/data/train.csv"
updated_train_csv = "https://raw.githubusercontent.com/gopinathak-geek/novozymes-enzyme-stability-prediction/main/data/train_updates_20220929.csv"
test_csv = "https://raw.githubusercontent.com/gopinathak-geek/novozymes-enzyme-stability-prediction/main/data/test.csv"

training_data = pd.read_csv(train_csv)
updated_training_data = pd.read_csv(updated_train_csv)
testing_data = pd.read_csv(test_csv)

In [5]:
training_data.head()

Unnamed: 0,seq_id,protein_sequence,pH,data_source,tm
0,0,AAAAKAAALALLGEAPEVVDIWLPAGWRQPFRVFRLERKGDGVLVG...,7.0,doi.org/10.1038/s41592-020-0801-4,75.7
1,1,AAADGEPLHNEEERAGAGQVGRSLPQESEEQRTGSRPRRRRDLGSR...,7.0,doi.org/10.1038/s41592-020-0801-4,50.5
2,2,AAAFSTPRATSYRILSSAGSGSTRADAPQVRRLHTTRDLLAKDYYA...,7.0,doi.org/10.1038/s41592-020-0801-4,40.5
3,3,AAASGLRTAIPAQPLRHLLQPAPRPCLRPFGLLSVRAGSARRSGLL...,7.0,doi.org/10.1038/s41592-020-0801-4,47.2
4,4,AAATKSGPRRQSQGASVRTFTPFYFLVEPVDTLSVRGSSVILNCSA...,7.0,doi.org/10.1038/s41592-020-0801-4,49.5


In [6]:
training_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31390 entries, 0 to 31389
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seq_id            31390 non-null  int64  
 1   protein_sequence  31390 non-null  object 
 2   pH                31104 non-null  float64
 3   data_source       28043 non-null  object 
 4   tm                31390 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [7]:
training_data.isnull().any(axis=1).sum()

3621

In [8]:
training_data.isnull().any(axis=1).sum()/len(training_data) * 100

11.535520866517999

In [9]:
training_data.nunique()

seq_id              31390
protein_sequence    28981
pH                    115
data_source           324
tm                    770
dtype: int64

In [10]:
dropped_training_data = training_data.dropna()
dropped_training_data.isnull().any(axis=1).sum()

0

In [11]:
def proteinSequenceToAmioAcidMolecularWeightWithPh(data):
  amionAcidSequence = []
  img = []
  label = []
  for index, row in data.iterrows():
    ph = row['pH']
    tm = row['tm']
    molecularWeightOfAminoAcidInSequence = []
    pHofProteinSequence = []
    for aminoacid in (row['protein_sequence']):
      molecularWeight = getMolecularWeight(aminoacid)
      molecularWeightOfAminoAcidInSequence.append(molecularWeight/110)
      pHofProteinSequence.append(ph)
    #a = torch.FloatTensor(np.array([molecularWeightOfAminoAcidInSequence,pHofProteinSequence]))
    #a = a[None, :]
    #a = preprocess(np.array([molecularWeightOfAminoAcidInSequence,pHofProteinSequence]))
    a = np.array([molecularWeightOfAminoAcidInSequence,pHofProteinSequence])
    a = a[None, :]
    img.append(a)
    label.append(tm)
    #amionAcidSequence.append((a, tm))
    #break
  return [img, label]

In [12]:
img, label = proteinSequenceToAmioAcidMolecularWeightWithPh(dropped_training_data)

In [13]:
import torchvision.transforms as T
from PIL import Image

preprocess = T.Compose([
   T.Resize((2,10)),
])

def change_shape(data):
    torch_img = []
    for im in (data):
        im = torch.FloatTensor(im)
        #transform = T.ToPILImage()
        #img = transform(im)
        x = preprocess(im)
        x = x.cpu().detach().numpy()
        torch_img.append(x)
    return torch_img

In [14]:
img = change_shape(img)

In [15]:
class TrainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [16]:
train_data = TrainData(torch.FloatTensor(img), torch.FloatTensor(label))

  """Entry point for launching an IPython kernel.


In [17]:
BATCH_SIZE = 128
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

In [18]:
class NovoNet(nn.Module):
    def __init__(self, name=None):
        super(NovoNet, self).__init__()
        if name:
            self.name = name
        self.conv1 = nn.Conv2d(1, 10, 2) 
        #self.pool = nn.MaxPool2d((3, 2), stride=(2, 2))
        self.fc1 = nn.Linear(10 * 9 * 1, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)
        
        # compute the total number of parameters
        total_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        print(self.name + ': total params:', total_params)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = x.view(-1, 9 * 1 * 10)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [20]:
net = NovoNet(name='Novonet')
net.to(device)

Novonet: total params: 21219


NovoNet(
  (conv1): Conv2d(1, 10, kernel_size=(2, 2), stride=(1, 1))
  (fc1): Linear(in_features=90, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=1, bias=True)
)

In [21]:
LEARNING_RATE = 0.001
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [22]:
EPOCHS = 100
for epoch in range(EPOCHS):
    net.train()
    score = []
    for img, label in train_loader:
        img, label = img.to(device), label.to(device)
        label = label.unsqueeze(1)
    
        #==========Forward pass===============
        preds = net(img)
        loss = criterion(preds, label)
        #==========backward pass==============

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        a = stats.spearmanr(preds.detach().cpu().numpy(), label.cpu().numpy())
        score.append(a.correlation)
    print(f'EPOCH score {epoch+1}: {np.mean(score)}')

EPOCH score 1: 0.001701169922371089
EPOCH score 2: 0.005765528791183634
EPOCH score 3: 0.019710662297978788
EPOCH score 4: 0.02003169697966343
EPOCH score 5: 0.023740069271599464
EPOCH score 6: 0.026977081662219987
EPOCH score 7: 0.02950867573489345
EPOCH score 8: 0.03361784003518782
EPOCH score 9: 0.03699029638536945
EPOCH score 10: 0.04012368497623699
EPOCH score 11: 0.046802716096730076
EPOCH score 12: 0.04754613618319898
EPOCH score 13: 0.049775470900098914
EPOCH score 14: 0.05199356045443541
EPOCH score 15: 0.058459032115433594
EPOCH score 16: 0.05878388878004841
EPOCH score 17: 0.06472511060701397
EPOCH score 18: 0.06309141055034966
EPOCH score 19: 0.06437928962389779
EPOCH score 20: 0.06964925927169295
EPOCH score 21: 0.0706873453824744
EPOCH score 22: 0.06641903607819186
EPOCH score 23: 0.0719683001542036
EPOCH score 24: 0.07226649946852522
EPOCH score 25: 0.07259359488555911
EPOCH score 26: 0.07452674320541611
EPOCH score 27: 0.07566359104335148
EPOCH score 28: 0.0763036570421