In [1]:
import os, sys
import numpy as np
import json
import random

## Prepare the data --> one hot encoding matrices

In [2]:
aptamer_dataset_file = "../data/aptamer_dataset.json"
'''
Constructs a dataset that has 10,000 pairs for every class of binding affinity. 
'''
def construct_dataset():
    with open(aptamer_dataset_file, 'r') as f:
        aptamer_data = json.load(f)
    
    # Full dataset. The index of the list corresponds to the binding affinity class
    full_dataset = []
    for aptamer in aptamer_data:
        peptides = aptamer_data[aptamer]
        for p, b in peptides:
            if len(aptamer) == 40 and len(p) == 8:
                full_dataset.append((aptamer, p))
    
    return full_dataset

In [3]:
full_dataset = construct_dataset()
random.shuffle(full_dataset)
training_set = full_dataset[:int(0.8*len(full_dataset))]
test_set = full_dataset[int(0.8*len(full_dataset)):]

In [4]:
## Takes a peptide and aptamer sequence and converts to one-hot matrix
def one_hot(sequence, seq_type='peptide'):
    if seq_type == 'peptide':
        aa_list = ['R', 'L', 'S', 'A', 'G', 'P', 'T', 'V', 'N', 'D', 'C', 'Q', 'E', 'H', 'I', 'K', 'M', 'F', 'W', 'Y']
        one_hot_peptide = np.zeros((len(sequence), len(aa_list)))
        for i in range(len(sequence)):
            aa = sequence[i]
            idx = aa_list.index(aa)
            one_hot_peptide[i][idx] = 1
        return one_hot_peptide
    else:
        na_list = ['A', 'C', 'G', 'T']
        one_hot_aptamer = np.zeros((len(sequence), len(na_list)))
        for i in range(len(sequence)):
            na = sequence[i]
            idx = na_list.index(na)
            one_hot_aptamer[i][idx] = 1
        return one_hot_aptamer
        
        

## Model --> CNN

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD

In [6]:
# Define the model
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.cnn_apt_1 = nn.Conv2d(1, 40, 1)
        self.cnn_apt_2 = nn.Conv2d(40, 10, 1)
        self.cnn_apt_3 = nn.Conv2d(10, 1, 1)
        self.fc_apt_1 = nn.Linear(160, 1)
        
        self.cnn_pep_1 = nn.Conv2d(1, 8, 1)
        self.cnn_pep_2 = nn.Conv2d(8, 1, 1)
        self.fc_pep_1 = nn.Linear(64, 1)
        
        self.pool = nn.MaxPool2d(2, 1)
        
        self.relu = nn.ReLU()
        
        self.sequential_pep = nn.Sequential(self.cnn_pep_1, self.relu, self.pool, self.cnn_pep_2)
        self.sequential_apt = nn.Sequential(self.cnn_apt_1, self.relu, self.pool, self.cnn_apt_2, self.relu, self.pool, self.cnn_apt_3)
        
        self.fc1 = nn.Linear(209, 1)
        
    def forward(self, apt, pep):
        apt = self.sequential_apt(apt)
        pep = self.sequential_pep(pep)
        
        apt = apt.view(-1, 1).T
        pep = pep.view(-1, 1).T
        
        x = torch.cat((apt, pep), 1)
        x = self.fc1(x)
        x = F.sigmoid(x)
        return x
    def loss(self, prediction, label):
        l = nn.MSELoss()
        label = torch.FloatTensor(label)
        label = label.reshape((1, 1))
        return l(torch.FloatTensor(prediction), label)

In [7]:
model = ConvNet()
def weights_init(m):
    if isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight.data)
        nn.init.ones_(m.bias.data)
model.apply(weights_init)
optimizer = Adam(model.parameters(), lr=0.00001, weight_decay=0.2)

In [8]:
# Training Loop
import tqdm
for epoch in range(1):
    print("Epoch: ", epoch)
    model.train()
    running_loss = 0.0
    # Come up with a trainloader
    for i, data in enumerate(tqdm.tqdm(training_set[:10000])):
        # Peptide and aptamer, one-hot encode them
        pep = training_set[i][1]
        apt = training_set[i][0]
        
        pep = one_hot(pep, seq_type='peptide')
        apt = one_hot(apt, seq_type='aptamer')
        
        pep = torch.FloatTensor(np.reshape(pep, (1, 1, pep.shape[0], pep.shape[1])))
        apt = torch.FloatTensor(np.reshape(apt, (1, 1, apt.shape[0], apt.shape[1])))
        
        output = model(apt, pep)
        loss = model.loss(output, 1)
        optimizer.zero_grad()
        running_loss += loss.item()
        if i % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        optimizer.step()
        
    
print('Finished Training')

  0%|          | 20/10000 [00:00<00:51, 192.87it/s]

Epoch:  0


  2%|▏         | 231/10000 [00:01<00:43, 224.36it/s]

[1,   200] loss: inf


  4%|▍         | 429/10000 [00:01<00:39, 242.00it/s]

[1,   400] loss: inf


  6%|▋         | 628/10000 [00:02<00:38, 245.31it/s]

[1,   600] loss: inf


  8%|▊         | 840/10000 [00:03<00:35, 256.69it/s]

[1,   800] loss: inf


 10%|█         | 1023/10000 [00:04<00:35, 253.74it/s]

[1,  1000] loss: inf


 12%|█▏        | 1247/10000 [00:05<00:36, 238.94it/s]

[1,  1200] loss: inf


 14%|█▍        | 1438/10000 [00:06<00:39, 218.29it/s]

[1,  1400] loss: inf


 16%|█▋        | 1629/10000 [00:06<00:35, 233.34it/s]

[1,  1600] loss: inf


 18%|█▊        | 1843/10000 [00:07<00:35, 230.32it/s]

[1,  1800] loss: inf


 19%|█▊        | 1867/10000 [00:08<00:34, 233.13it/s]


KeyboardInterrupt: 

## Evaluation --> compare to random

In [None]:
correct = 0
incorrect = 0
for i, data in enumerate(tqdm.tqdm(test_set[:10000])):
    pep = test_set[i][1]
    apt = test_set[i][0]
    
    pep = one_hot(pep, seq_type='peptide')
    apt = one_hot(apt, seq_type='aptamer')

    pep = torch.FloatTensor(np.reshape(pep, (1, 1, pep.shape[0], pep.shape[1])))
    apt = torch.FloatTensor(np.reshape(apt, (1, 1, apt.shape[0], apt.shape[1])))

    output = model(apt, pep)
    print("Output: ", output)
    if output > 0.5:
        correct += 1
    else:
        incorrect += 1

print('Accuracy of the network on the test samples: %d %%' % (100* correct/(correct + incorrect)))
    