In [1]:
import os, sys
import numpy as np
import json
import random

## Prepare the data --> one hot encoding matrices

In [2]:
# Default location of the JSON file holding the aptamer -> peptide data.
aptamer_dataset_file = "../data/aptamer_dataset.json"


def construct_dataset(dataset_file=None, aptamer_len=40, peptide_len=8):
    """Collect every (aptamer, peptide) pair whose sequences have the expected lengths.

    The JSON file is read as a mapping from an aptamer sequence to an iterable
    of two-item entries. The first item of each entry is the peptide sequence;
    the second item is unpacked but not used (presumably the binding affinity
    -- TODO confirm against the data file).

    Args:
        dataset_file: Path of the JSON file to read. When None, the
            module-level ``aptamer_dataset_file`` is used.
        aptamer_len: Only aptamers of exactly this length are kept.
        peptide_len: Only peptides of exactly this length are kept.

    Returns:
        A flat list of ``(aptamer, peptide)`` tuples, in the order in which
        they are read from the file.
    """
    if dataset_file is None:
        dataset_file = aptamer_dataset_file

    with open(dataset_file, 'r') as f:
        aptamer_data = json.load(f)

    full_dataset = []
    for aptamer, peptides in aptamer_data.items():
        # The aptamer length does not depend on the peptide: check it once
        # per aptamer instead of once per pair.
        if len(aptamer) != aptamer_len:
            continue
        for peptide, _affinity in peptides:
            if len(peptide) == peptide_len:
                full_dataset.append((aptamer, peptide))

    return full_dataset

In [3]:
# Shuffle with a fixed seed so that the train/test split is identical on every
# "Restart Kernel -> Run All"; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 0
TRAIN_FRACTION = 0.8

full_dataset = construct_dataset()
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(SPLIT_SEED).shuffle(full_dataset)

# Compute the cut point once so both slices are guaranteed to be complementary.
split_idx = int(TRAIN_FRACTION * len(full_dataset))
training_set = full_dataset[:split_idx]
test_set = full_dataset[split_idx:]

In [4]:
def one_hot(sequence, seq_type='peptide'):
    """Convert a peptide or aptamer sequence into a one-hot matrix.

    Args:
        sequence: String (or other indexable sequence) of one-letter symbols.
        seq_type: ``'peptide'`` selects the 20-letter amino-acid alphabet; any
            other value selects the 4-letter nucleotide alphabet (A, C, G, T).

    Returns:
        A float numpy array of shape ``(len(sequence), alphabet_size)`` in
        which row ``i`` holds a single 1 in the column of ``sequence[i]``.

    Raises:
        ValueError: If ``sequence`` contains a symbol outside the alphabet.
    """
    if seq_type == 'peptide':
        alphabet = ['R', 'L', 'S', 'A', 'G', 'P', 'T', 'V', 'N', 'D',
                    'C', 'Q', 'E', 'H', 'I', 'K', 'M', 'F', 'W', 'Y']
    else:
        alphabet = ['A', 'C', 'G', 'T']

    encoded = np.zeros((len(sequence), len(alphabet)))
    for row, symbol in enumerate(sequence):
        try:
            col = alphabet.index(symbol)
        except ValueError:
            # Re-raise with enough context to find the offending residue/base.
            raise ValueError(
                "Unknown symbol {!r} at position {} for seq_type={!r}".format(
                    symbol, row, seq_type)
            ) from None
        encoded[row, col] = 1
    return encoded
        
        

## Model --> CNN

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

In [6]:
# Define the model
class ConvNet(nn.Module):
    """Two-branch CNN that scores one-hot encoded (aptamer, peptide) pairs.

    Each input goes through its own small convolution stack; the two feature
    maps are flattened per sample, concatenated, and mapped by ``fc1`` to a
    single logit that is squashed into (0, 1) by a sigmoid.

    The layer sizes assume an aptamer input of shape (batch, 1, 40, 4) and a
    peptide input of shape (batch, 1, 8, 20):
      * the aptamer branch only uses 1x1 convolutions, so it keeps
        40 * 4 = 160 values per sample;
      * the peptide branch uses two unpadded 3x3 convolutions,
        (8, 20) -> (6, 18) -> (4, 16), i.e. 64 values per sample;
      * 160 + 64 = 224 is the input width of ``fc1``.
    """

    def __init__(self):
        super().__init__()
        # Aptamer branch: 1x1 convolutions (channel mixing only).
        self.cnn_apt_1 = nn.Conv2d(1, 40, 1)
        self.cnn_apt_2 = nn.Conv2d(40, 10, 1)
        self.cnn_apt_3 = nn.Conv2d(10, 1, 1)
        # Not used by forward(); kept so state_dict keys stay unchanged.
        self.fc_apt_1 = nn.Linear(160, 1)

        # Peptide branch: two unpadded 3x3 convolutions.
        self.cnn_pep_1 = nn.Conv2d(1, 8, 3)
        self.cnn_pep_2 = nn.Conv2d(8, 1, 3)
        # Not used by forward(); kept so state_dict keys stay unchanged.
        self.fc_pep_1 = nn.Linear(64, 1)

        self.relu = nn.ReLU()

        self.sequential_pep = nn.Sequential(self.cnn_pep_1, self.relu, self.cnn_pep_2)
        self.sequential_apt = nn.Sequential(self.cnn_apt_1, self.relu, self.cnn_apt_2, self.relu, self.cnn_apt_3)

        # 224 = 160 aptamer features + 64 peptide features.
        self.fc1 = nn.Linear(224, 1)

    def forward(self, apt, pep):
        """Return a score in (0, 1) of shape (batch, 1) for each input pair.

        Args:
            apt: Aptamer tensor, expected shape (batch, 1, 40, 4).
            pep: Peptide tensor, expected shape (batch, 1, 8, 20).
        """
        apt = self.sequential_apt(apt)
        pep = self.sequential_pep(pep)

        # Flatten per sample (not across the whole batch) so that batches
        # larger than one keep one row per sample.
        apt = apt.reshape(apt.size(0), -1)
        pep = pep.reshape(pep.size(0), -1)

        x = torch.cat((apt, pep), dim=1)
        x = self.fc1(x)
        # fc1 emits a single logit per sample. log_softmax over a dimension of
        # size 1 is always exactly 0 (constant output, zero gradient), so a
        # sigmoid is used to obtain a trainable score in (0, 1).
        return torch.sigmoid(x)

    def loss(self, prediction, label):
        """Mean squared error between ``prediction`` and ``label``.

        ``label`` may be a tensor or a plain Python number; a scalar is
        broadcast to the shape of ``prediction``.
        """
        target = torch.as_tensor(label, dtype=prediction.dtype, device=prediction.device)
        if target.dim() == 0:
            target = target.expand_as(prediction)
        return F.mse_loss(prediction, target)

In [7]:
# Step size used by the Adam optimiser below.
LEARNING_RATE = 0.01

# Fresh network plus an Adam optimiser over all of its parameters.
model = ConvNet()
optimizer = Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training loop: one optimisation step per (aptamer, peptide) pair.
# NOTE(review): every pair is trained against the target 1, so the model never
# sees a negative example -- confirm this is intended.

import tqdm

for epoch in range(1):
    print("Epoch: ", epoch)
    model.train()
    # TODO: replace the per-sample iteration with a batched train loader.
    for apt_seq, pep_seq in tqdm.tqdm(training_set):
        # One-hot encode, then add the (batch, channel) dimensions Conv2d expects.
        pep = torch.from_numpy(one_hot(pep_seq, seq_type='peptide')).float()
        apt = torch.from_numpy(one_hot(apt_seq, seq_type='aptamer')).float()
        pep = pep.unsqueeze(0).unsqueeze(0)
        apt = apt.unsqueeze(0).unsqueeze(0)

        # Gradients accumulate across backward() calls unless they are cleared,
        # so reset them before every step.
        optimizer.zero_grad()

        output = model(apt, pep)
        # MSELoss needs a tensor target with the same shape as the prediction.
        loss = model.loss(output, torch.ones_like(output))
        loss.backward()
        optimizer.step()

print('Finished Training')

  0%|          | 25/620255 [00:00<41:22, 249.88it/s]

Epoch:  0


 36%|███▌      | 221583/620255 [12:28<22:27, 295.81it/s] 

## Evaluation --> compare to random

In [None]:
# Count how many held-out pairs the model scores above the 0.5 threshold.
correct = 0
incorrect = 0

model.eval()
# Inference only: skip autograd bookkeeping for the whole pass.
with torch.no_grad():
    # The split cell names the held-out data `test_set`.
    for apt_seq, pep_seq in tqdm.tqdm(test_set):
        # One-hot encode, then add the (batch, channel) dimensions Conv2d expects.
        pep = torch.from_numpy(one_hot(pep_seq, seq_type='peptide')).float()
        apt = torch.from_numpy(one_hot(apt_seq, seq_type='aptamer')).float()
        pep = pep.unsqueeze(0).unsqueeze(0)
        apt = apt.unsqueeze(0).unsqueeze(0)

        output = model(apt, pep)
        if output.item() > 0.5:
            correct += 1
        else:
            incorrect += 1

# Guard against an empty test set instead of dividing by zero.
total = correct + incorrect
accuracy = 100 * correct / total if total else 0.0
print('Accuracy of the network on the test samples: %d %%' % accuracy)
    