# Small Neural Network that takes as input both the aptamer features and the peptide features to predict affinity.

## Generate features for both aptamers and peptides + construct training/test sets

In [1]:
import json
import random
import numpy as np
from sklearn import linear_model, metrics
from sklearn.svm import SVC
from scipy import stats
import random
import re


In [2]:
# Hardcoded values 
d = 150
samples = 1000
k_apt = 4
k_pep = 4

In [3]:
'''
Function to classify binding affinity of a sample. 
'''
def classify_affinity(affinity):
    if float(affinity) <= 9:
        return 0
    elif float(affinity) <= 50:
        return 1
    elif float(affinity) <= 400:
        return 2
    return 3

In [4]:
dataset_file = "../data/mhcflurry_dataset.json"
'''
Constructs a dataset that has 10,000 pairs for every class of binding affinity. 
'''
def construct_dataset():
    with open(dataset_file, 'r') as f:
        mhcflurry_data = json.load(f)
    
    # Full dataset. The index of the list corresponds to the binding affinity class
    full_dataset = [[], [], [], []]
    for allele in mhcflurry_data:
        peptides = mhcflurry_data[allele]
        for p, b in peptides:
            affinity_class = classify_affinity(b)
            full_dataset[affinity_class].append((allele, p))
    
    subsampled_dataset = [[], [], [], []]
    
    for i in range(len(full_dataset)):
        full_class = np.asarray(full_dataset[i])
        # Sample the hardcoded number of samples pairs randomly
        subsampled_dataset[i] = np.copy(full_class[np.random.choice(full_class.shape[0], samples, replace=False), :])
    
    subsampled_dataset = np.asarray(subsampled_dataset)    
    return subsampled_dataset

In [5]:
subsampled_dataset = construct_dataset()

In [6]:
'''
Extracts features from the subsampled dataset
'''
def extract_features(dataset):
    # Number of features
    aptamer_features = [[], [], [], []]
    peptide_features = [[], [], [], []]
    
    for i in range(dataset.shape[0]):
        flattened = dataset[i].flatten('F')
        all_aptamers = flattened[:samples]
        all_peptides = flattened[samples:]
        
        split = int(0.8*len(all_aptamers))
        all_aptamers = all_aptamers[:split]
        all_peptides = all_peptides[:split]
        
        # Generate the aptamer features randomly
        for j in range(d):
            # Find a random aptamer
            apt = random.choice(all_aptamers)

            # Find a random subsection of k elements from this sequence
            start = random.randint(0, len(apt)-k_apt)
            aptamer_features[i].append(apt[start:start+k_apt])
    
        # Generate the peptide features randomly
        for j in range(d):
            # Find a random aptamer
            pep = random.choice(all_peptides)

            # Find a random subsection of k elements from this sequence
            start = random.randint(0, len(pep)-k_pep)
            peptide_features[i].append(pep[start:start+k_pep])
    
    
    return aptamer_features, peptide_features, split
  

In [7]:
aptamer_features, peptide_features, split = extract_features(subsampled_dataset)

In [8]:
'''
Generates training and testing sets. Training is the first 8000 samples, test is the last 2000 samples. 
'''
def construct_train_test_sets(aptamer_features, peptide_features):
    train_pairs = [[], [], [], []]
    test_pairs = [[], [], [], []]
    
    for c in range(len(subsampled_dataset)):
        train_pairs[c] = subsampled_dataset[c][:split]
        test_pairs[c] = subsampled_dataset[c][split:]
    
    train_pairs = np.asarray(train_pairs)
    test_pairs = np.asarray(test_pairs)
    
    train_aptamers = [[], [], [], []]
    test_aptamers = [[], [], [], []]
    
    train_peptides = [[], [], [], []]
    test_peptides = [[], [], [], []]
    
    # Make a 0/1 matrix for the training aptamers/peptides
    for i in range(len(train_aptamers)):
        pairs = train_pairs[i]
        apt_features = aptamer_features[i]
        pep_features = peptide_features[i]
        
        for j in range(len(pairs)):
            a, p = pairs[j]
            matrix_aptamer = []
            matrix_peptide = []
            
            for k in range(len(apt_features)):
                feat = apt_features[k]
                if feat in a:
                    matrix_aptamer.append(1)
                else:
                    matrix_aptamer.append(0)
            train_aptamers[i].append(matrix_aptamer)
            
            for k in range(len(pep_features)):
                feat = pep_features[k]
                if feat in p:
                    matrix_peptide.append(1)
                else:
                    matrix_peptide.append(0)
            train_peptides[i].append(matrix_peptide)
                
    train_aptamers = np.asarray(train_aptamers)
    train_peptides = np.asarray(train_peptides)
    print("Train Aptamers Shape: ", train_aptamers.shape)
    print("Train Peptides Shape: ", train_peptides.shape)
    
    # Make a 0/1 matrix for the testing aptamers/peptides
    for i in range(len(test_aptamers)):
        pairs = test_pairs[i]
        apt_features = aptamer_features[i]
        pep_features = peptide_features[i]
        
        for j in range(len(pairs)):
            a, p = pairs[j]
            matrix_aptamer = []
            matrix_peptide = []
            
            for k in range(len(apt_features)):
                feat = apt_features[k]
                if feat in a:
                    matrix_aptamer.append(1)
                else:
                    matrix_aptamer.append(0)
            test_aptamers[i].append(matrix_aptamer)
            
            for k in range(len(pep_features)):
                feat = pep_features[k]
                if feat in p:
                    matrix_peptide.append(1)
                else:
                    matrix_peptide.append(0)
            test_peptides[i].append(matrix_peptide)
                
    test_aptamers = np.asarray(test_aptamers)
    test_peptides = np.asarray(test_peptides)
    print("Test Aptamers Shape: ", test_aptamers.shape)
    print("Test Peptides Shape: ", test_peptides.shape)
    
    return train_aptamers, train_peptides, test_aptamers, test_peptides
    
    
    

In [9]:
train_aptamers, train_peptides, test_aptamers, test_peptides = construct_train_test_sets(aptamer_features, peptide_features)

Train Aptamers Shape:  (4, 800, 150)
Train Peptides Shape:  (4, 800, 150)
Test Aptamers Shape:  (4, 200, 150)
Test Peptides Shape:  (4, 200, 150)


## Construct a Pytorch DataLoader

In [10]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

In [11]:
class AptamerPeptideDataset(Dataset):
    '''
    @param: peptides = n*m
    @param: aptamers = n*m
    @param: affinities = n*1
    '''
    def __init__(self, peptides, aptamers, affinities):
        self.peptides = peptides
        self.aptamers = aptamers
        affinities = np.reshape(affinities, (affinities.shape[0], 1))
        self.affinities = affinities
    
    def __len__(self):
        return self.peptides.shape[0]
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        pep = self.peptides[idx]
        apt = self.aptamers[idx]        
        aff_class = self.affinities[idx]
        
        sample = {'peptide': pep, 'aptamer': apt, 'affinity': aff_class}
        
        return sample
        
        

In [12]:
# Reshape the dataset to fit the dataset class
all_aptamers = []
for i in range(len(train_aptamers)):
    all_aptamers.extend(train_aptamers[i])
    all_aptamers.extend(test_aptamers[i])

# n * m 
all_aptamers = np.array(all_aptamers)
print("Aptamers Shape: ", all_aptamers.shape)

all_peptides = []
for i in range(len(train_peptides)):
    all_peptides.extend(train_peptides[i])
    all_peptides.extend(test_peptides[i])

# n * m 
all_peptides = np.array(all_peptides)
print("Peptides Shape: ", all_peptides.shape)

# n * 1
affinity_classes = np.repeat(np.array([[0, 1], [2, 3]]), 1000)

print("Affinities Shape: ", affinity_classes.shape)

Aptamers Shape:  (4000, 150)
Peptides Shape:  (4000, 150)
Affinities Shape:  (4000,)


In [13]:
dataset = AptamerPeptideDataset(all_peptides, all_aptamers, affinity_classes)

In [14]:
# Test the dataset class
for i in range(len(dataset)):
    sample = dataset[i]
    #print(i, sample['peptide'].shape, sample['aptamer'].shape, sample['affinity'])

In [15]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, num_workers=1)

## Construct a small neural network

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

In [35]:
# Define the network
class SmallNN(nn.Module):
    def __init__(self):
        super(SmallNN, self).__init__()
        self.fc1 = nn.Linear(150, 1024)
        self.prelu1 = nn.PReLU(num_parameters=1)
        self.fc2 = nn.Linear(1024, 512)
        self.prelu2 = nn.PReLU(num_parameters=1)
        self.lin1 = nn.Linear(512, 250)
        self.prelu3 = nn.PReLU(num_parameters=1)
        self.lin2 = nn.Linear(250, 100)
        self.fc3 = nn.Linear(250, 4)
        self.sequential = nn.Sequential(self.fc1, self.prelu1, self.fc2, self.prelu2, self.lin1, self.prelu3, self.fc3)
        self.fc4 = nn.Linear(8, 4)
       
    def forward(self, apt, pep):
        apt = apt.view(-1, 150)
        pep = pep.view(-1, 150)
        print("Apt type:", apt.float())
        apt = self.sequential(apt)
        pep = self.sequential(pep)
        x = torch.cat((apt, pep), 1)
        print("X shape: ", x.shape)
        x = F.log_softmax(x, dim=1)
        return x  

In [36]:
model = SmallNN()
optimizer = Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [37]:
# Training loop
for epoch in range(10):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(dataloader):
        pep = data['peptide']
        apt = data['aptamer']
        label = data['affinity']
        optimizer.zero_grad()
        outputs = model(pep, apt)
        loss = loss_fn(outputs, label)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print('Finished Training')

Apt type: tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0.]])


RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #2 'mat1' in call to _th_addmm