# Small Neural Network that takes as input both the aptamer features and the peptide features to predict affinity.

## Generate features for both aptamers and peptides + construct training/test sets

In [1]:
import json
import random
import numpy as np
from sklearn import linear_model, metrics
from sklearn.svm import SVC
from scipy import stats
import random
import re


In [6]:
# Hardcoded experiment settings shared by the cells below.
# d       -> how many random k-mer features are drawn per affinity class
# samples -> how many (allele, peptide) pairs are subsampled per affinity class
d, samples = 150, 1000
# Length of the substrings used as aptamer / peptide features.
k_apt = k_pep = 4

In [3]:
def classify_affinity(affinity):
    '''
    Map a binding-affinity value onto one of four ordinal classes.

    The value is converted with float() and compared against inclusive upper
    bounds: <= 9 gives 0, <= 50 gives 1, <= 400 gives 2, anything else gives 3.
    '''
    value = float(affinity)
    # The position of the first bound that the value does not exceed is the class.
    for label, upper_bound in enumerate((9, 50, 400)):
        if value <= upper_bound:
            return label
    return 3

In [4]:
dataset_file = "../data/mhcflurry_dataset.json"


def construct_dataset():
    '''
    Constructs a class-balanced subsample of (allele, peptide) pairs.

    Reads `dataset_file`, which must map each allele to a list of
    (peptide, affinity) pairs. Every pair is bucketed by `classify_affinity`
    and `samples` pairs are then drawn without replacement from each of the
    four affinity classes.

    Returns:
        Array indexed as [affinity class, pair, 0=allele / 1=peptide], i.e.
        of shape (4, samples, 2).

    Raises:
        ValueError: if any affinity class holds fewer than `samples` pairs.
    '''
    with open(dataset_file, 'r', encoding='utf-8') as f:
        mhcflurry_data = json.load(f)

    # Full dataset. The index of the list corresponds to the binding affinity class
    full_dataset = [[], [], [], []]
    for allele, peptides in mhcflurry_data.items():
        for p, b in peptides:
            full_dataset[classify_affinity(b)].append((allele, p))

    subsampled_dataset = []
    for affinity_class, pairs in enumerate(full_dataset):
        # Fail with an actionable message instead of NumPy's generic one.
        if len(pairs) < samples:
            raise ValueError(
                "Affinity class {} has only {} pairs; cannot sample {} without replacement".format(
                    affinity_class, len(pairs), samples))

        full_class = np.asarray(pairs)
        chosen = np.random.choice(full_class.shape[0], samples, replace=False)
        # Fancy indexing already returns a fresh array, so no extra copy is needed.
        subsampled_dataset.append(full_class[chosen, :])

    return np.asarray(subsampled_dataset)

In [5]:
# Draw the per-class subsample once; construct_train_test_sets later reads
# this module-level array directly.
subsampled_dataset = construct_dataset()

In [12]:
def _sample_kmers(sequences, count, k):
    '''
    Draws `count` random length-`k` substrings, each from a randomly chosen
    entry of `sequences`.
    '''
    kmers = []
    for _ in range(count):
        seq = random.choice(sequences)
        # randint is inclusive, so the last valid start is len(seq) - k.
        start = random.randint(0, len(seq) - k)
        kmers.append(seq[start:start + k])
    return kmers


def extract_features(dataset):
    '''
    Extracts random k-mer features from the subsampled dataset.

    For every affinity class, `d` substrings of length `k_apt` are drawn from
    the aptamer column (column 0) and `d` substrings of length `k_pep` from
    the peptide column (column 1). Only the first 80% of each class's pairs
    (the training portion) are used as a source of features.

    Args:
        dataset: array indexed as [class, pair, 0=aptamer / 1=peptide].

    Returns:
        (aptamer_features, peptide_features, split): two lists holding one
        list of `d` k-mers per class, and the number of training pairs per
        class (taken from the last class; 0 if the dataset has no classes).

    Raises:
        ValueError: if a chosen sequence is shorter than the k-mer length.
    '''
    aptamer_features = []
    peptide_features = []
    split = 0

    for class_pairs in dataset:
        # Slice the columns directly instead of relying on the global
        # `samples` matching the number of rows in this class.
        split = int(0.8 * class_pairs.shape[0])
        train_aptamers = class_pairs[:split, 0]
        train_peptides = class_pairs[:split, 1]

        # Aptamer features are drawn first, then peptide features, so the
        # order of random draws is the same as before.
        aptamer_features.append(_sample_kmers(train_aptamers, d, k_apt))
        peptide_features.append(_sample_kmers(train_peptides, d, k_pep))

    return aptamer_features, peptide_features, split
  

In [13]:
# `split` is kept at module level because construct_train_test_sets reads it
# as a global rather than taking it as an argument.
aptamer_features, peptide_features, split = extract_features(subsampled_dataset)

In [14]:
def _encode_presence(sequence, features):
    '''Returns a 0/1 list marking which of `features` occur as substrings of `sequence`.'''
    return [1 if feat in sequence else 0 for feat in features]


def _encode_pairs(pairs_by_class, aptamer_features, peptide_features):
    '''
    Encodes every (aptamer, peptide) pair against the feature lists of the
    class it belongs to and returns (aptamer_matrix, peptide_matrix) arrays.
    '''
    aptamer_rows = []
    peptide_rows = []
    for pairs, apt_feats, pep_feats in zip(pairs_by_class, aptamer_features, peptide_features):
        aptamer_rows.append([_encode_presence(a, apt_feats) for a, _ in pairs])
        peptide_rows.append([_encode_presence(p, pep_feats) for _, p in pairs])
    return np.asarray(aptamer_rows), np.asarray(peptide_rows)


def construct_train_test_sets(aptamer_features, peptide_features):
    '''
    Generates the binary training and testing matrices.

    For each affinity class the first `split` pairs of `subsampled_dataset`
    form the training set and the remaining pairs form the test set (both
    names are read from module-level globals). Each pair becomes two 0/1
    vectors: one entry per aptamer feature and one per peptide feature,
    set to 1 when that k-mer occurs in the sequence.

    NOTE(review): every pair is encoded with the feature list of its own
    affinity class, for the test pairs as well, so the encoding depends on
    the label. Confirm this is intended before evaluating a model on it.

    Args:
        aptamer_features: one list of aptamer k-mers per affinity class.
        peptide_features: one list of peptide k-mers per affinity class.

    Returns:
        (train_aptamers, train_peptides, test_aptamers, test_peptides), each
        indexed as [class, pair, feature]. The four shapes are also printed.
    '''
    train_pairs = [class_pairs[:split] for class_pairs in subsampled_dataset]
    test_pairs = [class_pairs[split:] for class_pairs in subsampled_dataset]

    # Make a 0/1 matrix for the training aptamers/peptides
    train_aptamers, train_peptides = _encode_pairs(train_pairs, aptamer_features, peptide_features)
    print("Train Aptamers Shape: ", train_aptamers.shape)
    print("Train Peptides Shape: ", train_peptides.shape)

    # Make a 0/1 matrix for the testing aptamers/peptides
    test_aptamers, test_peptides = _encode_pairs(test_pairs, aptamer_features, peptide_features)
    print("Test Aptamers Shape: ", test_aptamers.shape)
    print("Test Peptides Shape: ", test_peptides.shape)

    return train_aptamers, train_peptides, test_aptamers, test_peptides
    
    
    

In [15]:
# Build the 0/1 feature matrices; the call prints the shape of each result.
train_aptamers, train_peptides, test_aptamers, test_peptides = construct_train_test_sets(aptamer_features, peptide_features)

Train Aptamers Shape:  (4, 800, 150)
Train Peptides Shape:  (4, 800, 150)
Test Aptamers Shape:  (4, 200, 150)
Test Peptides Shape:  (4, 200, 150)


## Construct a small neural network

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

In [None]:
# NOTE: the author flagged this reshaping as possibly unnecessary because the
# model is convolutional; it is kept so the flattened arrays remain available.
# Collapse the class axis so that each row is one training pair:
# (classes, pairs, features) -> (classes * pairs, features).
train_aptamers_x = np.concatenate(list(train_aptamers), axis=0)
train_peptides_x = np.concatenate(list(train_peptides), axis=0)

# One label per row, in the same class-major order as the rows above. The
# repeat counts come from the data itself rather than a hard-coded 8000,
# which did not match the number of training pairs per class.
train_y = np.repeat(
    np.arange(len(train_aptamers)),
    [len(class_rows) for class_rows in train_aptamers],
)

# Represent the two matrices as a flattened layer


In [17]:
# Define the network
class SmallNN(nn.Module):
    '''
    Two-branch convolutional feature extractor for aptamer and peptide inputs.

    Each branch applies its own 3x3 convolution (4 input channels, 2 output
    channels) followed by a shared 2x2 max-pool with stride 2. No fully
    connected head is defined yet, so forward() returns the two pooled
    feature maps rather than a prediction.
    '''

    def __init__(self):
        super().__init__()
        conv_args = dict(in_channels=4, out_channels=2, kernel_size=3)
        self.conv_apt = nn.Conv2d(**conv_args)
        self.conv_pep = nn.Conv2d(**conv_args)
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        # TODO: the fully connected layers (fc1, fc2) are still to be added.

    def forward(self, apt, pep):
        '''Passes each input through its own convolution and the shared max-pool.'''
        apt_maps = self.maxpool(self.conv_apt(apt))
        pep_maps = self.maxpool(self.conv_pep(pep))
        return apt_maps, pep_maps

In [21]:
model = SmallNN()
# Adam over every parameter registered on the model.
optimizer = Adam(model.parameters(), lr=0.001)
# Backward-compatible alias: the name was originally misspelled, and other
# cells may still refer to it.
optimizier = optimizer
loss_fn = nn.CrossEntropyLoss()

In [None]:
for epoch in range(10):
    model.train()
    