# A small neural network that takes both aptamer features and peptide features as input to predict binding affinity.

## Generate features for both aptamers and peptides, and construct training/test sets

In [1]:
import json
import random
import numpy as np
from sklearn import linear_model, metrics
from sklearn.svm import SVC
from scipy import stats
import random
import re


In [2]:
def classify_affinity(affinity):
    """Classify the binding affinity of a sample into one of four classes.

    ``affinity`` is converted with ``float()`` and compared against the
    inclusive upper bounds 9, 50 and 400, in that order.

    Returns:
        0 if the value is <= 9, 1 if <= 50, 2 if <= 400, otherwise 3.
    """
    value = float(affinity)
    # The position of the first bound that the value does not exceed is the
    # class label; values above every bound fall through to class 3.
    for label, upper_bound in enumerate((9, 50, 400)):
        if value <= upper_bound:
            return label
    return 3

In [3]:
dataset_file = "../data/mhcflurry_dataset.json"


def construct_dataset(path=None, samples_per_class=10000):
    """Construct a dataset with the same number of pairs for every affinity class.

    Loads a JSON object that maps each allele to a list of
    ``(peptide, affinity)`` entries, buckets every ``(allele, peptide)`` pair
    by ``classify_affinity(affinity)``, and then draws ``samples_per_class``
    pairs from each bucket uniformly at random, without replacement, using
    NumPy's global random state.

    Args:
        path: JSON file to read. Defaults to the module-level
            ``dataset_file``, looked up at call time.
        samples_per_class: Number of pairs to draw from each class
            (10,000 by default).

    Returns:
        A NumPy array of shape ``(4, samples_per_class, 2)``. Axis 0 is the
        affinity class; the last axis holds ``(allele, peptide)``.

    Raises:
        ValueError: If a class contains fewer than ``samples_per_class``
            pairs, so sampling without replacement is impossible.
    """
    if path is None:
        path = dataset_file
    with open(path, 'r') as f:
        mhcflurry_data = json.load(f)

    # Full dataset. The index of the list corresponds to the binding
    # affinity class returned by classify_affinity (0-3).
    full_dataset = [[] for _ in range(4)]
    for allele, peptides in mhcflurry_data.items():
        for p, b in peptides:
            full_dataset[classify_affinity(b)].append((allele, p))

    subsampled_dataset = []
    for affinity_class, pairs in enumerate(full_dataset):
        # Fail early with an actionable message instead of an opaque NumPy
        # error from np.random.choice / indexing an empty array.
        if len(pairs) < samples_per_class:
            raise ValueError(
                "Affinity class %d has only %d pairs; cannot sample %d "
                "without replacement"
                % (affinity_class, len(pairs), samples_per_class)
            )
        full_class = np.asarray(pairs)
        chosen = np.random.choice(
            full_class.shape[0], samples_per_class, replace=False
        )
        # Fancy indexing already returns a fresh array, so no extra copy.
        subsampled_dataset.append(full_class[chosen, :])

    return np.asarray(subsampled_dataset)

In [4]:
# Build the subsampled dataset once at module level; construct_train_test_sets
# below reads this name directly rather than receiving it as an argument.
subsampled_dataset = construct_dataset()

In [5]:
def _sample_kmers(sequences, count, k):
    """Draw ``count`` random length-``k`` substrings from random ``sequences``."""
    kmers = []
    for _ in range(count):
        # Pick a random sequence, then a random window of k characters in it.
        seq = random.choice(sequences)
        if len(seq) < k:
            raise ValueError(
                "Sequence %r is shorter than the feature length %d" % (seq, k)
            )
        start = random.randint(0, len(seq) - k)
        kmers.append(seq[start:start + k])
    return kmers


def extract_features(dataset, d=1000, k_apt=4, k_pep=4, n_train=8000):
    """Extract random substring features from the subsampled dataset.

    For every class (axis 0 of ``dataset``) the first ``n_train`` pairs are
    used as the pool. ``d`` aptamer features are drawn from the first element
    of the pairs, followed by ``d`` peptide features from the second element.
    Each feature is a random window of ``k_apt`` / ``k_pep`` characters taken
    from a randomly chosen sequence, using the ``random`` module's global
    state.

    Args:
        dataset: NumPy array indexed as ``dataset[class][row]`` where each
            row is an ``(aptamer, peptide)`` pair of strings.
        d: Number of features generated per class and per sequence type.
        k_apt: Length of each aptamer feature.
        k_pep: Length of each peptide feature.
        n_train: Only the first ``n_train`` rows of each class are sampled,
            so rows after that never contribute features.

    Returns:
        ``(aptamer_features, peptide_features)``: two lists with one entry
        per class, each entry being a list of ``d`` substrings.

    Raises:
        ValueError: If a chosen sequence is shorter than the requested
            feature length.
        IndexError: If a class has no rows to sample from.
    """
    aptamer_features = []
    peptide_features = []

    for class_pairs in dataset:
        pool = class_pairs[:n_train]
        # Slice the columns directly instead of flattening and relying on a
        # fixed number of rows per class.
        all_aptamers = pool[:, 0]
        all_peptides = pool[:, 1]

        # Aptamer features are generated before peptide features so the
        # sequence of random draws per class stays aptamers-then-peptides.
        aptamer_features.append(_sample_kmers(all_aptamers, d, k_apt))
        peptide_features.append(_sample_kmers(all_peptides, d, k_pep))

    return aptamer_features, peptide_features
  

In [6]:
# Draw the random substring features for every affinity class; both results
# are indexed by class and are consumed by construct_train_test_sets below.
aptamer_features, peptide_features = extract_features(subsampled_dataset)

In [8]:
def _encode_split(pairs_per_class, aptamer_features, peptide_features):
    """Encode each pair as 0/1 vectors marking which features it contains."""
    aptamer_rows = []
    peptide_rows = []
    for c, pairs in enumerate(pairs_per_class):
        apt_features = aptamer_features[c]
        pep_features = peptide_features[c]
        # A feature is "present" when it occurs as a substring of the sequence.
        aptamer_rows.append(
            [[1 if feat in a else 0 for feat in apt_features] for a, _ in pairs]
        )
        peptide_rows.append(
            [[1 if feat in p else 0 for feat in pep_features] for _, p in pairs]
        )
    return np.asarray(aptamer_rows), np.asarray(peptide_rows)


def construct_train_test_sets(aptamer_features, peptide_features,
                              dataset=None, n_train=8000):
    """Generate the training and testing feature matrices.

    For every class, the first ``n_train`` pairs form the training set and
    the remaining pairs form the test set. Each pair is encoded as two 0/1
    vectors: one marks which of the class's aptamer features occur in the
    pair's first element, the other marks which peptide features occur in
    its second element. The shapes of the resulting arrays are printed.

    Args:
        aptamer_features: Per-class lists of aptamer substrings.
        peptide_features: Per-class lists of peptide substrings.
        dataset: Array indexed as ``dataset[class][row]`` with rows of
            ``(aptamer, peptide)`` strings. Defaults to the module-level
            ``subsampled_dataset``, which must then already be defined.
        n_train: Number of leading rows per class used for training
            (8000 by default).

    Returns:
        ``(train_aptamers, train_peptides, test_aptamers, test_peptides)``
        as NumPy arrays indexed ``[class][pair][feature]``.
    """
    if dataset is None:
        # Backward-compatible fallback: the original implementation always
        # read the module-level dataset.
        dataset = subsampled_dataset

    train_pairs = [class_pairs[:n_train] for class_pairs in dataset]
    test_pairs = [class_pairs[n_train:] for class_pairs in dataset]

    # Make a 0/1 matrix for the training aptamers/peptides
    train_aptamers, train_peptides = _encode_split(
        train_pairs, aptamer_features, peptide_features
    )
    print("Train Aptamers Shape: ", train_aptamers.shape)
    print("Train Peptides Shape: ", train_peptides.shape)

    # Make a 0/1 matrix for the testing aptamers/peptides
    test_aptamers, test_peptides = _encode_split(
        test_pairs, aptamer_features, peptide_features
    )
    print("Test Aptamers Shape: ", test_aptamers.shape)
    print("Test Peptides Shape: ", test_peptides.shape)

    return train_aptamers, train_peptides, test_aptamers, test_peptides
    
    
    

In [None]:
# Encode every train/test pair as 0/1 feature-presence vectors; the call also
# prints the shape of each resulting array.
train_aptamers, train_peptides, test_aptamers, test_peptides = construct_train_test_sets(aptamer_features, peptide_features)

Train Aptamers Shape:  (4, 8000, 1000)
Train Peptides Shape:  (4, 8000, 1000)
