# Small Neural Network that takes as input both the aptamer features and the peptide features to predict affinity.

## Generate features for both aptamers and peptides

In [1]:
import json
import random
import numpy as np
from sklearn import linear_model, metrics
from sklearn.svm import SVC
from scipy import stats
import random
import re


In [2]:
def classify_affinity(affinity):
    '''
    Map a binding-affinity value to its class index.

    The value is converted with float(), so numeric strings are accepted.
    Classes are defined by inclusive upper bounds: 0 for values <= 9,
    1 for values <= 50, 2 for values <= 400, and 3 for everything else.
    '''
    value = float(affinity)
    # The position of the first bound that the value does not exceed is its class.
    for label, upper_bound in enumerate((9, 50, 400)):
        if value <= upper_bound:
            return label
    return 3

In [21]:
dataset_file = "../data/mhcflurry_dataset.json"


def construct_dataset(path=None, samples_per_class=10000):
    '''
    Build a class-balanced random subsample of (allele, peptide) pairs.

    The JSON file is read as a mapping from allele to a list of
    [peptide, affinity] entries. Every pair is bucketed by
    classify_affinity(affinity), and `samples_per_class` pairs are then drawn
    from each of the four buckets without replacement.

    Parameters:
        path: JSON file to read. Defaults to the module-level `dataset_file`.
        samples_per_class: number of pairs to draw per affinity class
            (default 10,000).

    Returns:
        Array of shape (4, samples_per_class, 2); the first axis is the
        affinity class, the last axis is (allele, peptide).

    Raises:
        ValueError: if `samples_per_class` is less than 1, or if any class
            holds fewer than `samples_per_class` pairs.
    '''
    if samples_per_class < 1:
        raise ValueError(
            "samples_per_class must be at least 1, got {}".format(samples_per_class))
    if path is None:
        # Looked up at call time so reassigning dataset_file keeps working.
        path = dataset_file

    with open(path, 'r') as f:
        mhcflurry_data = json.load(f)

    # Full dataset. The index of the list corresponds to the binding affinity class
    full_dataset = [[], [], [], []]
    for allele, peptides in mhcflurry_data.items():
        for peptide, affinity in peptides:
            full_dataset[classify_affinity(affinity)].append((allele, peptide))

    subsampled_dataset = []
    for affinity_class, pairs in enumerate(full_dataset):
        # Fail with a message that names the class instead of the opaque
        # error np.random.choice raises for an undersized population.
        if len(pairs) < samples_per_class:
            raise ValueError(
                "affinity class {} has only {} pairs; cannot sample {} "
                "without replacement".format(
                    affinity_class, len(pairs), samples_per_class))
        full_class = np.asarray(pairs)
        chosen = np.random.choice(
            full_class.shape[0], samples_per_class, replace=False)
        # Indexing with an index array already yields a new array (no copy needed).
        subsampled_dataset.append(full_class[chosen, :])

    return np.asarray(subsampled_dataset)

In [22]:
# Build the class-balanced sample of (allele, peptide) pairs once; it is passed
# to extract_features() further down.
# NOTE(review): no RNG seed is set in the visible cells, so each run presumably
# draws a different sample -- confirm whether reproducibility is needed.
subsampled_dataset = construct_dataset()

In [38]:
def _sample_kmers(sequences, count, k):
    '''
    Return `count` random length-`k` substrings drawn from `sequences`.

    Raises ValueError if no sequence has at least `k` characters.
    '''
    # Only sequences with at least k characters can supply a length-k window;
    # dropping the others up front avoids a crash when one is randomly chosen.
    eligible = [seq for seq in sequences if len(seq) >= k]
    if not eligible:
        raise ValueError(
            "no sequence is at least {} characters long".format(k))

    kmers = []
    for _ in range(count):
        seq = random.choice(eligible)
        # randint is inclusive at both ends, so len(seq) - k is the last valid start.
        start = random.randint(0, len(seq) - k)
        kmers.append(seq[start:start + k])
    return kmers


def extract_features(dataset, d=1000, k_apt=4, k_pep=4):
    '''
    Extract random substring features from the subsampled dataset.

    For every affinity class, `d` substrings of length `k_apt` are cut from
    randomly chosen aptamer strings (column 0 of each pair) and `d`
    substrings of length `k_pep` from randomly chosen peptide strings
    (column 1 of each pair).

    Parameters:
        dataset: array indexed as [class, pair, column], where column 0 is
            the aptamer (allele) string and column 1 the peptide string.
            Any number of classes and pairs per class is accepted.
        d: number of features to draw per class and per sequence type.
        k_apt: length of each aptamer substring.
        k_pep: length of each peptide substring.

    Returns:
        (aptamer_features, peptide_features): two lists with one entry per
        class, each entry being a list of `d` substrings.

    Raises:
        ValueError: if, for some class, no aptamer (or peptide) is at least
            `k_apt` (or `k_pep`) characters long.
    '''
    aptamer_features = []
    peptide_features = []

    for class_pairs in dataset:
        # Select the columns directly rather than flattening and splitting at
        # a fixed offset, so the per-class sample size is not baked in.
        aptamer_features.append(_sample_kmers(class_pairs[:, 0], d, k_apt))
        peptide_features.append(_sample_kmers(class_pairs[:, 1], d, k_pep))

    return aptamer_features, peptide_features
  

In [39]:
# Draw the random substring features for every affinity class from the
# subsampled pairs. The first result holds substrings cut from the first element
# of each pair (the allele key, called "aptamer" in this notebook); the second
# holds substrings cut from the peptide strings.
aptamer_features, peptide_features = extract_features(subsampled_dataset)