# Program 1: Peptide Classification

- Create feed-forward neural networks and train them using your own code and
frameworks.
- Experiment with different feature extraction techniques.
- Think about how to deal with an imbalanced dataset.

In [20]:
import numpy as np

# Seed NumPy's legacy global RNG so the random weight initialisation, oversampling
# and shuffling further down are repeatable from a fresh kernel.
np.random.seed(400)

## Import Data

We import the training and test data from the given files

In [21]:
def parse_training_data(file_path):
    """Read labelled peptide sequences from a tab-separated text file.

    Each non-blank line is expected to look like ``<label>\\t<sequence>``,
    where the label is parsed with ``int``.

    Args:
        file_path: Path of the training file.

    Returns:
        A tuple ``(x_train, y_train)``: the list of sequence strings and the
        list of integer labels, both in file order.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has no tab-separated second field.
    """
    x_train = []
    y_train = []

    with open(file_path, 'r') as file:
        # Stream the file line by line instead of loading it all with readlines().
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record;
                # without this guard int('') would raise ValueError.
                continue
            fields = line.split("\t")
            y_train.append(int(fields[0]))
            x_train.append(fields[1])
    return x_train, y_train

def parse_testing_data(file_path):
    """Read unlabelled peptide sequences, one per line.

    Every line of the file produces exactly one entry (surrounding whitespace
    stripped), so a blank line yields an empty string rather than being dropped.

    Args:
        file_path: Path of the test file.

    Returns:
        List of sequence strings in file order.
    """
    with open(file_path, 'r') as handle:
        sequences = [raw_line.strip() for raw_line in handle]
    return sequences

In [22]:
# train.dat holds "<label>\t<sequence>" records; test.dat holds one unlabelled sequence per line.
X_train_input, Y_train_input = parse_training_data('train.dat')
X_test_input = parse_testing_data('test.dat')

## Generate KMERs

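Generate k-mers (contiguous substrings) for every training and test sequence. With `k=3`, the helper below produces all substrings of length 1 and 2 (lengths 1 to k−1).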

In [23]:
def kmer(seq, k):
    """Return every contiguous length-``k`` window of ``seq``, left to right.

    Yields an empty list when ``k`` is larger than ``len(seq)``.
    """
    window_count = len(seq) - k + 1
    windows = []
    for start in range(window_count):
        windows.append(seq[start:start + k])
    return windows

def kmers(seq, k):
    """Collect the k-mers of ``seq`` for every length from 1 up to ``k - 1``.

    The upper bound is exclusive: ``kmers(seq, k=3)`` returns the 1-mers
    followed by the 2-mers, and no 3-mers.
    NOTE(review): if length-``k`` substrings were intended, the bound is off by
    one; the feature width used later in the notebook depends on the current
    behaviour, so it is kept as is.
    """
    collected = []
    size = 1
    while size < k:
        collected += kmer(seq, k=size)
        size += 1
    return collected

In [24]:
# With k=3 the helper yields the 1-mers and 2-mers of each sequence (its upper bound is exclusive).
X_train_kmers = [kmers(seq, k=3) for seq in X_train_input]
X_test_kmers = [kmers(seq, k=3) for seq in X_test_input]
#print(len(X_train_kmers[0]))

## Encode KMERs

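Encode each peptide as a row of k-mer counts: one column per distinct k-mer seen in the training or test data, holding the number of times that k-mer occurs in the peptide.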

In [25]:
def generate_kmer_mapping(train_kmers, test_kmers):
    """Assign a column index to every distinct k-mer in the train and test lists.

    Indices are handed out in first-seen order, scanning the training peptides
    first and the test peptides second.

    Args:
        train_kmers: List of k-mer lists, one per training peptide.
        test_kmers: List of k-mer lists, one per test peptide.

    Returns:
        Dict mapping each k-mer to its integer index (0 .. n_distinct - 1).
    """
    index_of = {}
    for peptide in train_kmers + test_kmers:
        for token in peptide:
            # setdefault leaves tokens that were already seen untouched.
            index_of.setdefault(token, len(index_of))
    return index_of

def encoding_matrix(data, mapping):
    """Build a k-mer count matrix for a list of peptides.

    Args:
        data: List of k-mer lists, one per peptide.
        mapping: Dict from k-mer to column index.

    Returns:
        Float array of shape ``(len(data), len(mapping))`` where entry
        ``[i, j]`` is how often the k-mer with index ``j`` occurs in peptide
        ``i``. K-mers missing from ``mapping`` are ignored.
    """
    matrix = np.zeros((len(data), len(mapping)), dtype=float)

    for row, peptide in enumerate(data):
        for token in peptide:
            column = mapping.get(token)
            if column is not None:
                matrix[row, column] += 1

    return matrix

In [26]:
# One vocabulary built from both sets, so train and test matrices share the same column layout.
mapping = generate_kmer_mapping(X_train_kmers, X_test_kmers)

X_train_encoded = encoding_matrix(X_train_kmers, mapping)
X_test_encoded = encoding_matrix(X_test_kmers, mapping)

# encoding_matrix already returns ndarrays, so the np.array calls on the matrices only copy them.
X_train_encoded = np.array(X_train_encoded)
# Number of feature columns (the vocabulary size).
print(len(X_train_encoded[0]))
X_test_encoded = np.array(X_test_encoded)
# Labels become an ndarray so they can be used for boolean masks and index arrays later.
Y_train_input = np.array(Y_train_input)

436


## Balance Dataset

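The training labels are imbalanced: there are fewer `1` samples than `-1` samples.
To reduce the imbalance, we oversample the `1` class by appending randomly chosen copies of its samples to the training data.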

In [27]:
# Split the encoded rows by class label.
ones = X_train_encoded[Y_train_input == 1]
neg_ones = X_train_encoded[Y_train_input == -1]

#create copies
# Only half of the class-size gap is filled, so the gap is halved rather than closed.
num_copies = int((neg_ones.shape[0] - ones.shape[0]) / 2)
print(num_copies)
# Sample positive rows with replacement via random row indices into `ones`.
copies = ones[np.random.randint(ones.shape[0], size=num_copies)]

#add copies to labels and values
# NOTE(review): the duplicates are appended before the shuffle and validation split further down,
# so copies of the same positive row can land in both halves and inflate the validation score.
balanced_X = np.vstack((X_train_encoded, copies))
# np.ones produces floats, so balanced_Y is a float array.
balanced_Y = np.hstack((Y_train_input, np.ones(num_copies)))

641


## Shuffle Data

In [28]:
# Draw one random permutation of row indices and apply it to both arrays,
# so every feature row stays paired with its label.
shuffle = np.arange(balanced_X.shape[0])
np.random.shuffle(shuffle)

X_train_shuffled = balanced_X[shuffle]
Y_train_shuffled = balanced_Y[shuffle]

## Create Validation Set

In [29]:
# Midpoint of the shuffled data (rounded down): rows before it are held out
# for validation, rows from it onward are used for training.
half = int(X_train_shuffled.shape[0] / 2)

#Training Set
X_train = X_train_shuffled[half:]
Y_train = Y_train_shuffled[half:]
#print(len(X_train))

#Validation Set
X_test_validation = X_train_shuffled[:half]
Y_test_validation = Y_train_shuffled[:half]

#Test Set
# Unlabelled test features; they are not touched by the balancing or shuffling above.
X_test = X_test_encoded

## Methods for Model Definition

In [30]:
class Layer:
    """Base class for network layers; holds the slot for the last forward input."""

    def __init__(self):
        # Subclasses overwrite this in forward() so backward() can reuse the input.
        self.input = None

In [31]:
class NeuralLayer(Layer):
    """Fully connected layer computing ``input @ W + b``.

    ``W`` has shape ``(input, output)`` and ``b`` has shape ``(1, output)``;
    both are initialised uniformly in ``[-0.5, 0.5)``.
    """

    def __init__(self, input, output):
        """Create the layer.

        Args:
            input: Number of input features.
            output: Number of output units.
        """
        super().__init__()
        # Draw order (W first, then b) is kept so seeded runs reproduce the same weights.
        self.W = np.random.rand(input, output) - 0.5
        self.b = np.random.rand(1, output) - 0.5

    def forward(self, input):
        """Return ``input @ W + b`` and remember the input for ``backward``.

        A 1-D sample is treated as a single row, so the stored input is always
        at least 2-D and can be transposed in ``backward``.
        """
        self.input = np.atleast_2d(input)
        return np.dot(self.input, self.W) + self.b

    def backward(self, loss, learning_rate):
        """Apply one gradient-descent step and return the gradient for the previous layer.

        Args:
            loss: Gradient of the loss with respect to this layer's output,
                shape ``(batch, output)`` (a single sample has ``batch == 1``).
            learning_rate: Step size for the weight and bias update.

        Returns:
            Gradient of the loss with respect to this layer's input,
            shape ``(batch, input)``.
        """
        loss = np.atleast_2d(loss)
        # Must be computed with the weights used in the forward pass, i.e. before the update.
        gradient = np.dot(loss, self.W.T)
        dW = np.dot(self.input.T, loss)
        # Sum over the batch axis so db always matches b's (1, output) shape;
        # for a single sample this is the gradient row itself.
        db = np.sum(loss, axis=0, keepdims=True)

        self.W -= learning_rate * dW
        self.b -= learning_rate * db
        return gradient

In [32]:
class ActivationFunctionLayer(Layer):
    """Parameter-free layer that applies an activation function to its input."""

    def __init__(self, function, function_derivative):
        """Store the activation and its derivative (both callables of one argument)."""
        self.function, self.function_derivative = function, function_derivative

    def forward(self, input):
        """Remember the input and return the activation applied to it."""
        self.input = input
        activated = self.function(self.input)
        return activated

    def backward(self, loss, learning_rate):
        """Chain rule step: scale the incoming gradient by the activation's slope.

        ``learning_rate`` is accepted so all layers share one ``backward``
        signature; it is not used here.
        """
        local_slope = self.function_derivative(self.input)
        return local_slope * loss

In [33]:
def tanh(x):
    """Hyperbolic tangent activation (delegates to ``np.tanh``)."""
    activated = np.tanh(x)
    return activated

def tanh_derivative(x):
    """Derivative of tanh evaluated at ``x``: ``1 - tanh(x)^2``."""
    squashed = np.tanh(x)
    return 1 - squashed ** 2

In [34]:
def loss_mse(y, y_hat):
    """Mean squared error between the target ``y`` and the prediction ``y_hat``."""
    residual = y - y_hat
    squared = np.power(residual, 2)
    return np.mean(squared)

def loss_mse_derivative(y, y_hat):
    """Gradient of the mean squared error with respect to ``y_hat``.

    ``y`` must expose ``.size`` (e.g. a NumPy array); the gradient is divided
    by that element count.
    """
    doubled_error = 2 * (y_hat - y)
    return doubled_error / y.size

In [35]:
class FeedForward:
    """Sequential feed-forward network trained one sample at a time.

    The caller appends layers to ``self.layers``; each layer must provide
    ``forward(input)`` and ``backward(loss, learning_rate)``.
    """

    def __init__(self):
        self.layers = []
        self.loss_function = loss_mse
        self.loss_derivative = loss_mse_derivative

    def _forward(self, sample):
        """Push one sample through every layer in order and return the final output."""
        output = sample
        for layer in self.layers:
            output = layer.forward(output)
        return output

    def train(self, X_train, Y_train, epochs, learning_rate):
        """Fit the network with per-sample gradient descent.

        Args:
            X_train: Indexable collection of samples; ``X_train[i]`` is fed to
                the first layer as is.
            Y_train: Targets aligned with ``X_train``.
            epochs: Number of full passes over the training data.
            learning_rate: Step size handed to every layer's ``backward``.

        Prints the average per-sample loss every 20th epoch (starting at epoch 0).
        """
        for epoch in range(epochs):
            losses = 0
            for i in range(len(X_train)):
                output = self._forward(X_train[i])

                losses += self.loss_function(Y_train[i], output)

                # Back-propagate: each layer updates itself and hands the
                # gradient with respect to its input to the layer before it.
                loss = self.loss_derivative(Y_train[i], output)
                for layer in reversed(self.layers):
                    loss = layer.backward(loss, learning_rate)

            average_loss = losses / len(X_train)
            if epoch % 20 == 0:
                print(f"Epoch {epoch}, Loss {average_loss:.8f}")

    def predict(self, X, threshold=0.5):
        """Classify every sample in ``X`` as 1 or -1.

        Args:
            X: Indexable collection of samples. Each sample must lead to a 2-D
                network output whose element ``[0][0]`` is the score.
            threshold: Scores greater than or equal to this value map to 1,
                all others to -1. The default of 0.5 is the value this method
                has always used. NOTE(review): when the final layer is tanh
                and the targets are -1/1, the midpoint between the two labels
                is 0, so ``threshold=0.0`` may be the intended decision
                boundary; confirm before changing the default.

        Returns:
            Integer array of 1 / -1 labels, one per sample.
        """
        scores = np.array([self._forward(X[i])[0][0] for i in range(len(X))])
        return np.where(scores >= threshold, 1, -1)

## Main Method

Create model layers, train model, and make predictions

In [36]:
if __name__ == "__main__":

    # Samples are fed to the network one at a time, so every row gets a leading
    # axis of length 1: (n_samples, n_features) -> (n_samples, 1, n_features).
    X_train_matrix = np.expand_dims(X_train, axis=1)
    Y_train_matrix = np.expand_dims(Y_train, axis=1)

    X_test_validation_matrix = np.expand_dims(X_test_validation, axis=1)

    X_test_matrix = np.expand_dims(X_test, axis=1)

    # Take the input width from the data instead of hard-coding the vocabulary
    # size, so a change in the k-mer features cannot silently break the model.
    n_features = X_train.shape[1]

    ff = FeedForward()
    ff.layers.append(NeuralLayer(n_features, 64))
    ff.layers.append(ActivationFunctionLayer(tanh, tanh_derivative))
    ff.layers.append(NeuralLayer(64, 32))
    ff.layers.append(ActivationFunctionLayer(tanh, tanh_derivative))
    ff.layers.append(NeuralLayer(32, 16))
    ff.layers.append(ActivationFunctionLayer(tanh, tanh_derivative))
    ff.layers.append(NeuralLayer(16, 1))
    ff.layers.append(ActivationFunctionLayer(tanh, tanh_derivative))

    # NOTE(review): the saved output of this cell lists epochs up to 240, which
    # a 210-epoch run cannot print; re-run the cell to refresh the log.
    ff.train(X_train_matrix, Y_train_matrix, epochs=210, learning_rate=0.01)

    # Prediction on the validation set, using the same (1, n_features) sample
    # layout as training and the final test prediction.
    Y_hat_test_validation = ff.predict(X_test_validation_matrix)


Epoch 0, Loss 0.27323106
Epoch 20, Loss 0.00020241
Epoch 40, Loss 0.00007459
Epoch 60, Loss 0.00004519
Epoch 80, Loss 0.00003221
Epoch 100, Loss 0.00002493
Epoch 120, Loss 0.00002028
Epoch 140, Loss 0.00001707
Epoch 160, Loss 0.00001472
Epoch 180, Loss 0.00001292
Epoch 200, Loss 0.00001151
Epoch 220, Loss 0.00001037
Epoch 240, Loss 0.00000943


In [37]:
def mcc(y, y_hat):
    """Matthews correlation coefficient for labels encoded as 1 and -1.

    Args:
        y: True labels (array-like of 1 / -1).
        y_hat: Predicted labels (array-like of 1 / -1), aligned with ``y``.

    Returns:
        The coefficient in ``[-1, 1]``. Returns 0.0 when it is undefined, i.e.
        when the input is empty or a whole row/column of the confusion matrix
        is zero (for example, every prediction is the same class).
    """
    # Accept plain lists as well as arrays; element-wise comparison needs ndarrays.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)

    n = len(y)
    if n == 0:
        return 0.0

    # Confusion-matrix cells as fractions of the sample count.
    TP = np.sum((y == 1) & (y_hat == 1)) / n
    TN = np.sum((y == -1) & (y_hat == -1)) / n
    FP = np.sum((y == -1) & (y_hat == 1)) / n
    FN = np.sum((y == 1) & (y_hat == -1)) / n

    denominator = np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    if denominator == 0:
        # 0/0 would give nan; the usual convention is to report 0 instead.
        return 0.0

    return ((TP * TN) - (FP * FN)) / denominator

# Score the thresholded validation predictions against the held-out validation labels.
print("MCC: ", mcc(Y_test_validation, Y_hat_test_validation))
    

MCC:  0.9563547080375059


## Generate Test Set Predictions

In [38]:
# predict
Y_hat_test = ff.predict(X_test_matrix)

#generate test predictions
# Writes one integer label (1 or -1) per line, in the same order as the lines of test.dat.
np.savetxt("predictions.dat", Y_hat_test, fmt="%d")