<a href="https://colab.research.google.com/github/dbosnacki/HelisDeepLearningCourse/blob/main/code/modelTrainTestProteinDomainsWithPairFrequency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import csv
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def makeSignature(dseqs, ordered = False):
    """Return the letter-pair occurrence signature of an (amino acid) sequence.

    Every pair of adjacent letters in ``dseqs`` is tallied in a single pass,
    overlapping pairs included: ``"AAA"`` contains two ``AA`` pairs.
    (``str.count`` only reports non-overlapping matches and would give one.)
    Matching is case-sensitive, and pairs containing a letter outside the
    20 one-letter codes listed below do not contribute to the signature.

    Args:
        dseqs: The sequence as a string of one-letter amino acid codes.
        ordered: If True, ``XY`` and ``YX`` are counted separately, giving
            20 * 20 = 400 entries. If False (default), they are merged into
            one entry, giving 20 * 21 / 2 = 210 entries.

    Returns:
        A 1-D numpy array of integer pair counts. The entry order follows
        the nested iteration over ``aminoAcids`` used below.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    aminoAcids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L',
                  'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    # One pass over the sequence; missing pairs read back as 0 from a Counter.
    pairCounts = Counter(zip(dseqs, dseqs[1:]))

    if ordered:
        signature = [pairCounts[(aa1, aa2)]
                     for aa1 in aminoAcids
                     for aa2 in aminoAcids]
    else:
        signature = []
        for i, aa1 in enumerate(aminoAcids):
            # Mixed pairs: both orientations go into the same entry.
            for aa2 in aminoAcids[:i]:
                signature.append(pairCounts[(aa1, aa2)] + pairCounts[(aa2, aa1)])
            # Identical-letter pair comes last for each aa1.
            signature.append(pairCounts[(aa1, aa1)])

    return np.array(signature)

# Load the tab-separated table, build pair-frequency features and split the data

url = 'https://raw.githubusercontent.com/dbosnacki/HelisDeepLearningCourse/main/cath-domain-description-file-v2_4ProcessedForNN.tsv' 
df = pd.read_csv(url, sep="\t", header=None)

# Column 3 is read as the sequences; each one becomes its pair-count signature.
sequences = list(df[3])
dataset = [makeSignature(sequence) for sequence in sequences]

# Column 2 is read as the target labels.
labels = list(df[2])

# Feature matrix (one signature per row) and label vector.
X = np.array(dataset)
y = np.array(labels)

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Per-fold results, filled in by the loop below.
accuracy_per_fold = []
loss_per_fold = []

seed = 10
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Train a fresh network on every stratified fold of the training data.
for fold_no, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):

    print('Fold: ' + str(fold_no))

    # Drop the Keras state left by the previous fold's model so that memory
    # does not accumulate while building several models in one loop.
    K.clear_session()

    # Define Sequential model with 3 layers.
    # The output layer uses softmax (not sigmoid): the 3 classes are mutually
    # exclusive and SparseCategoricalCrossentropy with its default
    # from_logits=False expects a probability distribution over the classes.
    # Wider/deeper hidden stacks can be swapped in here; keep the 3-unit
    # softmax output layer.
    model = keras.Sequential(
        [
            layers.Dense(128, input_shape=(X_train.shape[1],), activation="relu", name="layer1"),
            layers.Dense(64, activation="relu", name="layer2"),
            layers.Dense(3, activation="softmax", name="layer3"),
        ]
    )

    # model.summary()

    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
        optimizer='adam',
    )

    # Fit on this fold's training part; the fold's validation part is only
    # used for the per-epoch val_loss / val_accuracy read-out.
    history = model.fit(
        X_train[train_index],
        y_train[train_index],
        batch_size=1024,
        epochs=25,
        validation_data=(X_train[val_index], y_train[val_index]),
        verbose=2,
    )

    # NOTE: every fold's model is scored on the same held-out test set
    # (X_test, y_test), not on the fold's own validation part.
    scores = model.evaluate(X_test, y_test, verbose=2)

    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    accuracy_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

# Report the score of every fold, then the averages over all folds
separator = '------------------------------------------------------------------------'

print(separator)
print('Score per fold')
for fold, (fold_loss, fold_accuracy) in enumerate(zip(loss_per_fold, accuracy_per_fold), start=1):
    print(separator)
    print(f'> Fold {fold} - Loss: {fold_loss} - Accuracy: {fold_accuracy}%')
print(separator)
print('Average scores for all folds:')
mean_accuracy = np.mean(accuracy_per_fold)
std_accuracy = np.std(accuracy_per_fold)
mean_loss = np.mean(loss_per_fold)
print(f'> Accuracy: {mean_accuracy} (+- {std_accuracy})')
print(f'> Loss: {mean_loss}')
print(separator)

Fold: 1
Epoch 1/25
60/60 - 1s - loss: 0.8995 - accuracy: 0.5319 - val_loss: 0.7979 - val_accuracy: 0.5520
Epoch 2/25
60/60 - 0s - loss: 0.7619 - accuracy: 0.5825 - val_loss: 0.7248 - val_accuracy: 0.6386
Epoch 3/25
60/60 - 0s - loss: 0.6331 - accuracy: 0.7217 - val_loss: 0.5539 - val_accuracy: 0.7795
Epoch 4/25
60/60 - 0s - loss: 0.4698 - accuracy: 0.8227 - val_loss: 0.4400 - val_accuracy: 0.8344
Epoch 5/25
60/60 - 0s - loss: 0.3669 - accuracy: 0.8674 - val_loss: 0.3717 - val_accuracy: 0.8691
Epoch 6/25
60/60 - 0s - loss: 0.2977 - accuracy: 0.8994 - val_loss: 0.3275 - val_accuracy: 0.8892
Epoch 7/25
60/60 - 0s - loss: 0.2488 - accuracy: 0.9183 - val_loss: 0.3033 - val_accuracy: 0.8990
Epoch 8/25
60/60 - 0s - loss: 0.2118 - accuracy: 0.9315 - val_loss: 0.2729 - val_accuracy: 0.9130
Epoch 9/25
60/60 - 0s - loss: 0.1824 - accuracy: 0.9431 - val_loss: 0.2629 - val_accuracy: 0.9197
Epoch 10/25
60/60 - 0s - loss: 0.1583 - accuracy: 0.9515 - val_loss: 0.2434 - val_accuracy: 0.9289
Epoch 11/25