<a href="https://colab.research.google.com/github/dbosnacki/HelisDeepLearningCourse/blob/main/code/modelTrainTestProteinDomainsRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import csv
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import os
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the data, pad the sequences to a common length, one-hot encode
# them and split off a held-out test set.

url = 'https://raw.githubusercontent.com/dbosnacki/HelisDeepLearningCourse/main/cath-domain-description-file-v2_4ProcessedForNN.tsv'
df = pd.read_csv(url, delimiter="\t", header=None)

# The file has no header row: column 3 is used as the sequences and
# column 2 as the labels.
sequences = list(df[3])
labels = list(df[2])

# Length of the longest sequence (0 when there are no sequences at all).
maxSeqLength = max((len(sequence) for sequence in sequences), default=0)

# Right-pad every sequence with spaces to maxSeqLength and split it into
# single characters, so each row has exactly one column per position.
dataset = [list(sequence.ljust(maxSeqLength, ' ')) for sequence in sequences]

# One-hot encode every position; the encoder's sparse result is converted
# to a dense array.
cat = OneHotEncoder()
dataset = cat.fit_transform(dataset).toarray()

X = dataset
y = np.array(labels)

# Keep 20% of the samples aside as a test set; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def _build_model(input_dim):
    """Return a freshly initialised, compiled 3-layer dense classifier.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``keras.Sequential`` model with a 3-unit softmax output,
        trained with sparse categorical cross-entropy and the Adam optimizer.
    """
    model = keras.Sequential(
        [
            layers.Dense(128, input_shape=(input_dim,), activation="relu", name="layer1"),
            layers.Dense(64, activation="relu", name="layer2"),
            # Softmax makes the 3 outputs a probability distribution over the
            # mutually exclusive classes, which is what
            # SparseCategoricalCrossentropy (from_logits=False) expects.
            layers.Dense(3, activation="softmax", name="layer3"),
        ]
    )
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'],
        optimizer='adam',
    )
    return model


# Test-set accuracy (in percent) and loss collected for every fold.
accuracy_per_fold = []
loss_per_fold = []

seed = 10
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for fold_no, (train_index, val_index) in enumerate(skf.split(X_train, y_train), start=1):

    print(f'Fold: {fold_no}')

    # Drop the Keras state left behind by the previous fold's model so that
    # memory does not accumulate across folds.
    K.clear_session()

    # A new model per fold: no weights are carried over between folds.
    model = _build_model(X_train.shape[1])

    history = model.fit(X_train[train_index],
                        y_train[train_index],
                        batch_size=1024,
                        epochs=20,
                        validation_data=(X_train[val_index], y_train[val_index]),
                        verbose=2)

    # Each fold's model is scored on the shared held-out test set, not on
    # the fold's own validation split.
    scores = model.evaluate(X_test, y_test, verbose=2)

    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    accuracy_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

# Print the loss and accuracy of every fold, then their aggregate statistics.
divider = '------------------------------------------------------------------------'

print(divider)
print('Score per fold')
for fold, (fold_loss, fold_accuracy) in enumerate(zip(loss_per_fold, accuracy_per_fold), start=1):
    print(divider)
    print(f'> Fold {fold} - Loss: {fold_loss} - Accuracy: {fold_accuracy}%')
print(divider)

# Mean and standard deviation of the accuracy, and mean of the loss,
# taken over all folds.
mean_accuracy = np.mean(accuracy_per_fold)
std_accuracy = np.std(accuracy_per_fold)
mean_loss = np.mean(loss_per_fold)

print('Average scores for all folds:')
print(f'> Accuracy: {mean_accuracy} (+- {std_accuracy})')
print(f'> Loss: {mean_loss}')
print(divider)

Fold: 1
Epoch 1/20
60/60 - 2s - loss: 0.8048 - accuracy: 0.6312 - val_loss: 0.5848 - val_accuracy: 0.7616
Epoch 2/20
60/60 - 2s - loss: 0.4606 - accuracy: 0.8250 - val_loss: 0.3927 - val_accuracy: 0.8586
Epoch 3/20
60/60 - 2s - loss: 0.2873 - accuracy: 0.9012 - val_loss: 0.2839 - val_accuracy: 0.9066
Epoch 4/20
60/60 - 2s - loss: 0.1810 - accuracy: 0.9430 - val_loss: 0.2288 - val_accuracy: 0.9287
Epoch 5/20
60/60 - 2s - loss: 0.1168 - accuracy: 0.9650 - val_loss: 0.2139 - val_accuracy: 0.9365
Epoch 6/20
60/60 - 2s - loss: 0.0792 - accuracy: 0.9763 - val_loss: 0.1984 - val_accuracy: 0.9418
Epoch 7/20
60/60 - 2s - loss: 0.0593 - accuracy: 0.9823 - val_loss: 0.2038 - val_accuracy: 0.9442
Epoch 8/20
60/60 - 2s - loss: 0.0460 - accuracy: 0.9864 - val_loss: 0.2122 - val_accuracy: 0.9455
Epoch 9/20
60/60 - 2s - loss: 0.0383 - accuracy: 0.9881 - val_loss: 0.2126 - val_accuracy: 0.9484
Epoch 10/20
60/60 - 2s - loss: 0.0324 - accuracy: 0.9898 - val_loss: 0.2169 - val_accuracy: 0.9493
Epoch 11/20